From 1122a26f2abe4245ccdaed95ec23f63fe086b332 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Sep 2009 13:52:12 +0200 Subject: block: use normal I/O path for discard requests prepare_discard_fn() was being called in a place where memory allocation was effectively impossible. This makes it inappropriate for all but the most trivial translations of Linux's DISCARD operation to the block command set. Additionally adding a payload there makes the ownership of the bio backing unclear as it's now allocated by the device driver and not the submitter as usual. It is replaced with QUEUE_FLAG_DISCARD which is used to indicate whether the queue supports discard operations or not. blkdev_issue_discard now allocates a one-page, sector-length payload which is the right thing for the common ATA and SCSI implementations. The mtd implementation of prepare_discard_fn() is replaced with simply checking for the request being a discard. Largely based on a previous patch from Matthew Wilcox which did the prepare_discard_fn but not the different payload allocation yet. Signed-off-by: Christoph Hellwig --- block/blk-barrier.c | 35 ++++++++++++++++++++++++++++++----- block/blk-core.c | 3 +-- block/blk-settings.c | 17 ----------------- 3 files changed, 31 insertions(+), 24 deletions(-) (limited to 'block') diff --git a/block/blk-barrier.c b/block/blk-barrier.c index 6593ab39cfe9..21f5025c3945 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -350,6 +350,7 @@ static void blkdev_discard_end_io(struct bio *bio, int err) if (bio->bi_private) complete(bio->bi_private); + __free_page(bio_page(bio)); bio_put(bio); } @@ -372,26 +373,44 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, struct request_queue *q = bdev_get_queue(bdev); int type = flags & DISCARD_FL_BARRIER ? DISCARD_BARRIER : DISCARD_NOBARRIER; + struct bio *bio; + struct page *page; int ret = 0; if (!q) return -ENXIO; - if (!q->prepare_discard_fn) + if (!blk_queue_discard(q)) return -EOPNOTSUPP; while (nr_sects && !ret) { - struct bio *bio = bio_alloc(gfp_mask, 0); - if (!bio) - return -ENOMEM; + unsigned int sector_size = q->limits.logical_block_size; + bio = bio_alloc(gfp_mask, 1); + if (!bio) + goto out; + bio->bi_sector = sector; bio->bi_end_io = blkdev_discard_end_io; bio->bi_bdev = bdev; if (flags & DISCARD_FL_WAIT) bio->bi_private = &wait; - bio->bi_sector = sector; + /* + * Add a zeroed one-sector payload as that's what + * our current implementations need. If we'll ever need + * more the interface will need revisiting. + */ + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) + goto out_free_bio; + if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size) + goto out_free_page; + /* + * And override the bio size - the way discard works we + * touch many more blocks on disk than the actual payload + * length. 
+ */ if (nr_sects > queue_max_hw_sectors(q)) { bio->bi_size = queue_max_hw_sectors(q) << 9; nr_sects -= queue_max_hw_sectors(q); @@ -414,5 +433,11 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, bio_put(bio); } return ret; +out_free_page: + __free_page(page); +out_free_bio: + bio_put(bio); +out: + return -ENOMEM; } EXPORT_SYMBOL(blkdev_issue_discard); diff --git a/block/blk-core.c b/block/blk-core.c index 8135228e4b29..80a020dd1580 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1124,7 +1124,6 @@ void init_request_from_bio(struct request *req, struct bio *bio) req->cmd_flags |= REQ_DISCARD; if (bio_rw_flagged(bio, BIO_RW_BARRIER)) req->cmd_flags |= REQ_SOFTBARRIER; - req->q->prepare_discard_fn(req->q, req); } else if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) req->cmd_flags |= REQ_HARDBARRIER; @@ -1470,7 +1469,7 @@ static inline void __generic_make_request(struct bio *bio) goto end_io; if (bio_rw_flagged(bio, BIO_RW_DISCARD) && - !q->prepare_discard_fn) { + !blk_queue_discard(q)) { err = -EOPNOTSUPP; goto end_io; } diff --git a/block/blk-settings.c b/block/blk-settings.c index eaf122ff5f16..d29498ef1eb5 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -33,23 +33,6 @@ void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn) } EXPORT_SYMBOL(blk_queue_prep_rq); -/** - * blk_queue_set_discard - set a discard_sectors function for queue - * @q: queue - * @dfn: prepare_discard function - * - * It's possible for a queue to register a discard callback which is used - * to transform a discard request into the appropriate type for the - * hardware. If none is registered, then discard requests are failed - * with %EOPNOTSUPP. - * - */ -void blk_queue_set_discard(struct request_queue *q, prepare_discard_fn *dfn) -{ - q->prepare_discard_fn = dfn; -} -EXPORT_SYMBOL(blk_queue_set_discard); - /** * blk_queue_merge_bvec - set a merge_bvec function for queue * @q: queue -- cgit v1.2.2 From ca80650cfbde5b17a5fa957a261c7973f84599a7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Sep 2009 13:54:20 +0200 Subject: block: allow large discard requests Currently we set the bio size to the byte equivalent of the blocks to be trimmed when submitting the initial DISCARD ioctl. That means it is subject to the max_hw_sectors limitation of the HBA which is much lower than the size of a DISCARD request we can support. Add a separate max_discard_sectors tunable to limit the size for discard requests. We limit the max discard request size in bytes to 32bit as that is the limit for bio->bi_size. This could be much larger if we had a way to pass that information through the block layer. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-barrier.c | 10 ++++++---- block/blk-core.c | 3 ++- block/blk-settings.c | 13 +++++++++++++ 3 files changed, 21 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-barrier.c b/block/blk-barrier.c index 21f5025c3945..8873b9b439ff 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -385,6 +385,8 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, while (nr_sects && !ret) { unsigned int sector_size = q->limits.logical_block_size; + unsigned int max_discard_sectors = + min(q->limits.max_discard_sectors, UINT_MAX >> 9); bio = bio_alloc(gfp_mask, 1); if (!bio) @@ -411,10 +413,10 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, * touch many more blocks on disk than the actual payload * length. 
*/ - if (nr_sects > queue_max_hw_sectors(q)) { - bio->bi_size = queue_max_hw_sectors(q) << 9; - nr_sects -= queue_max_hw_sectors(q); - sector += queue_max_hw_sectors(q); + if (nr_sects > max_discard_sectors) { + bio->bi_size = max_discard_sectors << 9; + nr_sects -= max_discard_sectors; + sector += max_discard_sectors; } else { bio->bi_size = nr_sects << 9; nr_sects = 0; diff --git a/block/blk-core.c b/block/blk-core.c index 80a020dd1580..34504f309728 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1436,7 +1436,8 @@ static inline void __generic_make_request(struct bio *bio) goto end_io; } - if (unlikely(nr_sectors > queue_max_hw_sectors(q))) { + if (unlikely(!bio_rw_flagged(bio, BIO_RW_DISCARD) && + nr_sectors > queue_max_hw_sectors(q))) { printk(KERN_ERR "bio too big device %s (%u > %u)\n", bdevname(bio->bi_bdev, b), bio_sectors(bio), diff --git a/block/blk-settings.c b/block/blk-settings.c index d29498ef1eb5..e0695bca7027 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -96,6 +96,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->max_segment_size = MAX_SEGMENT_SIZE; lim->max_sectors = BLK_DEF_MAX_SECTORS; lim->max_hw_sectors = INT_MAX; + lim->max_discard_sectors = SAFE_MAX_SECTORS; lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); lim->alignment_offset = 0; @@ -238,6 +239,18 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_sectors) } EXPORT_SYMBOL(blk_queue_max_hw_sectors); +/** + * blk_queue_max_discard_sectors - set max sectors for a single discard + * @q: the request queue for the device + * @max_discard: maximum number of sectors to discard + **/ +void blk_queue_max_discard_sectors(struct request_queue *q, + unsigned int max_discard_sectors) +{ + q->limits.max_discard_sectors = max_discard_sectors; +} +EXPORT_SYMBOL(blk_queue_max_discard_sectors); + /** * blk_queue_max_phys_segments - set max phys segments for a request for this queue * @q: the request queue for the device -- cgit v1.2.2 From 1a35e0f6443f4266dad4c569c55c57a9032596fa Mon Sep 17 00:00:00 2001 From: Jun'ichi Nomura Date: Thu, 1 Oct 2009 21:16:13 +0200 Subject: Add a tracepoint for block request remapping Since 2.6.31 now has request-based device-mapper, it's useful to have a tracepoint for request-remapping as well as bio-remapping. This patch adds a tracepoint for request-remapping, trace_block_rq_remap(). Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Cc: Alasdair G Kergon Cc: Li Zefan Signed-off-by: Jens Axboe --- block/blk-core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 34504f309728..ddaaea4fdffc 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -34,6 +34,7 @@ #include "blk.h" EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap); +EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); static int __make_request(struct request_queue *q, struct bio *bio); -- cgit v1.2.2 From 492af6350a5ccf087e4964104a276ed358811458 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 3 Oct 2009 09:37:51 +0200 Subject: block: remove the anticipatory IO scheduler AS is mostly a subset of CFQ, so there's little point in still providing this separate IO scheduler. Hopefully at some point we can get down to one single IO scheduler again, at least this brings us closer by having only one intelligent IO scheduler. 
Signed-off-by: Jens Axboe --- block/Kconfig.iosched | 22 +- block/Makefile | 1 - block/as-iosched.c | 1520 ------------------------------------------------- block/elevator.c | 10 +- 4 files changed, 6 insertions(+), 1547 deletions(-) delete mode 100644 block/as-iosched.c (limited to 'block') diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 7e803fc88770..baad3dae3655 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -12,24 +12,14 @@ config IOSCHED_NOOP that do their own scheduling and require only minimal assistance from the kernel. -config IOSCHED_AS - tristate "Anticipatory I/O scheduler" - default y - ---help--- - The anticipatory I/O scheduler is generally a good choice for most - environments, but is quite large and complex when compared to the - deadline I/O scheduler, it can also be slower in some cases - especially some database loads. - config IOSCHED_DEADLINE tristate "Deadline I/O scheduler" default y ---help--- - The deadline I/O scheduler is simple and compact, and is often as - good as the anticipatory I/O scheduler, and in some database - workloads, better. In the case of a single process performing I/O to - a disk at any one time, its behaviour is almost identical to the - anticipatory I/O scheduler and so is a good choice. + The deadline I/O scheduler is simple and compact. It will provide + CSCAN service with FIFO expiration of requests, switching to + a new point in the service tree and doing a batch of IO from there + in case of expiry. config IOSCHED_CFQ tristate "CFQ I/O scheduler" @@ -47,9 +37,6 @@ choice Select the I/O scheduler which will be used by default for all block devices. - config DEFAULT_AS - bool "Anticipatory" if IOSCHED_AS=y - config DEFAULT_DEADLINE bool "Deadline" if IOSCHED_DEADLINE=y @@ -63,7 +50,6 @@ endchoice config DEFAULT_IOSCHED string - default "anticipatory" if DEFAULT_AS default "deadline" if DEFAULT_DEADLINE default "cfq" if DEFAULT_CFQ default "noop" if DEFAULT_NOOP diff --git a/block/Makefile b/block/Makefile index ba74ca6bfa14..7914108952f2 100644 --- a/block/Makefile +++ b/block/Makefile @@ -9,7 +9,6 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o -obj-$(CONFIG_IOSCHED_AS) += as-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o diff --git a/block/as-iosched.c b/block/as-iosched.c deleted file mode 100644 index ce8ba57c6557..000000000000 --- a/block/as-iosched.c +++ /dev/null @@ -1,1520 +0,0 @@ -/* - * Anticipatory & deadline i/o scheduler. - * - * Copyright (C) 2002 Jens Axboe - * Nick Piggin - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * See Documentation/block/as-iosched.txt - */ - -/* - * max time before a read is submitted. - */ -#define default_read_expire (HZ / 8) - -/* - * ditto for writes, these limits are not hard, even - * if the disk is capable of satisfying them. - */ -#define default_write_expire (HZ / 4) - -/* - * read_batch_expire describes how long we will allow a stream of reads to - * persist before looking to see whether it is time to switch over to writes. - */ -#define default_read_batch_expire (HZ / 2) - -/* - * write_batch_expire describes how long we want a stream of writes to run for. - * This is not a hard limit, but a target we set for the auto-tuning thingy. 
- * See, the problem is: we can send a lot of writes to disk cache / TCQ in - * a short amount of time... - */ -#define default_write_batch_expire (HZ / 8) - -/* - * max time we may wait to anticipate a read (default around 6ms) - */ -#define default_antic_expire ((HZ / 150) ? HZ / 150 : 1) - -/* - * Keep track of up to 20ms thinktimes. We can go as big as we like here, - * however huge values tend to interfere and not decay fast enough. A program - * might be in a non-io phase of operation. Waiting on user input for example, - * or doing a lengthy computation. A small penalty can be justified there, and - * will still catch out those processes that constantly have large thinktimes. - */ -#define MAX_THINKTIME (HZ/50UL) - -/* Bits in as_io_context.state */ -enum as_io_states { - AS_TASK_RUNNING=0, /* Process has not exited */ - AS_TASK_IOSTARTED, /* Process has started some IO */ - AS_TASK_IORUNNING, /* Process has completed some IO */ -}; - -enum anticipation_status { - ANTIC_OFF=0, /* Not anticipating (normal operation) */ - ANTIC_WAIT_REQ, /* The last read has not yet completed */ - ANTIC_WAIT_NEXT, /* Currently anticipating a request vs - last read (which has completed) */ - ANTIC_FINISHED, /* Anticipating but have found a candidate - * or timed out */ -}; - -struct as_data { - /* - * run time data - */ - - struct request_queue *q; /* the "owner" queue */ - - /* - * requests (as_rq s) are present on both sort_list and fifo_list - */ - struct rb_root sort_list[2]; - struct list_head fifo_list[2]; - - struct request *next_rq[2]; /* next in sort order */ - sector_t last_sector[2]; /* last SYNC & ASYNC sectors */ - - unsigned long exit_prob; /* probability a task will exit while - being waited on */ - unsigned long exit_no_coop; /* probablility an exited task will - not be part of a later cooperating - request */ - unsigned long new_ttime_total; /* mean thinktime on new proc */ - unsigned long new_ttime_mean; - u64 new_seek_total; /* mean seek on new proc */ - sector_t new_seek_mean; - - unsigned long current_batch_expires; - unsigned long last_check_fifo[2]; - int changed_batch; /* 1: waiting for old batch to end */ - int new_batch; /* 1: waiting on first read complete */ - int batch_data_dir; /* current batch SYNC / ASYNC */ - int write_batch_count; /* max # of reqs in a write batch */ - int current_write_count; /* how many requests left this batch */ - int write_batch_idled; /* has the write batch gone idle? */ - - enum anticipation_status antic_status; - unsigned long antic_start; /* jiffies: when it started */ - struct timer_list antic_timer; /* anticipatory scheduling timer */ - struct work_struct antic_work; /* Deferred unplugging */ - struct io_context *io_context; /* Identify the expected process */ - int ioc_finished; /* IO associated with io_context is finished */ - int nr_dispatched; - - /* - * settings that change how the i/o scheduler behaves - */ - unsigned long fifo_expire[2]; - unsigned long batch_expire[2]; - unsigned long antic_expire; -}; - -/* - * per-request data. - */ -enum arq_state { - AS_RQ_NEW=0, /* New - not referenced and not on any lists */ - AS_RQ_QUEUED, /* In the request queue. It belongs to the - scheduler */ - AS_RQ_DISPATCHED, /* On the dispatch list. 
It belongs to the - driver now */ - AS_RQ_PRESCHED, /* Debug poisoning for requests being used */ - AS_RQ_REMOVED, - AS_RQ_MERGED, - AS_RQ_POSTSCHED, /* when they shouldn't be */ -}; - -#define RQ_IOC(rq) ((struct io_context *) (rq)->elevator_private) -#define RQ_STATE(rq) ((enum arq_state)(rq)->elevator_private2) -#define RQ_SET_STATE(rq, state) ((rq)->elevator_private2 = (void *) state) - -static DEFINE_PER_CPU(unsigned long, as_ioc_count); -static struct completion *ioc_gone; -static DEFINE_SPINLOCK(ioc_gone_lock); - -static void as_move_to_dispatch(struct as_data *ad, struct request *rq); -static void as_antic_stop(struct as_data *ad); - -/* - * IO Context helper functions - */ - -/* Called to deallocate the as_io_context */ -static void free_as_io_context(struct as_io_context *aic) -{ - kfree(aic); - elv_ioc_count_dec(as_ioc_count); - if (ioc_gone) { - /* - * AS scheduler is exiting, grab exit lock and check - * the pending io context count. If it hits zero, - * complete ioc_gone and set it back to NULL. - */ - spin_lock(&ioc_gone_lock); - if (ioc_gone && !elv_ioc_count_read(as_ioc_count)) { - complete(ioc_gone); - ioc_gone = NULL; - } - spin_unlock(&ioc_gone_lock); - } -} - -static void as_trim(struct io_context *ioc) -{ - spin_lock_irq(&ioc->lock); - if (ioc->aic) - free_as_io_context(ioc->aic); - ioc->aic = NULL; - spin_unlock_irq(&ioc->lock); -} - -/* Called when the task exits */ -static void exit_as_io_context(struct as_io_context *aic) -{ - WARN_ON(!test_bit(AS_TASK_RUNNING, &aic->state)); - clear_bit(AS_TASK_RUNNING, &aic->state); -} - -static struct as_io_context *alloc_as_io_context(void) -{ - struct as_io_context *ret; - - ret = kmalloc(sizeof(*ret), GFP_ATOMIC); - if (ret) { - ret->dtor = free_as_io_context; - ret->exit = exit_as_io_context; - ret->state = 1 << AS_TASK_RUNNING; - atomic_set(&ret->nr_queued, 0); - atomic_set(&ret->nr_dispatched, 0); - spin_lock_init(&ret->lock); - ret->ttime_total = 0; - ret->ttime_samples = 0; - ret->ttime_mean = 0; - ret->seek_total = 0; - ret->seek_samples = 0; - ret->seek_mean = 0; - elv_ioc_count_inc(as_ioc_count); - } - - return ret; -} - -/* - * If the current task has no AS IO context then create one and initialise it. - * Then take a ref on the task's io context and return it. - */ -static struct io_context *as_get_io_context(int node) -{ - struct io_context *ioc = get_io_context(GFP_ATOMIC, node); - if (ioc && !ioc->aic) { - ioc->aic = alloc_as_io_context(); - if (!ioc->aic) { - put_io_context(ioc); - ioc = NULL; - } - } - return ioc; -} - -static void as_put_io_context(struct request *rq) -{ - struct as_io_context *aic; - - if (unlikely(!RQ_IOC(rq))) - return; - - aic = RQ_IOC(rq)->aic; - - if (rq_is_sync(rq) && aic) { - unsigned long flags; - - spin_lock_irqsave(&aic->lock, flags); - set_bit(AS_TASK_IORUNNING, &aic->state); - aic->last_end_request = jiffies; - spin_unlock_irqrestore(&aic->lock, flags); - } - - put_io_context(RQ_IOC(rq)); -} - -/* - * rb tree support functions - */ -#define RQ_RB_ROOT(ad, rq) (&(ad)->sort_list[rq_is_sync((rq))]) - -static void as_add_rq_rb(struct as_data *ad, struct request *rq) -{ - struct request *alias; - - while ((unlikely(alias = elv_rb_add(RQ_RB_ROOT(ad, rq), rq)))) { - as_move_to_dispatch(ad, alias); - as_antic_stop(ad); - } -} - -static inline void as_del_rq_rb(struct as_data *ad, struct request *rq) -{ - elv_rb_del(RQ_RB_ROOT(ad, rq), rq); -} - -/* - * IO Scheduler proper - */ - -#define MAXBACK (1024 * 1024) /* - * Maximum distance the disk will go backward - * for a request. 
- */ - -#define BACK_PENALTY 2 - -/* - * as_choose_req selects the preferred one of two requests of the same data_dir - * ignoring time - eg. timeouts, which is the job of as_dispatch_request - */ -static struct request * -as_choose_req(struct as_data *ad, struct request *rq1, struct request *rq2) -{ - int data_dir; - sector_t last, s1, s2, d1, d2; - int r1_wrap=0, r2_wrap=0; /* requests are behind the disk head */ - const sector_t maxback = MAXBACK; - - if (rq1 == NULL || rq1 == rq2) - return rq2; - if (rq2 == NULL) - return rq1; - - data_dir = rq_is_sync(rq1); - - last = ad->last_sector[data_dir]; - s1 = blk_rq_pos(rq1); - s2 = blk_rq_pos(rq2); - - BUG_ON(data_dir != rq_is_sync(rq2)); - - /* - * Strict one way elevator _except_ in the case where we allow - * short backward seeks which are biased as twice the cost of a - * similar forward seek. - */ - if (s1 >= last) - d1 = s1 - last; - else if (s1+maxback >= last) - d1 = (last - s1)*BACK_PENALTY; - else { - r1_wrap = 1; - d1 = 0; /* shut up, gcc */ - } - - if (s2 >= last) - d2 = s2 - last; - else if (s2+maxback >= last) - d2 = (last - s2)*BACK_PENALTY; - else { - r2_wrap = 1; - d2 = 0; - } - - /* Found required data */ - if (!r1_wrap && r2_wrap) - return rq1; - else if (!r2_wrap && r1_wrap) - return rq2; - else if (r1_wrap && r2_wrap) { - /* both behind the head */ - if (s1 <= s2) - return rq1; - else - return rq2; - } - - /* Both requests in front of the head */ - if (d1 < d2) - return rq1; - else if (d2 < d1) - return rq2; - else { - if (s1 >= s2) - return rq1; - else - return rq2; - } -} - -/* - * as_find_next_rq finds the next request after @prev in elevator order. - * this with as_choose_req form the basis for how the scheduler chooses - * what request to process next. Anticipation works on top of this. - */ -static struct request * -as_find_next_rq(struct as_data *ad, struct request *last) -{ - struct rb_node *rbnext = rb_next(&last->rb_node); - struct rb_node *rbprev = rb_prev(&last->rb_node); - struct request *next = NULL, *prev = NULL; - - BUG_ON(RB_EMPTY_NODE(&last->rb_node)); - - if (rbprev) - prev = rb_entry_rq(rbprev); - - if (rbnext) - next = rb_entry_rq(rbnext); - else { - const int data_dir = rq_is_sync(last); - - rbnext = rb_first(&ad->sort_list[data_dir]); - if (rbnext && rbnext != &last->rb_node) - next = rb_entry_rq(rbnext); - } - - return as_choose_req(ad, next, prev); -} - -/* - * anticipatory scheduling functions follow - */ - -/* - * as_antic_expired tells us when we have anticipated too long. - * The funny "absolute difference" math on the elapsed time is to handle - * jiffy wraps, and disks which have been idle for 0x80000000 jiffies. - */ -static int as_antic_expired(struct as_data *ad) -{ - long delta_jif; - - delta_jif = jiffies - ad->antic_start; - if (unlikely(delta_jif < 0)) - delta_jif = -delta_jif; - if (delta_jif < ad->antic_expire) - return 0; - - return 1; -} - -/* - * as_antic_waitnext starts anticipating that a nice request will soon be - * submitted. See also as_antic_waitreq - */ -static void as_antic_waitnext(struct as_data *ad) -{ - unsigned long timeout; - - BUG_ON(ad->antic_status != ANTIC_OFF - && ad->antic_status != ANTIC_WAIT_REQ); - - timeout = ad->antic_start + ad->antic_expire; - - mod_timer(&ad->antic_timer, timeout); - - ad->antic_status = ANTIC_WAIT_NEXT; -} - -/* - * as_antic_waitreq starts anticipating. We don't start timing the anticipation - * until the request that we're anticipating on has finished. This means we - * are timing from when the candidate process wakes up hopefully. 
- */ -static void as_antic_waitreq(struct as_data *ad) -{ - BUG_ON(ad->antic_status == ANTIC_FINISHED); - if (ad->antic_status == ANTIC_OFF) { - if (!ad->io_context || ad->ioc_finished) - as_antic_waitnext(ad); - else - ad->antic_status = ANTIC_WAIT_REQ; - } -} - -/* - * This is called directly by the functions in this file to stop anticipation. - * We kill the timer and schedule a call to the request_fn asap. - */ -static void as_antic_stop(struct as_data *ad) -{ - int status = ad->antic_status; - - if (status == ANTIC_WAIT_REQ || status == ANTIC_WAIT_NEXT) { - if (status == ANTIC_WAIT_NEXT) - del_timer(&ad->antic_timer); - ad->antic_status = ANTIC_FINISHED; - /* see as_work_handler */ - kblockd_schedule_work(ad->q, &ad->antic_work); - } -} - -/* - * as_antic_timeout is the timer function set by as_antic_waitnext. - */ -static void as_antic_timeout(unsigned long data) -{ - struct request_queue *q = (struct request_queue *)data; - struct as_data *ad = q->elevator->elevator_data; - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - if (ad->antic_status == ANTIC_WAIT_REQ - || ad->antic_status == ANTIC_WAIT_NEXT) { - struct as_io_context *aic; - spin_lock(&ad->io_context->lock); - aic = ad->io_context->aic; - - ad->antic_status = ANTIC_FINISHED; - kblockd_schedule_work(q, &ad->antic_work); - - if (aic->ttime_samples == 0) { - /* process anticipated on has exited or timed out*/ - ad->exit_prob = (7*ad->exit_prob + 256)/8; - } - if (!test_bit(AS_TASK_RUNNING, &aic->state)) { - /* process not "saved" by a cooperating request */ - ad->exit_no_coop = (7*ad->exit_no_coop + 256)/8; - } - spin_unlock(&ad->io_context->lock); - } - spin_unlock_irqrestore(q->queue_lock, flags); -} - -static void as_update_thinktime(struct as_data *ad, struct as_io_context *aic, - unsigned long ttime) -{ - /* fixed point: 1.0 == 1<<8 */ - if (aic->ttime_samples == 0) { - ad->new_ttime_total = (7*ad->new_ttime_total + 256*ttime) / 8; - ad->new_ttime_mean = ad->new_ttime_total / 256; - - ad->exit_prob = (7*ad->exit_prob)/8; - } - aic->ttime_samples = (7*aic->ttime_samples + 256) / 8; - aic->ttime_total = (7*aic->ttime_total + 256*ttime) / 8; - aic->ttime_mean = (aic->ttime_total + 128) / aic->ttime_samples; -} - -static void as_update_seekdist(struct as_data *ad, struct as_io_context *aic, - sector_t sdist) -{ - u64 total; - - if (aic->seek_samples == 0) { - ad->new_seek_total = (7*ad->new_seek_total + 256*(u64)sdist)/8; - ad->new_seek_mean = ad->new_seek_total / 256; - } - - /* - * Don't allow the seek distance to get too large from the - * odd fragment, pagein, etc - */ - if (aic->seek_samples <= 60) /* second&third seek */ - sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*1024); - else - sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*64); - - aic->seek_samples = (7*aic->seek_samples + 256) / 8; - aic->seek_total = (7*aic->seek_total + (u64)256*sdist) / 8; - total = aic->seek_total + (aic->seek_samples/2); - do_div(total, aic->seek_samples); - aic->seek_mean = (sector_t)total; -} - -/* - * as_update_iohist keeps a decaying histogram of IO thinktimes, and - * updates @aic->ttime_mean based on that. It is called when a new - * request is queued. 
- */ -static void as_update_iohist(struct as_data *ad, struct as_io_context *aic, - struct request *rq) -{ - int data_dir = rq_is_sync(rq); - unsigned long thinktime = 0; - sector_t seek_dist; - - if (aic == NULL) - return; - - if (data_dir == BLK_RW_SYNC) { - unsigned long in_flight = atomic_read(&aic->nr_queued) - + atomic_read(&aic->nr_dispatched); - spin_lock(&aic->lock); - if (test_bit(AS_TASK_IORUNNING, &aic->state) || - test_bit(AS_TASK_IOSTARTED, &aic->state)) { - /* Calculate read -> read thinktime */ - if (test_bit(AS_TASK_IORUNNING, &aic->state) - && in_flight == 0) { - thinktime = jiffies - aic->last_end_request; - thinktime = min(thinktime, MAX_THINKTIME-1); - } - as_update_thinktime(ad, aic, thinktime); - - /* Calculate read -> read seek distance */ - if (aic->last_request_pos < blk_rq_pos(rq)) - seek_dist = blk_rq_pos(rq) - - aic->last_request_pos; - else - seek_dist = aic->last_request_pos - - blk_rq_pos(rq); - as_update_seekdist(ad, aic, seek_dist); - } - aic->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); - set_bit(AS_TASK_IOSTARTED, &aic->state); - spin_unlock(&aic->lock); - } -} - -/* - * as_close_req decides if one request is considered "close" to the - * previous one issued. - */ -static int as_close_req(struct as_data *ad, struct as_io_context *aic, - struct request *rq) -{ - unsigned long delay; /* jiffies */ - sector_t last = ad->last_sector[ad->batch_data_dir]; - sector_t next = blk_rq_pos(rq); - sector_t delta; /* acceptable close offset (in sectors) */ - sector_t s; - - if (ad->antic_status == ANTIC_OFF || !ad->ioc_finished) - delay = 0; - else - delay = jiffies - ad->antic_start; - - if (delay == 0) - delta = 8192; - else if (delay <= (20 * HZ / 1000) && delay <= ad->antic_expire) - delta = 8192 << delay; - else - return 1; - - if ((last <= next + (delta>>1)) && (next <= last + delta)) - return 1; - - if (last < next) - s = next - last; - else - s = last - next; - - if (aic->seek_samples == 0) { - /* - * Process has just started IO. Use past statistics to - * gauge success possibility - */ - if (ad->new_seek_mean > s) { - /* this request is better than what we're expecting */ - return 1; - } - - } else { - if (aic->seek_mean > s) { - /* this request is better than what we're expecting */ - return 1; - } - } - - return 0; -} - -/* - * as_can_break_anticipation returns true if we have been anticipating this - * request. - * - * It also returns true if the process against which we are anticipating - * submits a write - that's presumably an fsync, O_SYNC write, etc. We want to - * dispatch it ASAP, because we know that application will not be submitting - * any new reads. - * - * If the task which has submitted the request has exited, break anticipation. - * - * If this task has queued some other IO, do not enter enticipation. - */ -static int as_can_break_anticipation(struct as_data *ad, struct request *rq) -{ - struct io_context *ioc; - struct as_io_context *aic; - - ioc = ad->io_context; - BUG_ON(!ioc); - spin_lock(&ioc->lock); - - if (rq && ioc == RQ_IOC(rq)) { - /* request from same process */ - spin_unlock(&ioc->lock); - return 1; - } - - if (ad->ioc_finished && as_antic_expired(ad)) { - /* - * In this situation status should really be FINISHED, - * however the timer hasn't had the chance to run yet. 
- */ - spin_unlock(&ioc->lock); - return 1; - } - - aic = ioc->aic; - if (!aic) { - spin_unlock(&ioc->lock); - return 0; - } - - if (atomic_read(&aic->nr_queued) > 0) { - /* process has more requests queued */ - spin_unlock(&ioc->lock); - return 1; - } - - if (atomic_read(&aic->nr_dispatched) > 0) { - /* process has more requests dispatched */ - spin_unlock(&ioc->lock); - return 1; - } - - if (rq && rq_is_sync(rq) && as_close_req(ad, aic, rq)) { - /* - * Found a close request that is not one of ours. - * - * This makes close requests from another process update - * our IO history. Is generally useful when there are - * two or more cooperating processes working in the same - * area. - */ - if (!test_bit(AS_TASK_RUNNING, &aic->state)) { - if (aic->ttime_samples == 0) - ad->exit_prob = (7*ad->exit_prob + 256)/8; - - ad->exit_no_coop = (7*ad->exit_no_coop)/8; - } - - as_update_iohist(ad, aic, rq); - spin_unlock(&ioc->lock); - return 1; - } - - if (!test_bit(AS_TASK_RUNNING, &aic->state)) { - /* process anticipated on has exited */ - if (aic->ttime_samples == 0) - ad->exit_prob = (7*ad->exit_prob + 256)/8; - - if (ad->exit_no_coop > 128) { - spin_unlock(&ioc->lock); - return 1; - } - } - - if (aic->ttime_samples == 0) { - if (ad->new_ttime_mean > ad->antic_expire) { - spin_unlock(&ioc->lock); - return 1; - } - if (ad->exit_prob * ad->exit_no_coop > 128*256) { - spin_unlock(&ioc->lock); - return 1; - } - } else if (aic->ttime_mean > ad->antic_expire) { - /* the process thinks too much between requests */ - spin_unlock(&ioc->lock); - return 1; - } - spin_unlock(&ioc->lock); - return 0; -} - -/* - * as_can_anticipate indicates whether we should either run rq - * or keep anticipating a better request. - */ -static int as_can_anticipate(struct as_data *ad, struct request *rq) -{ -#if 0 /* disable for now, we need to check tag level as well */ - /* - * SSD device without seek penalty, disable idling - */ - if (blk_queue_nonrot(ad->q)) axman - return 0; -#endif - - if (!ad->io_context) - /* - * Last request submitted was a write - */ - return 0; - - if (ad->antic_status == ANTIC_FINISHED) - /* - * Don't restart if we have just finished. Run the next request - */ - return 0; - - if (as_can_break_anticipation(ad, rq)) - /* - * This request is a good candidate. Don't keep anticipating, - * run it. - */ - return 0; - - /* - * OK from here, we haven't finished, and don't have a decent request! - * Status is either ANTIC_OFF so start waiting, - * ANTIC_WAIT_REQ so continue waiting for request to finish - * or ANTIC_WAIT_NEXT so continue waiting for an acceptable request. - */ - - return 1; -} - -/* - * as_update_rq must be called whenever a request (rq) is added to - * the sort_list. This function keeps caches up to date, and checks if the - * request might be one we are "anticipating" - */ -static void as_update_rq(struct as_data *ad, struct request *rq) -{ - const int data_dir = rq_is_sync(rq); - - /* keep the next_rq cache up to date */ - ad->next_rq[data_dir] = as_choose_req(ad, rq, ad->next_rq[data_dir]); - - /* - * have we been anticipating this request? - * or does it come from the same process as the one we are anticipating - * for? 
- */ - if (ad->antic_status == ANTIC_WAIT_REQ - || ad->antic_status == ANTIC_WAIT_NEXT) { - if (as_can_break_anticipation(ad, rq)) - as_antic_stop(ad); - } -} - -/* - * Gathers timings and resizes the write batch automatically - */ -static void update_write_batch(struct as_data *ad) -{ - unsigned long batch = ad->batch_expire[BLK_RW_ASYNC]; - long write_time; - - write_time = (jiffies - ad->current_batch_expires) + batch; - if (write_time < 0) - write_time = 0; - - if (write_time > batch && !ad->write_batch_idled) { - if (write_time > batch * 3) - ad->write_batch_count /= 2; - else - ad->write_batch_count--; - } else if (write_time < batch && ad->current_write_count == 0) { - if (batch > write_time * 3) - ad->write_batch_count *= 2; - else - ad->write_batch_count++; - } - - if (ad->write_batch_count < 1) - ad->write_batch_count = 1; -} - -/* - * as_completed_request is to be called when a request has completed and - * returned something to the requesting process, be it an error or data. - */ -static void as_completed_request(struct request_queue *q, struct request *rq) -{ - struct as_data *ad = q->elevator->elevator_data; - - WARN_ON(!list_empty(&rq->queuelist)); - - if (RQ_STATE(rq) != AS_RQ_REMOVED) { - WARN(1, "rq->state %d\n", RQ_STATE(rq)); - goto out; - } - - if (ad->changed_batch && ad->nr_dispatched == 1) { - ad->current_batch_expires = jiffies + - ad->batch_expire[ad->batch_data_dir]; - kblockd_schedule_work(q, &ad->antic_work); - ad->changed_batch = 0; - - if (ad->batch_data_dir == BLK_RW_SYNC) - ad->new_batch = 1; - } - WARN_ON(ad->nr_dispatched == 0); - ad->nr_dispatched--; - - /* - * Start counting the batch from when a request of that direction is - * actually serviced. This should help devices with big TCQ windows - * and writeback caches - */ - if (ad->new_batch && ad->batch_data_dir == rq_is_sync(rq)) { - update_write_batch(ad); - ad->current_batch_expires = jiffies + - ad->batch_expire[BLK_RW_SYNC]; - ad->new_batch = 0; - } - - if (ad->io_context == RQ_IOC(rq) && ad->io_context) { - ad->antic_start = jiffies; - ad->ioc_finished = 1; - if (ad->antic_status == ANTIC_WAIT_REQ) { - /* - * We were waiting on this request, now anticipate - * the next one - */ - as_antic_waitnext(ad); - } - } - - as_put_io_context(rq); -out: - RQ_SET_STATE(rq, AS_RQ_POSTSCHED); -} - -/* - * as_remove_queued_request removes a request from the pre dispatch queue - * without updating refcounts. It is expected the caller will drop the - * reference unless it replaces the request at somepart of the elevator - * (ie. the dispatch queue) - */ -static void as_remove_queued_request(struct request_queue *q, - struct request *rq) -{ - const int data_dir = rq_is_sync(rq); - struct as_data *ad = q->elevator->elevator_data; - struct io_context *ioc; - - WARN_ON(RQ_STATE(rq) != AS_RQ_QUEUED); - - ioc = RQ_IOC(rq); - if (ioc && ioc->aic) { - BUG_ON(!atomic_read(&ioc->aic->nr_queued)); - atomic_dec(&ioc->aic->nr_queued); - } - - /* - * Update the "next_rq" cache if we are about to remove its - * entry - */ - if (ad->next_rq[data_dir] == rq) - ad->next_rq[data_dir] = as_find_next_rq(ad, rq); - - rq_fifo_clear(rq); - as_del_rq_rb(ad, rq); -} - -/* - * as_fifo_expired returns 0 if there are no expired requests on the fifo, - * 1 otherwise. It is ratelimited so that we only perform the check once per - * `fifo_expire' interval. Otherwise a large number of expired requests - * would create a hopeless seekstorm. - * - * See as_antic_expired comment. 
- */ -static int as_fifo_expired(struct as_data *ad, int adir) -{ - struct request *rq; - long delta_jif; - - delta_jif = jiffies - ad->last_check_fifo[adir]; - if (unlikely(delta_jif < 0)) - delta_jif = -delta_jif; - if (delta_jif < ad->fifo_expire[adir]) - return 0; - - ad->last_check_fifo[adir] = jiffies; - - if (list_empty(&ad->fifo_list[adir])) - return 0; - - rq = rq_entry_fifo(ad->fifo_list[adir].next); - - return time_after(jiffies, rq_fifo_time(rq)); -} - -/* - * as_batch_expired returns true if the current batch has expired. A batch - * is a set of reads or a set of writes. - */ -static inline int as_batch_expired(struct as_data *ad) -{ - if (ad->changed_batch || ad->new_batch) - return 0; - - if (ad->batch_data_dir == BLK_RW_SYNC) - /* TODO! add a check so a complete fifo gets written? */ - return time_after(jiffies, ad->current_batch_expires); - - return time_after(jiffies, ad->current_batch_expires) - || ad->current_write_count == 0; -} - -/* - * move an entry to dispatch queue - */ -static void as_move_to_dispatch(struct as_data *ad, struct request *rq) -{ - const int data_dir = rq_is_sync(rq); - - BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); - - as_antic_stop(ad); - ad->antic_status = ANTIC_OFF; - - /* - * This has to be set in order to be correctly updated by - * as_find_next_rq - */ - ad->last_sector[data_dir] = blk_rq_pos(rq) + blk_rq_sectors(rq); - - if (data_dir == BLK_RW_SYNC) { - struct io_context *ioc = RQ_IOC(rq); - /* In case we have to anticipate after this */ - copy_io_context(&ad->io_context, &ioc); - } else { - if (ad->io_context) { - put_io_context(ad->io_context); - ad->io_context = NULL; - } - - if (ad->current_write_count != 0) - ad->current_write_count--; - } - ad->ioc_finished = 0; - - ad->next_rq[data_dir] = as_find_next_rq(ad, rq); - - /* - * take it off the sort and fifo list, add to dispatch queue - */ - as_remove_queued_request(ad->q, rq); - WARN_ON(RQ_STATE(rq) != AS_RQ_QUEUED); - - elv_dispatch_sort(ad->q, rq); - - RQ_SET_STATE(rq, AS_RQ_DISPATCHED); - if (RQ_IOC(rq) && RQ_IOC(rq)->aic) - atomic_inc(&RQ_IOC(rq)->aic->nr_dispatched); - ad->nr_dispatched++; -} - -/* - * as_dispatch_request selects the best request according to - * read/write expire, batch expire, etc, and moves it to the dispatch - * queue. Returns 1 if a request was found, 0 otherwise. - */ -static int as_dispatch_request(struct request_queue *q, int force) -{ - struct as_data *ad = q->elevator->elevator_data; - const int reads = !list_empty(&ad->fifo_list[BLK_RW_SYNC]); - const int writes = !list_empty(&ad->fifo_list[BLK_RW_ASYNC]); - struct request *rq; - - if (unlikely(force)) { - /* - * Forced dispatch, accounting is useless. Reset - * accounting states and dump fifo_lists. Note that - * batch_data_dir is reset to BLK_RW_SYNC to avoid - * screwing write batch accounting as write batch - * accounting occurs on W->R transition. 
- */ - int dispatched = 0; - - ad->batch_data_dir = BLK_RW_SYNC; - ad->changed_batch = 0; - ad->new_batch = 0; - - while (ad->next_rq[BLK_RW_SYNC]) { - as_move_to_dispatch(ad, ad->next_rq[BLK_RW_SYNC]); - dispatched++; - } - ad->last_check_fifo[BLK_RW_SYNC] = jiffies; - - while (ad->next_rq[BLK_RW_ASYNC]) { - as_move_to_dispatch(ad, ad->next_rq[BLK_RW_ASYNC]); - dispatched++; - } - ad->last_check_fifo[BLK_RW_ASYNC] = jiffies; - - return dispatched; - } - - /* Signal that the write batch was uncontended, so we can't time it */ - if (ad->batch_data_dir == BLK_RW_ASYNC && !reads) { - if (ad->current_write_count == 0 || !writes) - ad->write_batch_idled = 1; - } - - if (!(reads || writes) - || ad->antic_status == ANTIC_WAIT_REQ - || ad->antic_status == ANTIC_WAIT_NEXT - || ad->changed_batch) - return 0; - - if (!(reads && writes && as_batch_expired(ad))) { - /* - * batch is still running or no reads or no writes - */ - rq = ad->next_rq[ad->batch_data_dir]; - - if (ad->batch_data_dir == BLK_RW_SYNC && ad->antic_expire) { - if (as_fifo_expired(ad, BLK_RW_SYNC)) - goto fifo_expired; - - if (as_can_anticipate(ad, rq)) { - as_antic_waitreq(ad); - return 0; - } - } - - if (rq) { - /* we have a "next request" */ - if (reads && !writes) - ad->current_batch_expires = - jiffies + ad->batch_expire[BLK_RW_SYNC]; - goto dispatch_request; - } - } - - /* - * at this point we are not running a batch. select the appropriate - * data direction (read / write) - */ - - if (reads) { - BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[BLK_RW_SYNC])); - - if (writes && ad->batch_data_dir == BLK_RW_SYNC) - /* - * Last batch was a read, switch to writes - */ - goto dispatch_writes; - - if (ad->batch_data_dir == BLK_RW_ASYNC) { - WARN_ON(ad->new_batch); - ad->changed_batch = 1; - } - ad->batch_data_dir = BLK_RW_SYNC; - rq = rq_entry_fifo(ad->fifo_list[BLK_RW_SYNC].next); - ad->last_check_fifo[ad->batch_data_dir] = jiffies; - goto dispatch_request; - } - - /* - * the last batch was a read - */ - - if (writes) { -dispatch_writes: - BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[BLK_RW_ASYNC])); - - if (ad->batch_data_dir == BLK_RW_SYNC) { - ad->changed_batch = 1; - - /* - * new_batch might be 1 when the queue runs out of - * reads. A subsequent submission of a write might - * cause a change of batch before the read is finished. - */ - ad->new_batch = 0; - } - ad->batch_data_dir = BLK_RW_ASYNC; - ad->current_write_count = ad->write_batch_count; - ad->write_batch_idled = 0; - rq = rq_entry_fifo(ad->fifo_list[BLK_RW_ASYNC].next); - ad->last_check_fifo[BLK_RW_ASYNC] = jiffies; - goto dispatch_request; - } - - BUG(); - return 0; - -dispatch_request: - /* - * If a request has expired, service it. - */ - - if (as_fifo_expired(ad, ad->batch_data_dir)) { -fifo_expired: - rq = rq_entry_fifo(ad->fifo_list[ad->batch_data_dir].next); - } - - if (ad->changed_batch) { - WARN_ON(ad->new_batch); - - if (ad->nr_dispatched) - return 0; - - if (ad->batch_data_dir == BLK_RW_ASYNC) - ad->current_batch_expires = jiffies + - ad->batch_expire[BLK_RW_ASYNC]; - else - ad->new_batch = 1; - - ad->changed_batch = 0; - } - - /* - * rq is the selected appropriate request. 
- */ - as_move_to_dispatch(ad, rq); - - return 1; -} - -/* - * add rq to rbtree and fifo - */ -static void as_add_request(struct request_queue *q, struct request *rq) -{ - struct as_data *ad = q->elevator->elevator_data; - int data_dir; - - RQ_SET_STATE(rq, AS_RQ_NEW); - - data_dir = rq_is_sync(rq); - - rq->elevator_private = as_get_io_context(q->node); - - if (RQ_IOC(rq)) { - as_update_iohist(ad, RQ_IOC(rq)->aic, rq); - atomic_inc(&RQ_IOC(rq)->aic->nr_queued); - } - - as_add_rq_rb(ad, rq); - - /* - * set expire time and add to fifo list - */ - rq_set_fifo_time(rq, jiffies + ad->fifo_expire[data_dir]); - list_add_tail(&rq->queuelist, &ad->fifo_list[data_dir]); - - as_update_rq(ad, rq); /* keep state machine up to date */ - RQ_SET_STATE(rq, AS_RQ_QUEUED); -} - -static void as_activate_request(struct request_queue *q, struct request *rq) -{ - WARN_ON(RQ_STATE(rq) != AS_RQ_DISPATCHED); - RQ_SET_STATE(rq, AS_RQ_REMOVED); - if (RQ_IOC(rq) && RQ_IOC(rq)->aic) - atomic_dec(&RQ_IOC(rq)->aic->nr_dispatched); -} - -static void as_deactivate_request(struct request_queue *q, struct request *rq) -{ - WARN_ON(RQ_STATE(rq) != AS_RQ_REMOVED); - RQ_SET_STATE(rq, AS_RQ_DISPATCHED); - if (RQ_IOC(rq) && RQ_IOC(rq)->aic) - atomic_inc(&RQ_IOC(rq)->aic->nr_dispatched); -} - -/* - * as_queue_empty tells us if there are requests left in the device. It may - * not be the case that a driver can get the next request even if the queue - * is not empty - it is used in the block layer to check for plugging and - * merging opportunities - */ -static int as_queue_empty(struct request_queue *q) -{ - struct as_data *ad = q->elevator->elevator_data; - - return list_empty(&ad->fifo_list[BLK_RW_ASYNC]) - && list_empty(&ad->fifo_list[BLK_RW_SYNC]); -} - -static int -as_merge(struct request_queue *q, struct request **req, struct bio *bio) -{ - struct as_data *ad = q->elevator->elevator_data; - sector_t rb_key = bio->bi_sector + bio_sectors(bio); - struct request *__rq; - - /* - * check for front merge - */ - __rq = elv_rb_find(&ad->sort_list[bio_data_dir(bio)], rb_key); - if (__rq && elv_rq_merge_ok(__rq, bio)) { - *req = __rq; - return ELEVATOR_FRONT_MERGE; - } - - return ELEVATOR_NO_MERGE; -} - -static void as_merged_request(struct request_queue *q, struct request *req, - int type) -{ - struct as_data *ad = q->elevator->elevator_data; - - /* - * if the merge was a front merge, we need to reposition request - */ - if (type == ELEVATOR_FRONT_MERGE) { - as_del_rq_rb(ad, req); - as_add_rq_rb(ad, req); - /* - * Note! At this stage of this and the next function, our next - * request may not be optimal - eg the request may have "grown" - * behind the disk head. We currently don't bother adjusting. - */ - } -} - -static void as_merged_requests(struct request_queue *q, struct request *req, - struct request *next) -{ - /* - * if next expires before rq, assign its expire time to arq - * and move into next position (next will be deleted) in fifo - */ - if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) { - if (time_before(rq_fifo_time(next), rq_fifo_time(req))) { - list_move(&req->queuelist, &next->queuelist); - rq_set_fifo_time(req, rq_fifo_time(next)); - } - } - - /* - * kill knowledge of next, this one is a goner - */ - as_remove_queued_request(q, next); - as_put_io_context(next); - - RQ_SET_STATE(next, AS_RQ_MERGED); -} - -/* - * This is executed in a "deferred" process context, by kblockd. It calls the - * driver's request_fn so the driver can submit that request. - * - * IMPORTANT! 
This guy will reenter the elevator, so set up all queue global - * state before calling, and don't rely on any state over calls. - * - * FIXME! dispatch queue is not a queue at all! - */ -static void as_work_handler(struct work_struct *work) -{ - struct as_data *ad = container_of(work, struct as_data, antic_work); - - blk_run_queue(ad->q); -} - -static int as_may_queue(struct request_queue *q, int rw) -{ - int ret = ELV_MQUEUE_MAY; - struct as_data *ad = q->elevator->elevator_data; - struct io_context *ioc; - if (ad->antic_status == ANTIC_WAIT_REQ || - ad->antic_status == ANTIC_WAIT_NEXT) { - ioc = as_get_io_context(q->node); - if (ad->io_context == ioc) - ret = ELV_MQUEUE_MUST; - put_io_context(ioc); - } - - return ret; -} - -static void as_exit_queue(struct elevator_queue *e) -{ - struct as_data *ad = e->elevator_data; - - del_timer_sync(&ad->antic_timer); - cancel_work_sync(&ad->antic_work); - - BUG_ON(!list_empty(&ad->fifo_list[BLK_RW_SYNC])); - BUG_ON(!list_empty(&ad->fifo_list[BLK_RW_ASYNC])); - - put_io_context(ad->io_context); - kfree(ad); -} - -/* - * initialize elevator private data (as_data). - */ -static void *as_init_queue(struct request_queue *q) -{ - struct as_data *ad; - - ad = kmalloc_node(sizeof(*ad), GFP_KERNEL | __GFP_ZERO, q->node); - if (!ad) - return NULL; - - ad->q = q; /* Identify what queue the data belongs to */ - - /* anticipatory scheduling helpers */ - ad->antic_timer.function = as_antic_timeout; - ad->antic_timer.data = (unsigned long)q; - init_timer(&ad->antic_timer); - INIT_WORK(&ad->antic_work, as_work_handler); - - INIT_LIST_HEAD(&ad->fifo_list[BLK_RW_SYNC]); - INIT_LIST_HEAD(&ad->fifo_list[BLK_RW_ASYNC]); - ad->sort_list[BLK_RW_SYNC] = RB_ROOT; - ad->sort_list[BLK_RW_ASYNC] = RB_ROOT; - ad->fifo_expire[BLK_RW_SYNC] = default_read_expire; - ad->fifo_expire[BLK_RW_ASYNC] = default_write_expire; - ad->antic_expire = default_antic_expire; - ad->batch_expire[BLK_RW_SYNC] = default_read_batch_expire; - ad->batch_expire[BLK_RW_ASYNC] = default_write_batch_expire; - - ad->current_batch_expires = jiffies + ad->batch_expire[BLK_RW_SYNC]; - ad->write_batch_count = ad->batch_expire[BLK_RW_ASYNC] / 10; - if (ad->write_batch_count < 2) - ad->write_batch_count = 2; - - return ad; -} - -/* - * sysfs parts below - */ - -static ssize_t -as_var_show(unsigned int var, char *page) -{ - return sprintf(page, "%d\n", var); -} - -static ssize_t -as_var_store(unsigned long *var, const char *page, size_t count) -{ - char *p = (char *) page; - - *var = simple_strtoul(p, &p, 10); - return count; -} - -static ssize_t est_time_show(struct elevator_queue *e, char *page) -{ - struct as_data *ad = e->elevator_data; - int pos = 0; - - pos += sprintf(page+pos, "%lu %% exit probability\n", - 100*ad->exit_prob/256); - pos += sprintf(page+pos, "%lu %% probability of exiting without a " - "cooperating process submitting IO\n", - 100*ad->exit_no_coop/256); - pos += sprintf(page+pos, "%lu ms new thinktime\n", ad->new_ttime_mean); - pos += sprintf(page+pos, "%llu sectors new seek distance\n", - (unsigned long long)ad->new_seek_mean); - - return pos; -} - -#define SHOW_FUNCTION(__FUNC, __VAR) \ -static ssize_t __FUNC(struct elevator_queue *e, char *page) \ -{ \ - struct as_data *ad = e->elevator_data; \ - return as_var_show(jiffies_to_msecs((__VAR)), (page)); \ -} -SHOW_FUNCTION(as_read_expire_show, ad->fifo_expire[BLK_RW_SYNC]); -SHOW_FUNCTION(as_write_expire_show, ad->fifo_expire[BLK_RW_ASYNC]); -SHOW_FUNCTION(as_antic_expire_show, ad->antic_expire); -SHOW_FUNCTION(as_read_batch_expire_show, 
ad->batch_expire[BLK_RW_SYNC]); -SHOW_FUNCTION(as_write_batch_expire_show, ad->batch_expire[BLK_RW_ASYNC]); -#undef SHOW_FUNCTION - -#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ -static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ -{ \ - struct as_data *ad = e->elevator_data; \ - int ret = as_var_store(__PTR, (page), count); \ - if (*(__PTR) < (MIN)) \ - *(__PTR) = (MIN); \ - else if (*(__PTR) > (MAX)) \ - *(__PTR) = (MAX); \ - *(__PTR) = msecs_to_jiffies(*(__PTR)); \ - return ret; \ -} -STORE_FUNCTION(as_read_expire_store, &ad->fifo_expire[BLK_RW_SYNC], 0, INT_MAX); -STORE_FUNCTION(as_write_expire_store, - &ad->fifo_expire[BLK_RW_ASYNC], 0, INT_MAX); -STORE_FUNCTION(as_antic_expire_store, &ad->antic_expire, 0, INT_MAX); -STORE_FUNCTION(as_read_batch_expire_store, - &ad->batch_expire[BLK_RW_SYNC], 0, INT_MAX); -STORE_FUNCTION(as_write_batch_expire_store, - &ad->batch_expire[BLK_RW_ASYNC], 0, INT_MAX); -#undef STORE_FUNCTION - -#define AS_ATTR(name) \ - __ATTR(name, S_IRUGO|S_IWUSR, as_##name##_show, as_##name##_store) - -static struct elv_fs_entry as_attrs[] = { - __ATTR_RO(est_time), - AS_ATTR(read_expire), - AS_ATTR(write_expire), - AS_ATTR(antic_expire), - AS_ATTR(read_batch_expire), - AS_ATTR(write_batch_expire), - __ATTR_NULL -}; - -static struct elevator_type iosched_as = { - .ops = { - .elevator_merge_fn = as_merge, - .elevator_merged_fn = as_merged_request, - .elevator_merge_req_fn = as_merged_requests, - .elevator_dispatch_fn = as_dispatch_request, - .elevator_add_req_fn = as_add_request, - .elevator_activate_req_fn = as_activate_request, - .elevator_deactivate_req_fn = as_deactivate_request, - .elevator_queue_empty_fn = as_queue_empty, - .elevator_completed_req_fn = as_completed_request, - .elevator_former_req_fn = elv_rb_former_request, - .elevator_latter_req_fn = elv_rb_latter_request, - .elevator_may_queue_fn = as_may_queue, - .elevator_init_fn = as_init_queue, - .elevator_exit_fn = as_exit_queue, - .trim = as_trim, - }, - - .elevator_attrs = as_attrs, - .elevator_name = "anticipatory", - .elevator_owner = THIS_MODULE, -}; - -static int __init as_init(void) -{ - elv_register(&iosched_as); - - return 0; -} - -static void __exit as_exit(void) -{ - DECLARE_COMPLETION_ONSTACK(all_gone); - elv_unregister(&iosched_as); - ioc_gone = &all_gone; - /* ioc_gone's update must be visible before reading ioc_count */ - smp_wmb(); - if (elv_ioc_count_read(as_ioc_count)) - wait_for_completion(&all_gone); - synchronize_rcu(); -} - -module_init(as_init); -module_exit(as_exit); - -MODULE_AUTHOR("Nick Piggin"); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("anticipatory IO scheduler"); diff --git a/block/elevator.c b/block/elevator.c index 1975b619c86d..bb30f0e92d4d 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -154,10 +154,7 @@ static struct elevator_type *elevator_get(const char *name) spin_unlock(&elv_list_lock); - if (!strcmp(name, "anticipatory")) - sprintf(elv, "as-iosched"); - else - sprintf(elv, "%s-iosched", name); + sprintf(elv, "%s-iosched", name); request_module("%s", elv); spin_lock(&elv_list_lock); @@ -193,10 +190,7 @@ static int __init elevator_setup(char *str) * Be backwards-compatible with previous kernels, so users * won't get the wrong elevator. 
*/ - if (!strcmp(str, "as")) - strcpy(chosen_elevator, "anticipatory"); - else - strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1); + strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1); return 1; } -- cgit v1.2.2 From 08dc8726d4be85bca793141c827574fd32a681bb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 3 Oct 2009 09:40:47 +0200 Subject: block: CFQ is more than a desktop scheduler Update Kconfig.iosched entry. Signed-off-by: Jens Axboe --- block/Kconfig.iosched | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index baad3dae3655..8bd105115a69 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -27,7 +27,9 @@ config IOSCHED_CFQ ---help--- The CFQ I/O scheduler tries to distribute bandwidth equally among all processes in the system. It should provide a fair - working environment, suitable for desktop systems. + and low latency working environment, suitable for both desktop + and server systems. + This is the default I/O scheduler. choice -- cgit v1.2.2 From b2c18e1e08a5a9663094d57bb4be2f02226ee61c Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Fri, 23 Oct 2009 17:14:49 -0400 Subject: cfq: calculate the seek_mean per cfq_queue not per cfq_io_context async cfq_queue's are already shared between processes within the same priority, and forthcoming patches will change the mapping of cic to sync cfq_queue from 1:1 to 1:N. So, calculate the seekiness of a process based on the cfq_queue instead of the cfq_io_context. Signed-off-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 68 ++++++++++++++++++++++++++--------------------------- 1 file changed, 33 insertions(+), 35 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 069a61017c02..78cc8ee5da41 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -112,6 +112,11 @@ struct cfq_queue { unsigned short ioprio, org_ioprio; unsigned short ioprio_class, org_ioprio_class; + unsigned int seek_samples; + u64 seek_total; + sector_t seek_mean; + sector_t last_request_pos; + pid_t pid; }; @@ -962,16 +967,16 @@ static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd, return cfqd->last_position - blk_rq_pos(rq); } -#define CIC_SEEK_THR 8 * 1024 -#define CIC_SEEKY(cic) ((cic)->seek_mean > CIC_SEEK_THR) +#define CFQQ_SEEK_THR 8 * 1024 +#define CFQQ_SEEKY(cfqq) ((cfqq)->seek_mean > CFQQ_SEEK_THR) -static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq) +static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq, + struct request *rq) { - struct cfq_io_context *cic = cfqd->active_cic; - sector_t sdist = cic->seek_mean; + sector_t sdist = cfqq->seek_mean; - if (!sample_valid(cic->seek_samples)) - sdist = CIC_SEEK_THR; + if (!sample_valid(cfqq->seek_samples)) + sdist = CFQQ_SEEK_THR; return cfq_dist_from_last(cfqd, rq) <= sdist; } @@ -1000,7 +1005,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, * will contain the closest sector. 
*/ __cfqq = rb_entry(parent, struct cfq_queue, p_node); - if (cfq_rq_close(cfqd, __cfqq->next_rq)) + if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) return __cfqq; if (blk_rq_pos(__cfqq->next_rq) < sector) @@ -1011,7 +1016,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, return NULL; __cfqq = rb_entry(node, struct cfq_queue, p_node); - if (cfq_rq_close(cfqd, __cfqq->next_rq)) + if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) return __cfqq; return NULL; @@ -1033,13 +1038,6 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, { struct cfq_queue *cfqq; - /* - * A valid cfq_io_context is necessary to compare requests against - * the seek_mean of the current cfqq. - */ - if (!cfqd->active_cic) - return NULL; - /* * We should notice if some of the queues are cooperating, eg * working closely on the same area of the disk. In that case, @@ -1110,7 +1108,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) * seeks. so allow a little bit of time for him to submit a new rq */ sl = cfqd->cfq_slice_idle; - if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic)) + if (sample_valid(cfqq->seek_samples) && CFQQ_SEEKY(cfqq)) sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT)); mod_timer(&cfqd->idle_slice_timer, jiffies + sl); @@ -1947,33 +1945,33 @@ cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic) } static void -cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic, +cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct request *rq) { sector_t sdist; u64 total; - if (!cic->last_request_pos) + if (!cfqq->last_request_pos) sdist = 0; - else if (cic->last_request_pos < blk_rq_pos(rq)) - sdist = blk_rq_pos(rq) - cic->last_request_pos; + else if (cfqq->last_request_pos < blk_rq_pos(rq)) + sdist = blk_rq_pos(rq) - cfqq->last_request_pos; else - sdist = cic->last_request_pos - blk_rq_pos(rq); + sdist = cfqq->last_request_pos - blk_rq_pos(rq); /* * Don't allow the seek distance to get too large from the * odd fragment, pagein, etc */ - if (cic->seek_samples <= 60) /* second&third seek */ - sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*1024); + if (cfqq->seek_samples <= 60) /* second&third seek */ + sdist = min(sdist, (cfqq->seek_mean * 4) + 2*1024*1024); else - sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*64); + sdist = min(sdist, (cfqq->seek_mean * 4) + 2*1024*64); - cic->seek_samples = (7*cic->seek_samples + 256) / 8; - cic->seek_total = (7*cic->seek_total + (u64)256*sdist) / 8; - total = cic->seek_total + (cic->seek_samples/2); - do_div(total, cic->seek_samples); - cic->seek_mean = (sector_t)total; + cfqq->seek_samples = (7*cfqq->seek_samples + 256) / 8; + cfqq->seek_total = (7*cfqq->seek_total + (u64)256*sdist) / 8; + total = cfqq->seek_total + (cfqq->seek_samples/2); + do_div(total, cfqq->seek_samples); + cfqq->seek_mean = (sector_t)total; } /* @@ -1995,11 +1993,11 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, enable_idle = old_idle = cfq_cfqq_idle_window(cfqq); if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || - (!cfqd->cfq_latency && cfqd->hw_tag && CIC_SEEKY(cic))) + (!cfqd->cfq_latency && cfqd->hw_tag && CFQQ_SEEKY(cfqq))) enable_idle = 0; else if (sample_valid(cic->ttime_samples)) { unsigned int slice_idle = cfqd->cfq_slice_idle; - if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic)) + if (sample_valid(cfqq->seek_samples) && CFQQ_SEEKY(cfqq)) slice_idle = msecs_to_jiffies(CFQ_MIN_TT); if (cic->ttime_mean > slice_idle) enable_idle = 0; @@ -2066,7 
+2064,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, * if this request is as-good as one we would expect from the * current cfqq, let it preempt */ - if (cfq_rq_close(cfqd, rq)) + if (cfq_rq_close(cfqd, cfqq, rq)) return true; return false; @@ -2108,10 +2106,10 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqq->meta_pending++; cfq_update_io_thinktime(cfqd, cic); - cfq_update_io_seektime(cfqd, cic, rq); + cfq_update_io_seektime(cfqd, cfqq, rq); cfq_update_idle_window(cfqd, cfqq, cic); - cic->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); + cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); if (cfqq == cfqd->active_queue) { /* -- cgit v1.2.2 From df5fe3e8e13883f58dc97489076bbcc150789a21 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Fri, 23 Oct 2009 17:14:50 -0400 Subject: cfq: merge cooperating cfq_queues When cooperating cfq_queues are detected currently, they are allowed to skip ahead in the scheduling order. It is much more efficient to automatically share the cfq_queue data structure between cooperating processes. Performance of the read-test2 benchmark (which is written to emulate the dump(8) utility) went from 12MB/s to 90MB/s on my SATA disk. NFS servers with multiple nfsd threads also saw performance increases. Signed-off-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 87 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 78cc8ee5da41..f0994aedb390 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -118,6 +118,8 @@ struct cfq_queue { sector_t last_request_pos; pid_t pid; + + struct cfq_queue *new_cfqq; }; /* @@ -1047,6 +1049,12 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, if (!cfqq) return NULL; + /* + * It only makes sense to merge sync queues. + */ + if (!cfq_cfqq_sync(cfqq)) + return NULL; + if (cfq_cfqq_coop(cfqq)) return NULL; @@ -1167,6 +1175,43 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio)); } +/* + * Must be called with the queue_lock held. + */ +static int cfqq_process_refs(struct cfq_queue *cfqq) +{ + int process_refs, io_refs; + + io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE]; + process_refs = atomic_read(&cfqq->ref) - io_refs; + BUG_ON(process_refs < 0); + return process_refs; +} + +static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) +{ + int process_refs; + struct cfq_queue *__cfqq; + + /* Avoid a circular list and skip interim queue merges */ + while ((__cfqq = new_cfqq->new_cfqq)) { + if (__cfqq == cfqq) + return; + new_cfqq = __cfqq; + } + + process_refs = cfqq_process_refs(cfqq); + /* + * If the process for the cfqq has gone away, there is no + * sense in merging the queues. + */ + if (process_refs == 0) + return; + + cfqq->new_cfqq = new_cfqq; + atomic_add(process_refs, &new_cfqq->ref); +} + /* * Select a queue for service. If we have a current active queue, * check whether to continue servicing it, or retrieve and set a new one. @@ -1196,11 +1241,14 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) * If another queue has a request waiting within our mean seek * distance, let it run. The expire code will check for close * cooperators and put the close queue at the front of the service - * tree. + * tree. If possible, merge the expiring queue with the new cfqq. 
*/ new_cfqq = cfq_close_cooperator(cfqd, cfqq, 0); - if (new_cfqq) + if (new_cfqq) { + if (!cfqq->new_cfqq) + cfq_setup_merge(cfqq, new_cfqq); goto expire; + } /* * No requests pending. If the active queue still has requests in @@ -1511,11 +1559,29 @@ static void cfq_free_io_context(struct io_context *ioc) static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) { + struct cfq_queue *__cfqq, *next; + if (unlikely(cfqq == cfqd->active_queue)) { __cfq_slice_expired(cfqd, cfqq, 0); cfq_schedule_dispatch(cfqd); } + /* + * If this queue was scheduled to merge with another queue, be + * sure to drop the reference taken on that queue (and others in + * the merge chain). See cfq_setup_merge and cfq_merge_cfqqs. + */ + __cfqq = cfqq->new_cfqq; + while (__cfqq) { + if (__cfqq == cfqq) { + WARN(1, "cfqq->new_cfqq loop detected\n"); + break; + } + next = __cfqq->new_cfqq; + cfq_put_queue(__cfqq); + __cfqq = next; + } + cfq_put_queue(cfqq); } @@ -2323,6 +2389,16 @@ static void cfq_put_request(struct request *rq) } } +static struct cfq_queue * +cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic, + struct cfq_queue *cfqq) +{ + cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq); + cic_set_cfqq(cic, cfqq->new_cfqq, 1); + cfq_put_queue(cfqq); + return cic_to_cfqq(cic, 1); +} + /* * Allocate cfq data structures associated with this request. */ @@ -2349,6 +2425,15 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) if (!cfqq || cfqq == &cfqd->oom_cfqq) { cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask); cic_set_cfqq(cic, cfqq, is_sync); + } else { + /* + * Check to see if this queue is scheduled to merge with + * another, closely cooperating queue. The merging of + * queues happens here as it must be done in process context. + * The reference on new_cfqq was taken in merge_cfqqs. + */ + if (cfqq->new_cfqq) + cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq); } cfqq->allocated[rw]++; -- cgit v1.2.2 From b3b6d0408c953524f979468562e7e210d8634150 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Fri, 23 Oct 2009 17:14:51 -0400 Subject: cfq: change the meaning of the cfqq_coop flag The flag used to indicate that a cfqq was allowed to jump ahead in the scheduling order due to submitting a request close to the queue that just executed. Since closely cooperating queues are now merged, the flag holds little meaning. Change it to indicate that multiple queues were merged. This will later be used to allow the breaking up of merged queues when they are no longer cooperating. 
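To illustrate the repurposed flag, here is a minimal user-space sketch of the new semantics — a "coop" queue is now simply one that had another queue merged into it. The struct, field names and the simplified reference handling below are illustrative assumptions for the sketch, not the kernel data structures:

/*
 * Illustrative sketch only: models the new meaning of the coop flag
 * ("this cfq_queue is shared") with made-up types, not kernel code.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_queue {
	int  ref;	/* process + I/O references, simplified */
	bool coop;	/* set once another queue is merged into this one */
};

/* merging hands the old queue's references to the survivor and marks it shared */
static void toy_merge(struct toy_queue *old, struct toy_queue *surviving)
{
	surviving->ref += old->ref;
	surviving->coop = true;
}

int main(void)
{
	struct toy_queue a = { .ref = 1, .coop = false };
	struct toy_queue b = { .ref = 1, .coop = false };

	toy_merge(&a, &b);
	printf("shared=%d refs=%d\n", (int)b.coop, b.ref);	/* shared=1 refs=2 */
	return 0;
}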
Signed-off-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index f0994aedb390..5e01a0a92c02 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -202,7 +202,7 @@ enum cfqq_state_flags { CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */ CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ CFQ_CFQQ_FLAG_sync, /* synchronous queue */ - CFQ_CFQQ_FLAG_coop, /* has done a coop jump of the queue */ + CFQ_CFQQ_FLAG_coop, /* cfqq is shared */ }; #define CFQ_CFQQ_FNS(name) \ @@ -950,11 +950,8 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { - if (!cfqq) { + if (!cfqq) cfqq = cfq_get_next_queue(cfqd); - if (cfqq) - cfq_clear_cfqq_coop(cfqq); - } __cfq_set_active_queue(cfqd, cfqq); return cfqq; @@ -1035,8 +1032,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, * assumption. */ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, - struct cfq_queue *cur_cfqq, - bool probe) + struct cfq_queue *cur_cfqq) { struct cfq_queue *cfqq; @@ -1055,11 +1051,6 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, if (!cfq_cfqq_sync(cfqq)) return NULL; - if (cfq_cfqq_coop(cfqq)) - return NULL; - - if (!probe) - cfq_mark_cfqq_coop(cfqq); return cfqq; } @@ -1243,7 +1234,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) * cooperators and put the close queue at the front of the service * tree. If possible, merge the expiring queue with the new cfqq. */ - new_cfqq = cfq_close_cooperator(cfqd, cfqq, 0); + new_cfqq = cfq_close_cooperator(cfqd, cfqq); if (new_cfqq) { if (!cfqq->new_cfqq) cfq_setup_merge(cfqq, new_cfqq); @@ -2294,7 +2285,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) */ if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) cfq_slice_expired(cfqd, 1); - else if (cfqq_empty && !cfq_close_cooperator(cfqd, cfqq, 1) && + else if (cfqq_empty && !cfq_close_cooperator(cfqd, cfqq) && sync && !rq_noidle(rq)) cfq_arm_slice_timer(cfqd); } @@ -2395,6 +2386,7 @@ cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic, { cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq); cic_set_cfqq(cic, cfqq->new_cfqq, 1); + cfq_mark_cfqq_coop(cfqq->new_cfqq); cfq_put_queue(cfqq); return cic_to_cfqq(cic, 1); } -- cgit v1.2.2 From e6c5bc737ab71e4af6025ef7d150f5a26ae5f146 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Fri, 23 Oct 2009 17:14:52 -0400 Subject: cfq: break apart merged cfqqs if they stop cooperating cfq_queues are merged if they are issuing requests within the mean seek distance of one another. This patch detects when the coopearting stops and breaks the queues back up. Signed-off-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 76 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 5e01a0a92c02..47d6aaca0c51 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -38,6 +38,12 @@ static int cfq_slice_idle = HZ / 125; */ #define CFQ_MIN_TT (2) +/* + * Allow merged cfqqs to perform this amount of seeky I/O before + * deciding to break the queues up again. 
+ */ +#define CFQQ_COOP_TOUT (HZ) + #define CFQ_SLICE_SCALE (5) #define CFQ_HW_QUEUE_MIN (5) @@ -116,6 +122,7 @@ struct cfq_queue { u64 seek_total; sector_t seek_mean; sector_t last_request_pos; + unsigned long seeky_start; pid_t pid; @@ -1036,6 +1043,11 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, { struct cfq_queue *cfqq; + if (!cfq_cfqq_sync(cur_cfqq)) + return NULL; + if (CFQQ_SEEKY(cur_cfqq)) + return NULL; + /* * We should notice if some of the queues are cooperating, eg * working closely on the same area of the disk. In that case, @@ -1050,6 +1062,8 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, */ if (!cfq_cfqq_sync(cfqq)) return NULL; + if (CFQQ_SEEKY(cfqq)) + return NULL; return cfqq; } @@ -1181,7 +1195,7 @@ static int cfqq_process_refs(struct cfq_queue *cfqq) static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) { - int process_refs; + int process_refs, new_process_refs; struct cfq_queue *__cfqq; /* Avoid a circular list and skip interim queue merges */ @@ -1199,8 +1213,17 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) if (process_refs == 0) return; - cfqq->new_cfqq = new_cfqq; - atomic_add(process_refs, &new_cfqq->ref); + /* + * Merge in the direction of the lesser amount of work. + */ + new_process_refs = cfqq_process_refs(new_cfqq); + if (new_process_refs >= process_refs) { + cfqq->new_cfqq = new_cfqq; + atomic_add(process_refs, &new_cfqq->ref); + } else { + new_cfqq->new_cfqq = cfqq; + atomic_add(new_process_refs, &cfqq->ref); + } } /* @@ -2029,6 +2052,19 @@ cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq, total = cfqq->seek_total + (cfqq->seek_samples/2); do_div(total, cfqq->seek_samples); cfqq->seek_mean = (sector_t)total; + + /* + * If this cfqq is shared between multiple processes, check to + * make sure that those processes are still issuing I/Os within + * the mean seek distance. If not, it may be time to break the + * queues apart again. + */ + if (cfq_cfqq_coop(cfqq)) { + if (CFQQ_SEEKY(cfqq) && !cfqq->seeky_start) + cfqq->seeky_start = jiffies; + else if (!CFQQ_SEEKY(cfqq)) + cfqq->seeky_start = 0; + } } /* @@ -2391,6 +2427,32 @@ cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic, return cic_to_cfqq(cic, 1); } +static int should_split_cfqq(struct cfq_queue *cfqq) +{ + if (cfqq->seeky_start && + time_after(jiffies, cfqq->seeky_start + CFQQ_COOP_TOUT)) + return 1; + return 0; +} + +/* + * Returns NULL if a new cfqq should be allocated, or the old cfqq if this + * was the last process referring to said cfqq. + */ +static struct cfq_queue * +split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq) +{ + if (cfqq_process_refs(cfqq) == 1) { + cfqq->seeky_start = 0; + cfqq->pid = current->pid; + cfq_clear_cfqq_coop(cfqq); + return cfqq; + } + + cic_set_cfqq(cic, NULL, 1); + cfq_put_queue(cfqq); + return NULL; +} /* * Allocate cfq data structures associated with this request. */ @@ -2413,11 +2475,22 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) if (!cic) goto queue_fail; +new_queue: cfqq = cic_to_cfqq(cic, is_sync); if (!cfqq || cfqq == &cfqd->oom_cfqq) { cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask); cic_set_cfqq(cic, cfqq, is_sync); } else { + /* + * If the queue was seeky for too long, break it apart. 
+ */ + if (cfq_cfqq_coop(cfqq) && should_split_cfqq(cfqq)) { + cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq"); + cfqq = split_cfqq(cic, cfqq); + if (!cfqq) + goto new_queue; + } + /* * Check to see if this queue is scheduled to merge with * another, closely cooperating queue. The merging of -- cgit v1.2.2 From 1a1238a7dd48e48b3bba8f426a1d61c22c80d6d1 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 27 Oct 2009 08:46:23 +0100 Subject: cfq-iosched: improve hw_tag detection If active queue hasn't enough requests and idle window opens, cfq will not dispatch sufficient requests to hardware. In such situation, current code will zero hw_tag. But this is because cfq doesn't dispatch enough requests instead of hardware queue doesn't work. Don't zero hw_tag in such case. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 47d6aaca0c51..418da9a49bb0 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2257,6 +2257,8 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq) */ static void cfq_update_hw_tag(struct cfq_data *cfqd) { + struct cfq_queue *cfqq = cfqd->active_queue; + if (rq_in_driver(cfqd) > cfqd->rq_in_driver_peak) cfqd->rq_in_driver_peak = rq_in_driver(cfqd); @@ -2264,6 +2266,16 @@ static void cfq_update_hw_tag(struct cfq_data *cfqd) rq_in_driver(cfqd) <= CFQ_HW_QUEUE_MIN) return; + /* + * If active queue hasn't enough requests and can idle, cfq might not + * dispatch sufficient requests to hardware. Don't zero hw_tag in this + * case + */ + if (cfqq && cfq_cfqq_idle_window(cfqq) && + cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] < + CFQ_HW_QUEUE_MIN && rq_in_driver(cfqd) < CFQ_HW_QUEUE_MIN) + return; + if (cfqd->hw_tag_samples++ < 50) return; -- cgit v1.2.2 From 5db5d64277bf390056b1a87d0bb288c8b8553f96 Mon Sep 17 00:00:00 2001 From: Corrado Zoccolo Date: Mon, 26 Oct 2009 22:44:04 +0100 Subject: cfq-iosched: adapt slice to number of processes doing I/O When the number of processes performing I/O concurrently increases, a fixed time slice per process will cause large latencies. This patch, if low_latency mode is enabled, will scale the time slice assigned to each process according to a 300ms target latency. In order to keep fairness among processes: * The number of active processes is computed using a special form of running average, that quickly follows sudden increases (to keep latency low), and decrease slowly (to have fairness in spite of rapid decreases of this value). To safeguard sequential bandwidth, we impose a minimum time slice (computed using 2*cfq_slice_idle as base, adjusted according to priority and async-ness). 
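As a rough worked example of the slice scaling described above, the following stand-alone sketch applies the same formula with the default tunables, assuming HZ=1000; the constants and helper names are illustrative for the sketch, not the kernel implementation:

/*
 * Stand-alone sketch of the slice scaling rule described above, assuming
 * HZ = 1000 and the default CFQ tunables; names and constants here are
 * illustrative, not the kernel code.
 */
#include <stdio.h>

#define HZ              1000
#define TARGET_LATENCY  (HZ * 3 / 10)   /* 300 ms */
#define SLICE_SYNC      (HZ / 10)       /* 100 ms base sync slice */
#define SLICE_IDLE      (HZ / 125)      /* 8 ms */

static unsigned min_u(unsigned a, unsigned b) { return a < b ? a : b; }
static unsigned max_u(unsigned a, unsigned b) { return a > b ? a : b; }

/* iq: averaged number of busy queues in the same priority class */
static unsigned scaled_slice(unsigned slice, unsigned iq)
{
	unsigned expect_latency = SLICE_SYNC * iq;

	if (expect_latency > TARGET_LATENCY) {
		/* minimum slice derived from 2 * slice_idle, scaled like the slice */
		unsigned base_low = 2 * SLICE_IDLE;
		unsigned low = min_u(slice, base_low * slice / SLICE_SYNC);

		slice = max_u(slice * TARGET_LATENCY / expect_latency, low);
	}
	return slice;
}

int main(void)
{
	/* prints "100 30": one queue keeps the full slice, ten queues are
	 * squeezed toward 300 ms / 10 = 30 ms each, never below the floor */
	printf("%u %u\n", scaled_slice(SLICE_SYNC, 1), scaled_slice(SLICE_SYNC, 10));
	return 0;
}

With a single sync queue the full 100 ms slice is kept; with ten concurrent queues each slice shrinks toward 300 ms / 10 = 30 ms, bounded below by the floor derived from 2 * cfq_slice_idle.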
Signed-off-by: Corrado Zoccolo Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 51 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 418da9a49bb0..97d946585bc3 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -27,6 +27,8 @@ static const int cfq_slice_sync = HZ / 10; static int cfq_slice_async = HZ / 25; static const int cfq_slice_async_rq = 2; static int cfq_slice_idle = HZ / 125; +static const int cfq_target_latency = HZ * 3/10; /* 300 ms */ +static const int cfq_hist_divisor = 4; /* * offset from end of service tree @@ -148,6 +150,8 @@ struct cfq_data { struct rb_root prio_trees[CFQ_PRIO_LISTS]; unsigned int busy_queues; + unsigned int busy_rt_queues; + unsigned int busy_queues_avg[2]; int rq_in_driver[2]; int sync_flight; @@ -315,10 +319,52 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); } +/* + * get averaged number of queues of RT/BE priority. + * average is updated, with a formula that gives more weight to higher numbers, + * to quickly follows sudden increases and decrease slowly + */ + +static inline unsigned +cfq_get_avg_queues(struct cfq_data *cfqd, bool rt) { + unsigned min_q, max_q; + unsigned mult = cfq_hist_divisor - 1; + unsigned round = cfq_hist_divisor / 2; + unsigned busy = cfqd->busy_rt_queues; + + if (!rt) + busy = cfqd->busy_queues - cfqd->busy_rt_queues; + + min_q = min(cfqd->busy_queues_avg[rt], busy); + max_q = max(cfqd->busy_queues_avg[rt], busy); + cfqd->busy_queues_avg[rt] = (mult * max_q + min_q + round) / + cfq_hist_divisor; + return cfqd->busy_queues_avg[rt]; +} + static inline void cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) { - cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies; + unsigned slice = cfq_prio_to_slice(cfqd, cfqq); + if (cfqd->cfq_latency) { + /* interested queues (we consider only the ones with the same + * priority class) */ + unsigned iq = cfq_get_avg_queues(cfqd, cfq_class_rt(cfqq)); + unsigned sync_slice = cfqd->cfq_slice[1]; + unsigned expect_latency = sync_slice * iq; + if (expect_latency > cfq_target_latency) { + unsigned base_low_slice = 2 * cfqd->cfq_slice_idle; + /* scale low_slice according to IO priority + * and sync vs async */ + unsigned low_slice = + min(slice, base_low_slice * slice / sync_slice); + /* the adapted slice value is scaled to fit all iqs + * into the target latency */ + slice = max(slice * cfq_target_latency / expect_latency, + low_slice); + } + } + cfqq->slice_end = jiffies + slice; cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies); } @@ -669,7 +715,8 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(cfq_cfqq_on_rr(cfqq)); cfq_mark_cfqq_on_rr(cfqq); cfqd->busy_queues++; - + if (cfq_class_rt(cfqq)) + cfqd->busy_rt_queues++; cfq_resort_rr_list(cfqd, cfqq); } @@ -692,6 +739,8 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(!cfqd->busy_queues); cfqd->busy_queues--; + if (cfq_class_rt(cfqq)) + cfqd->busy_rt_queues--; } /* -- cgit v1.2.2 From aa6f6a3de18131348f70951efb2c56d806033e09 Mon Sep 17 00:00:00 2001 From: Corrado Zoccolo Date: Mon, 26 Oct 2009 22:44:33 +0100 Subject: cfq-iosched: preparation to handle multiple service trees We embed a pointer to the service tree in each queue, to handle multiple service trees easily. Service trees are enriched with a counter. 
cfq_add_rq_rb is invoked after putting the rq in the fifo, to ensure that all fields in rq are properly initialized. Signed-off-by: Corrado Zoccolo Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 97d946585bc3..c95c69e199f4 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -75,8 +75,9 @@ static DEFINE_SPINLOCK(ioc_gone_lock); struct cfq_rb_root { struct rb_root rb; struct rb_node *left; + unsigned count; }; -#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, } +#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, 0, } /* * Per process-grouping structure @@ -128,6 +129,7 @@ struct cfq_queue { pid_t pid; + struct cfq_rb_root *service_tree; struct cfq_queue *new_cfqq; }; @@ -503,6 +505,7 @@ static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root) if (root->left == n) root->left = NULL; rb_erase_init(n, &root->rb); + --root->count; } /* @@ -553,11 +556,12 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct rb_node **p, *parent; struct cfq_queue *__cfqq; unsigned long rb_key; + struct cfq_rb_root *service_tree = &cfqd->service_tree; int left; if (cfq_class_idle(cfqq)) { rb_key = CFQ_IDLE_DELAY; - parent = rb_last(&cfqd->service_tree.rb); + parent = rb_last(&service_tree->rb); if (parent && parent != &cfqq->rb_node) { __cfqq = rb_entry(parent, struct cfq_queue, rb_node); rb_key += __cfqq->rb_key; @@ -575,7 +579,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqq->slice_resid = 0; } else { rb_key = -HZ; - __cfqq = cfq_rb_first(&cfqd->service_tree); + __cfqq = cfq_rb_first(service_tree); rb_key += __cfqq ? 
__cfqq->rb_key : jiffies; } @@ -586,12 +590,14 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (rb_key == cfqq->rb_key) return; - cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); + cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); + cfqq->service_tree = NULL; } left = 1; parent = NULL; - p = &cfqd->service_tree.rb.rb_node; + cfqq->service_tree = service_tree; + p = &service_tree->rb.rb_node; while (*p) { struct rb_node **n; @@ -623,11 +629,12 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, } if (left) - cfqd->service_tree.left = &cfqq->rb_node; + service_tree->left = &cfqq->rb_node; cfqq->rb_key = rb_key; rb_link_node(&cfqq->rb_node, parent, p); - rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb); + rb_insert_color(&cfqq->rb_node, &service_tree->rb); + service_tree->count++; } static struct cfq_queue * @@ -730,8 +737,10 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(!cfq_cfqq_on_rr(cfqq)); cfq_clear_cfqq_on_rr(cfqq); - if (!RB_EMPTY_NODE(&cfqq->rb_node)) - cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); + if (!RB_EMPTY_NODE(&cfqq->rb_node)) { + cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); + cfqq->service_tree = NULL; + } if (cfqq->p_root) { rb_erase(&cfqq->p_node, cfqq->p_root); cfqq->p_root = NULL; @@ -2292,10 +2301,9 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq) cfq_log_cfqq(cfqd, cfqq, "insert_request"); cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc); - cfq_add_rq_rb(rq); - rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); list_add_tail(&rq->queuelist, &cfqq->fifo); + cfq_add_rq_rb(rq); cfq_rq_enqueued(cfqd, cfqq, rq); } -- cgit v1.2.2 From c0324a020e5b351f100569b128715985f1023af8 Mon Sep 17 00:00:00 2001 From: Corrado Zoccolo Date: Tue, 27 Oct 2009 19:16:03 +0100 Subject: cfq-iosched: reimplement priorities using different service trees We use different service trees for different priority classes. This allows a simplification in the service tree insertion code, that no longer has to consider priority while walking the tree. Signed-off-by: Corrado Zoccolo Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 116 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 82 insertions(+), 34 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index c95c69e199f4..6e5c3d715ebe 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -133,6 +133,16 @@ struct cfq_queue { struct cfq_queue *new_cfqq; }; +/* + * Index in the service_trees. + * IDLE is handled separately, so it has negative index + */ +enum wl_prio_t { + IDLE_WORKLOAD = -1, + BE_WORKLOAD = 0, + RT_WORKLOAD = 1 +}; + /* * Per block device queue structure */ @@ -140,9 +150,15 @@ struct cfq_data { struct request_queue *queue; /* - * rr list of queues with requests and the count of them + * rr lists of queues with requests, onle rr for each priority class. + * Counts are embedded in the cfq_rb_root + */ + struct cfq_rb_root service_trees[2]; + struct cfq_rb_root service_tree_idle; + /* + * The priority currently being served */ - struct cfq_rb_root service_tree; + enum wl_prio_t serving_prio; /* * Each priority tree is sorted by next_request position. 
These @@ -152,7 +168,6 @@ struct cfq_data { struct rb_root prio_trees[CFQ_PRIO_LISTS]; unsigned int busy_queues; - unsigned int busy_rt_queues; unsigned int busy_queues_avg[2]; int rq_in_driver[2]; @@ -205,6 +220,15 @@ struct cfq_data { unsigned long last_end_sync_rq; }; +static struct cfq_rb_root *service_tree_for(enum wl_prio_t prio, + struct cfq_data *cfqd) +{ + if (prio == IDLE_WORKLOAD) + return &cfqd->service_tree_idle; + + return &cfqd->service_trees[prio]; +} + enum cfqq_state_flags { CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */ CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */ @@ -249,6 +273,23 @@ CFQ_CFQQ_FNS(coop); #define cfq_log(cfqd, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) +static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq) +{ + if (cfq_class_idle(cfqq)) + return IDLE_WORKLOAD; + if (cfq_class_rt(cfqq)) + return RT_WORKLOAD; + return BE_WORKLOAD; +} + +static inline int cfq_busy_queues_wl(enum wl_prio_t wl, struct cfq_data *cfqd) +{ + if (wl == IDLE_WORKLOAD) + return cfqd->service_tree_idle.count; + + return cfqd->service_trees[wl].count; +} + static void cfq_dispatch_insert(struct request_queue *, struct request *); static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool, struct io_context *, gfp_t); @@ -332,10 +373,7 @@ cfq_get_avg_queues(struct cfq_data *cfqd, bool rt) { unsigned min_q, max_q; unsigned mult = cfq_hist_divisor - 1; unsigned round = cfq_hist_divisor / 2; - unsigned busy = cfqd->busy_rt_queues; - - if (!rt) - busy = cfqd->busy_queues - cfqd->busy_rt_queues; + unsigned busy = cfq_busy_queues_wl(rt, cfqd); min_q = min(cfqd->busy_queues_avg[rt], busy); max_q = max(cfqd->busy_queues_avg[rt], busy); @@ -546,7 +584,7 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd, } /* - * The cfqd->service_tree holds all pending cfq_queue's that have + * The cfqd->service_trees holds all pending cfq_queue's that have * requests waiting to be processed. It is sorted in the order that * we will service the queues. */ @@ -556,9 +594,10 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct rb_node **p, *parent; struct cfq_queue *__cfqq; unsigned long rb_key; - struct cfq_rb_root *service_tree = &cfqd->service_tree; + struct cfq_rb_root *service_tree; int left; + service_tree = service_tree_for(cfqq_prio(cfqq), cfqd); if (cfq_class_idle(cfqq)) { rb_key = CFQ_IDLE_DELAY; parent = rb_last(&service_tree->rb); @@ -587,7 +626,8 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, /* * same position, nothing more to do */ - if (rb_key == cfqq->rb_key) + if (rb_key == cfqq->rb_key && + cfqq->service_tree == service_tree) return; cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); @@ -605,25 +645,14 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, __cfqq = rb_entry(parent, struct cfq_queue, rb_node); /* - * sort RT queues first, we always want to give - * preference to them. IDLE queues goes to the back. - * after that, sort on the next service time. + * sort by key, that represents service time. 
*/ - if (cfq_class_rt(cfqq) > cfq_class_rt(__cfqq)) + if (time_before(rb_key, __cfqq->rb_key)) n = &(*p)->rb_left; - else if (cfq_class_rt(cfqq) < cfq_class_rt(__cfqq)) - n = &(*p)->rb_right; - else if (cfq_class_idle(cfqq) < cfq_class_idle(__cfqq)) - n = &(*p)->rb_left; - else if (cfq_class_idle(cfqq) > cfq_class_idle(__cfqq)) - n = &(*p)->rb_right; - else if (time_before(rb_key, __cfqq->rb_key)) - n = &(*p)->rb_left; - else + else { n = &(*p)->rb_right; - - if (n == &(*p)->rb_right) left = 0; + } p = n; } @@ -722,8 +751,7 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(cfq_cfqq_on_rr(cfqq)); cfq_mark_cfqq_on_rr(cfqq); cfqd->busy_queues++; - if (cfq_class_rt(cfqq)) - cfqd->busy_rt_queues++; + cfq_resort_rr_list(cfqd, cfqq); } @@ -748,8 +776,6 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(!cfqd->busy_queues); cfqd->busy_queues--; - if (cfq_class_rt(cfqq)) - cfqd->busy_rt_queues--; } /* @@ -1003,10 +1029,12 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out) */ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) { - if (RB_EMPTY_ROOT(&cfqd->service_tree.rb)) - return NULL; + struct cfq_rb_root *service_tree = + service_tree_for(cfqd->serving_prio, cfqd); - return cfq_rb_first(&cfqd->service_tree); + if (RB_EMPTY_ROOT(&service_tree->rb)) + return NULL; + return cfq_rb_first(service_tree); } /* @@ -1123,6 +1151,12 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, if (CFQQ_SEEKY(cfqq)) return NULL; + /* + * Do not merge queues of different priority classes + */ + if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq)) + return NULL; + return cfqq; } @@ -1336,6 +1370,14 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) expire: cfq_slice_expired(cfqd, 0); new_queue: + if (!new_cfqq) { + if (cfq_busy_queues_wl(RT_WORKLOAD, cfqd)) + cfqd->serving_prio = RT_WORKLOAD; + else if (cfq_busy_queues_wl(BE_WORKLOAD, cfqd)) + cfqd->serving_prio = BE_WORKLOAD; + else + cfqd->serving_prio = IDLE_WORKLOAD; + } cfqq = cfq_set_active_queue(cfqd, new_cfqq); keep_queue: return cfqq; @@ -1362,8 +1404,12 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd) { struct cfq_queue *cfqq; int dispatched = 0; + int i; + for (i = 0; i < 2; ++i) + while ((cfqq = cfq_rb_first(&cfqd->service_trees[i])) != NULL) + dispatched += __cfq_forced_dispatch_cfqq(cfqq); - while ((cfqq = cfq_rb_first(&cfqd->service_tree)) != NULL) + while ((cfqq = cfq_rb_first(&cfqd->service_tree_idle)) != NULL) dispatched += __cfq_forced_dispatch_cfqq(cfqq); cfq_slice_expired(cfqd, 0); @@ -2710,7 +2756,9 @@ static void *cfq_init_queue(struct request_queue *q) if (!cfqd) return NULL; - cfqd->service_tree = CFQ_RB_ROOT; + for (i = 0; i < 2; ++i) + cfqd->service_trees[i] = CFQ_RB_ROOT; + cfqd->service_tree_idle = CFQ_RB_ROOT; /* * Not strictly needed (since RB_ROOT just clears the node and we -- cgit v1.2.2 From a6d44e982d3734583b3b4e1d36921af8cfd61fc0 Mon Sep 17 00:00:00 2001 From: Corrado Zoccolo Date: Mon, 26 Oct 2009 22:45:11 +0100 Subject: cfq-iosched: enable idling for last queue on priority class cfq can disable idling for queues in various circumstances. When workloads of different priorities are competing, if the higher priority queue has idling disabled, lower priority queues may steal its disk share. 
For example, in a scenario with an RT process performing seeky reads vs a BE process performing sequential reads, on an NCQ enabled hardware, with low_latency unset, the RT process will dispatch only the few pending requests every full slice of service for the BE process. The patch solves this issue by always performing idle on the last queue at a given priority class > idle. If the same process, or one that can pre-empt it (so at the same priority or higher), submits a new request within the idle window, the lower priority queue won't dispatch, saving the disk bandwidth for higher priority ones. Note: this doesn't touch the non_rotational + NCQ case (no hardware to test if this is a benefit in that case). Signed-off-by: Corrado Zoccolo Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 6e5c3d715ebe..76afa3696894 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1160,6 +1160,34 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, return cfqq; } +/* + * Determine whether we should enforce idle window for this queue. + */ + +static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + enum wl_prio_t prio = cfqq_prio(cfqq); + struct cfq_rb_root *service_tree; + + /* We never do for idle class queues. */ + if (prio == IDLE_WORKLOAD) + return false; + + /* We do for queues that were marked with idle window flag. */ + if (cfq_cfqq_idle_window(cfqq)) + return true; + + /* + * Otherwise, we do only if they are the last ones + * in their service tree. + */ + service_tree = service_tree_for(prio, cfqd); + if (service_tree->count == 0) + return true; + + return (service_tree->count == 1 && cfq_rb_first(service_tree) == cfqq); +} + static void cfq_arm_slice_timer(struct cfq_data *cfqd) { struct cfq_queue *cfqq = cfqd->active_queue; @@ -1180,7 +1208,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) /* * idle is disabled, either manually or by past process history */ - if (!cfqd->cfq_slice_idle || !cfq_cfqq_idle_window(cfqq)) + if (!cfqd->cfq_slice_idle || !cfq_should_idle(cfqd, cfqq)) return; /* @@ -1362,7 +1390,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) * conditions to happen (or time out) before selecting a new queue. */ if (timer_pending(&cfqd->idle_slice_timer) || - (cfqq->dispatched && cfq_cfqq_idle_window(cfqq))) { + (cfqq->dispatched && cfq_should_idle(cfqd, cfqq))) { cfqq = NULL; goto keep_queue; } @@ -1427,7 +1455,7 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) /* * Drain async requests before we start sync IO */ - if (cfq_cfqq_idle_window(cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC]) + if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC]) return false; /* -- cgit v1.2.2 From 718eee0579b802aabe3bafacf09d0a9b0830f1dd Mon Sep 17 00:00:00 2001 From: Corrado Zoccolo Date: Mon, 26 Oct 2009 22:45:29 +0100 Subject: cfq-iosched: fairness for sync no-idle queues Currently no-idle queues in cfq are not serviced fairly: even if they can only dispatch a small number of requests at a time, they have to compete with idling queues to be serviced, experiencing large latencies. We should notice, instead, that no-idle queues are the ones that would benefit most from having low latency, in fact they are any of: * processes with large think times (e.g. interactive ones like file managers) * seeky (e.g. 
programs faulting in their code at startup) * or marked as no-idle from upper levels, to improve latencies of those requests. This patch improves the fairness and latency for those queues, by: * separating sync idle, sync no-idle and async queues in separate service_trees, for each priority * service all no-idle queues together * and idling when the last no-idle queue has been serviced, to anticipate for more no-idle work * the timeslices allotted for idle and no-idle service_trees are computed proportionally to the number of processes in each set. Servicing all no-idle queues together should have a performance boost for NCQ-capable drives, without compromising fairness. Signed-off-by: Corrado Zoccolo Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 200 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 168 insertions(+), 32 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 76afa3696894..859f534ae9ef 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -134,7 +134,7 @@ struct cfq_queue { }; /* - * Index in the service_trees. + * First index in the service_trees. * IDLE is handled separately, so it has negative index */ enum wl_prio_t { @@ -143,6 +143,16 @@ enum wl_prio_t { RT_WORKLOAD = 1 }; +/* + * Second index in the service_trees. + */ +enum wl_type_t { + ASYNC_WORKLOAD = 0, + SYNC_NOIDLE_WORKLOAD = 1, + SYNC_WORKLOAD = 2 +}; + + /* * Per block device queue structure */ @@ -153,12 +163,14 @@ struct cfq_data { * rr lists of queues with requests, onle rr for each priority class. * Counts are embedded in the cfq_rb_root */ - struct cfq_rb_root service_trees[2]; + struct cfq_rb_root service_trees[2][3]; struct cfq_rb_root service_tree_idle; /* * The priority currently being served */ enum wl_prio_t serving_prio; + enum wl_type_t serving_type; + unsigned long workload_expires; /* * Each priority tree is sorted by next_request position. 
These @@ -221,12 +233,13 @@ struct cfq_data { }; static struct cfq_rb_root *service_tree_for(enum wl_prio_t prio, + enum wl_type_t type, struct cfq_data *cfqd) { if (prio == IDLE_WORKLOAD) return &cfqd->service_tree_idle; - return &cfqd->service_trees[prio]; + return &cfqd->service_trees[prio][type]; } enum cfqq_state_flags { @@ -282,12 +295,24 @@ static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq) return BE_WORKLOAD; } + +static enum wl_type_t cfqq_type(struct cfq_queue *cfqq) +{ + if (!cfq_cfqq_sync(cfqq)) + return ASYNC_WORKLOAD; + if (!cfq_cfqq_idle_window(cfqq)) + return SYNC_NOIDLE_WORKLOAD; + return SYNC_WORKLOAD; +} + static inline int cfq_busy_queues_wl(enum wl_prio_t wl, struct cfq_data *cfqd) { if (wl == IDLE_WORKLOAD) return cfqd->service_tree_idle.count; - return cfqd->service_trees[wl].count; + return cfqd->service_trees[wl][ASYNC_WORKLOAD].count + + cfqd->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count + + cfqd->service_trees[wl][SYNC_WORKLOAD].count; } static void cfq_dispatch_insert(struct request_queue *, struct request *); @@ -597,7 +622,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct cfq_rb_root *service_tree; int left; - service_tree = service_tree_for(cfqq_prio(cfqq), cfqd); + service_tree = service_tree_for(cfqq_prio(cfqq), cfqq_type(cfqq), cfqd); if (cfq_class_idle(cfqq)) { rb_key = CFQ_IDLE_DELAY; parent = rb_last(&service_tree->rb); @@ -1030,7 +1055,7 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out) static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) { struct cfq_rb_root *service_tree = - service_tree_for(cfqd->serving_prio, cfqd); + service_tree_for(cfqd->serving_prio, cfqd->serving_type, cfqd); if (RB_EMPTY_ROOT(&service_tree->rb)) return NULL; @@ -1167,7 +1192,7 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) { enum wl_prio_t prio = cfqq_prio(cfqq); - struct cfq_rb_root *service_tree; + struct cfq_rb_root *service_tree = cfqq->service_tree; /* We never do for idle class queues. */ if (prio == IDLE_WORKLOAD) @@ -1181,7 +1206,9 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) * Otherwise, we do only if they are the last ones * in their service tree. */ - service_tree = service_tree_for(prio, cfqd); + if (!service_tree) + service_tree = service_tree_for(prio, cfqq_type(cfqq), cfqd); + if (service_tree->count == 0) return true; @@ -1235,14 +1262,20 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) cfq_mark_cfqq_wait_request(cfqq); - /* - * we don't want to idle for seeks, but we do want to allow - * fair distribution of slice time for a process doing back-to-back - * seeks. so allow a little bit of time for him to submit a new rq - */ sl = cfqd->cfq_slice_idle; - if (sample_valid(cfqq->seek_samples) && CFQQ_SEEKY(cfqq)) + /* are we servicing noidle tree, and there are more queues? + * non-rotational or NCQ: no idle + * non-NCQ rotational : very small idle, to allow + * fair distribution of slice time for a process doing back-to-back + * seeks. 
+ */ + if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD && + service_tree_for(cfqd->serving_prio, SYNC_NOIDLE_WORKLOAD, cfqd) + ->count > 0) { + if (blk_queue_nonrot(cfqd->queue) || cfqd->hw_tag) + return; sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT)); + } mod_timer(&cfqd->idle_slice_timer, jiffies + sl); cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); @@ -1346,6 +1379,106 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) } } +static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, enum wl_prio_t prio, + bool prio_changed) +{ + struct cfq_queue *queue; + int i; + bool key_valid = false; + unsigned long lowest_key = 0; + enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD; + + if (prio_changed) { + /* + * When priorities switched, we prefer starting + * from SYNC_NOIDLE (first choice), or just SYNC + * over ASYNC + */ + if (service_tree_for(prio, cur_best, cfqd)->count) + return cur_best; + cur_best = SYNC_WORKLOAD; + if (service_tree_for(prio, cur_best, cfqd)->count) + return cur_best; + + return ASYNC_WORKLOAD; + } + + for (i = 0; i < 3; ++i) { + /* otherwise, select the one with lowest rb_key */ + queue = cfq_rb_first(service_tree_for(prio, i, cfqd)); + if (queue && + (!key_valid || time_before(queue->rb_key, lowest_key))) { + lowest_key = queue->rb_key; + cur_best = i; + key_valid = true; + } + } + + return cur_best; +} + +static void choose_service_tree(struct cfq_data *cfqd) +{ + enum wl_prio_t previous_prio = cfqd->serving_prio; + bool prio_changed; + unsigned slice; + unsigned count; + + /* Choose next priority. RT > BE > IDLE */ + if (cfq_busy_queues_wl(RT_WORKLOAD, cfqd)) + cfqd->serving_prio = RT_WORKLOAD; + else if (cfq_busy_queues_wl(BE_WORKLOAD, cfqd)) + cfqd->serving_prio = BE_WORKLOAD; + else { + cfqd->serving_prio = IDLE_WORKLOAD; + cfqd->workload_expires = jiffies + 1; + return; + } + + /* + * For RT and BE, we have to choose also the type + * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload + * expiration time + */ + prio_changed = (cfqd->serving_prio != previous_prio); + count = service_tree_for(cfqd->serving_prio, cfqd->serving_type, cfqd) + ->count; + + /* + * If priority didn't change, check workload expiration, + * and that we still have other queues ready + */ + if (!prio_changed && count && + !time_after(jiffies, cfqd->workload_expires)) + return; + + /* otherwise select new workload type */ + cfqd->serving_type = + cfq_choose_wl(cfqd, cfqd->serving_prio, prio_changed); + count = service_tree_for(cfqd->serving_prio, cfqd->serving_type, cfqd) + ->count; + + /* + * the workload slice is computed as a fraction of target latency + * proportional to the number of queues in that workload, over + * all the queues in the same priority class + */ + slice = cfq_target_latency * count / + max_t(unsigned, cfqd->busy_queues_avg[cfqd->serving_prio], + cfq_busy_queues_wl(cfqd->serving_prio, cfqd)); + + if (cfqd->serving_type == ASYNC_WORKLOAD) + /* async workload slice is scaled down according to + * the sync/async slice ratio. */ + slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1]; + else + /* sync workload slice is at least 2 * cfq_slice_idle */ + slice = max(slice, 2 * cfqd->cfq_slice_idle); + + slice = max_t(unsigned, slice, CFQ_MIN_TT); + cfqd->workload_expires = jiffies + slice; +} + /* * Select a queue for service. If we have a current active queue, * check whether to continue servicing it, or retrieve and set a new one. 
@@ -1398,14 +1531,13 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) expire: cfq_slice_expired(cfqd, 0); new_queue: - if (!new_cfqq) { - if (cfq_busy_queues_wl(RT_WORKLOAD, cfqd)) - cfqd->serving_prio = RT_WORKLOAD; - else if (cfq_busy_queues_wl(BE_WORKLOAD, cfqd)) - cfqd->serving_prio = BE_WORKLOAD; - else - cfqd->serving_prio = IDLE_WORKLOAD; - } + /* + * Current queue expired. Check if we have to switch to a new + * service tree + */ + if (!new_cfqq) + choose_service_tree(cfqd); + cfqq = cfq_set_active_queue(cfqd, new_cfqq); keep_queue: return cfqq; @@ -1432,10 +1564,12 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd) { struct cfq_queue *cfqq; int dispatched = 0; - int i; + int i, j; for (i = 0; i < 2; ++i) - while ((cfqq = cfq_rb_first(&cfqd->service_trees[i])) != NULL) - dispatched += __cfq_forced_dispatch_cfqq(cfqq); + for (j = 0; j < 3; ++j) + while ((cfqq = cfq_rb_first(&cfqd->service_trees[i][j])) + != NULL) + dispatched += __cfq_forced_dispatch_cfqq(cfqq); while ((cfqq = cfq_rb_first(&cfqd->service_tree_idle)) != NULL) dispatched += __cfq_forced_dispatch_cfqq(cfqq); @@ -2218,13 +2352,10 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, enable_idle = old_idle = cfq_cfqq_idle_window(cfqq); if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || - (!cfqd->cfq_latency && cfqd->hw_tag && CFQQ_SEEKY(cfqq))) + (sample_valid(cfqq->seek_samples) && CFQQ_SEEKY(cfqq))) enable_idle = 0; else if (sample_valid(cic->ttime_samples)) { - unsigned int slice_idle = cfqd->cfq_slice_idle; - if (sample_valid(cfqq->seek_samples) && CFQQ_SEEKY(cfqq)) - slice_idle = msecs_to_jiffies(CFQ_MIN_TT); - if (cic->ttime_mean > slice_idle) + if (cic->ttime_mean > cfqd->cfq_slice_idle) enable_idle = 0; else enable_idle = 1; @@ -2262,6 +2393,10 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, if (cfq_class_idle(cfqq)) return true; + if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD + && new_cfqq->service_tree == cfqq->service_tree) + return true; + /* * if the new request is sync, but the currently running queue is * not, let the sync request have priority. @@ -2778,14 +2913,15 @@ static void cfq_exit_queue(struct elevator_queue *e) static void *cfq_init_queue(struct request_queue *q) { struct cfq_data *cfqd; - int i; + int i, j; cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); if (!cfqd) return NULL; for (i = 0; i < 2; ++i) - cfqd->service_trees[i] = CFQ_RB_ROOT; + for (j = 0; j < 3; ++j) + cfqd->service_trees[i][j] = CFQ_RB_ROOT; cfqd->service_tree_idle = CFQ_RB_ROOT; /* -- cgit v1.2.2 From 5869619cb5b26754574375472fe54a390edf34c7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 28 Oct 2009 09:27:07 +0100 Subject: cfq-iosched: fix style issue in cfq_get_avg_queues() Line breaks and bad brace placement. 
Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 859f534ae9ef..aa00d8f2d0b0 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -393,8 +393,8 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) * to quickly follows sudden increases and decrease slowly */ -static inline unsigned -cfq_get_avg_queues(struct cfq_data *cfqd, bool rt) { +static inline unsigned cfq_get_avg_queues(struct cfq_data *cfqd, bool rt) +{ unsigned min_q, max_q; unsigned mult = cfq_hist_divisor - 1; unsigned round = cfq_hist_divisor / 2; -- cgit v1.2.2 From dddb74519aec2081204d203a97578c9fc4e9fb64 Mon Sep 17 00:00:00 2001 From: Corrado Zoccolo Date: Mon, 2 Nov 2009 10:40:37 +0100 Subject: cfq-iosched: simplify prio-unboost code Eliminate redundant checks. Signed-off-by: Corrado Zoccolo Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 418da9a49bb0..757010d8fb7a 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2359,12 +2359,10 @@ static void cfq_prio_boost(struct cfq_queue *cfqq) cfqq->ioprio = IOPRIO_NORM; } else { /* - * check if we need to unboost the queue + * unboost the queue (if needed) */ - if (cfqq->ioprio_class != cfqq->org_ioprio_class) - cfqq->ioprio_class = cfqq->org_ioprio_class; - if (cfqq->ioprio != cfqq->org_ioprio) - cfqq->ioprio = cfqq->org_ioprio; + cfqq->ioprio_class = cfqq->org_ioprio_class; + cfqq->ioprio = cfqq->org_ioprio; } } -- cgit v1.2.2 From 125c4f221a5352ae08aef2898055b879ad963f01 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 3 Nov 2009 21:25:45 +0100 Subject: cfq-iosched: fix merge error We ended up with testing the same condition twice, pretty pointless. Remove that first if. Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 1 - 1 file changed, 1 deletion(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 13b612f9f27a..b700f41cafb3 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2433,7 +2433,6 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, * if this request is as-good as one we would expect from the * current cfqq, let it preempt */ - if (cfq_rq_close(cfqd, cfqq, rq)) if (cfq_rq_close(cfqd, cfqq, rq) && (!cfq_cfqq_coop(new_cfqq) || cfqd->busy_queues == 1)) { /* -- cgit v1.2.2 From e00ef7997195e4f8e10593727a6286e2e2802159 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 4 Nov 2009 08:54:55 +0100 Subject: cfq-iosched: get rid of the coop_preempt flag We need to rework this logic post the cooperating cfq_queue merging, for now just get rid of it and Jeff Moyer will fix the fall out. 
Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index b700f41cafb3..4ab240c875df 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -253,7 +253,6 @@ enum cfqq_state_flags { CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ CFQ_CFQQ_FLAG_sync, /* synchronous queue */ CFQ_CFQQ_FLAG_coop, /* cfqq is shared */ - CFQ_CFQQ_FLAG_coop_preempt, /* coop preempt */ }; #define CFQ_CFQQ_FNS(name) \ @@ -280,7 +279,6 @@ CFQ_CFQQ_FNS(prio_changed); CFQ_CFQQ_FNS(slice_new); CFQ_CFQQ_FNS(sync); CFQ_CFQQ_FNS(coop); -CFQ_CFQQ_FNS(coop_preempt); #undef CFQ_CFQQ_FNS #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ @@ -1070,16 +1068,9 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { - if (!cfqq) { + if (!cfqq) cfqq = cfq_get_next_queue(cfqd); - if (cfqq && !cfq_cfqq_coop_preempt(cfqq)) - cfq_clear_cfqq_coop(cfqq); - } - - if (cfqq) - cfq_clear_cfqq_coop_preempt(cfqq); - __cfq_set_active_queue(cfqd, cfqq); return cfqq; } @@ -2433,16 +2424,8 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, * if this request is as-good as one we would expect from the * current cfqq, let it preempt */ - if (cfq_rq_close(cfqd, cfqq, rq) && (!cfq_cfqq_coop(new_cfqq) || - cfqd->busy_queues == 1)) { - /* - * Mark new queue coop_preempt, so its coop flag will not be - * cleared when new queue gets scheduled at the very first time - */ - cfq_mark_cfqq_coop_preempt(new_cfqq); - cfq_mark_cfqq_coop(new_cfqq); + if (cfq_rq_close(cfqd, cfqq, rq)) return true; - } return false; } -- cgit v1.2.2 From 476d42f138ba82389a92a894d8a630a70d36278f Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Wed, 4 Nov 2009 09:10:33 +0100 Subject: block/scsi_ioctl.c: quiet sparse noise Quiet sparse noise about symbol's not being declared. Symbol blk_default_cmd_filter is only used locally and should be static. The function blk_scsi_ioctl_init() is a fs_initcall and should also be static. Signed-off-by: H Hartley Sweeten Cc: James Bottomley Signed-off-by: Jens Axboe --- block/scsi_ioctl.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index e5b10017a50b..a8b5a10eb5b0 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -35,7 +35,9 @@ struct blk_cmd_filter { unsigned long read_ok[BLK_SCSI_CMD_PER_LONG]; unsigned long write_ok[BLK_SCSI_CMD_PER_LONG]; -} blk_default_cmd_filter; +}; + +static struct blk_cmd_filter blk_default_cmd_filter; /* Command group 3 is reserved and should never be used. 
*/ const unsigned char scsi_command_size_tbl[8] = @@ -675,7 +677,7 @@ int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mod } EXPORT_SYMBOL(scsi_cmd_ioctl); -int __init blk_scsi_ioctl_init(void) +static int __init blk_scsi_ioctl_init(void) { blk_set_cmd_filter_defaults(&blk_default_cmd_filter); return 0; -- cgit v1.2.2 From cf7c25cf91f632a3528669fc0876e1fc8355ff9b Mon Sep 17 00:00:00 2001 From: Corrado Zoccolo Date: Sun, 8 Nov 2009 17:16:46 +0100 Subject: cfq-iosched: fix next_rq computation Cfq has a bug in computation of next_rq, that affects transition between multiple sequential request streams in a single queue (e.g.: two sequential buffered writers of the same priority), causing the alternation between the two streams for a transient period. 8,0 1 18737 0.260400660 5312 D W 141653311 + 256 8,0 1 20839 0.273239461 5400 D W 141653567 + 256 8,0 1 20841 0.276343885 5394 D W 142803919 + 256 8,0 1 20843 0.279490878 5394 D W 141668927 + 256 8,0 1 20845 0.292459993 5400 D W 142804175 + 256 8,0 1 20847 0.295537247 5400 D W 141668671 + 256 8,0 1 20849 0.298656337 5400 D W 142804431 + 256 8,0 1 20851 0.311481148 5394 D W 141668415 + 256 8,0 1 20853 0.314421305 5394 D W 142804687 + 256 8,0 1 20855 0.318960112 5400 D W 142804943 + 256 The fix makes sure that the next_rq is computed from the last dispatched request, and not affected by merging. 8,0 1 37776 4.305161306 0 D W 141738087 + 256 8,0 1 37778 4.308298091 0 D W 141738343 + 256 8,0 1 37780 4.312885190 0 D W 141738599 + 256 8,0 1 37782 4.315933291 0 D W 141738855 + 256 8,0 1 37784 4.319064459 0 D W 141739111 + 256 8,0 1 37786 4.331918431 5672 D W 142803007 + 256 8,0 1 37788 4.334930332 5672 D W 142803263 + 256 8,0 1 37790 4.337902723 5672 D W 142803519 + 256 8,0 1 37792 4.342359774 5672 D W 142803775 + 256 8,0 1 37794 4.345318286 0 D W 142804031 + 256 Signed-off-by: Corrado Zoccolo Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 4ab240c875df..829d87d3e00f 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -454,9 +454,9 @@ static inline bool cfq_slice_used(struct cfq_queue *cfqq) * behind the head is penalized and only allowed to a certain extent. 
*/ static struct request * -cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2) +cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last) { - sector_t last, s1, s2, d1 = 0, d2 = 0; + sector_t s1, s2, d1 = 0, d2 = 0; unsigned long back_max; #define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */ #define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */ @@ -479,8 +479,6 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2) s1 = blk_rq_pos(rq1); s2 = blk_rq_pos(rq2); - last = cfqd->last_position; - /* * by definition, 1KiB is 2 sectors */ @@ -595,7 +593,7 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq, next = rb_entry_rq(rbnext); } - return cfq_choose_req(cfqd, next, prev); + return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last)); } static unsigned long cfq_slice_offset(struct cfq_data *cfqd, @@ -843,7 +841,7 @@ static void cfq_add_rq_rb(struct request *rq) * check if this request is a better next-serve candidate */ prev = cfqq->next_rq; - cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq); + cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position); /* * adjust priority tree position, if ->next_rq changes @@ -950,6 +948,7 @@ static void cfq_merged_requests(struct request_queue *q, struct request *rq, struct request *next) { + struct cfq_queue *cfqq = RQ_CFQQ(rq); /* * reposition in fifo if next is older than rq */ @@ -959,6 +958,8 @@ cfq_merged_requests(struct request_queue *q, struct request *rq, rq_set_fifo_time(rq, rq_fifo_time(next)); } + if (cfqq->next_rq == next) + cfqq->next_rq = rq; cfq_remove_request(next); } -- cgit v1.2.2 From 86b37281411cf1e9bc0a6b5406c45edb7bd9ea5d Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Tue, 10 Nov 2009 11:50:21 +0100 Subject: block: Expose discard granularity While SSDs track block usage on a per-sector basis, RAID arrays often have allocation blocks that are bigger. Allow the discard granularity and alignment to be set and teach the topology stacking logic how to handle them. Signed-off-by: Martin K. 
Petersen Signed-off-by: Jens Axboe --- block/blk-settings.c | 46 ++++++++++++++++++++++++++++++++++++---------- block/blk-sysfs.c | 22 ++++++++++++++++++++++ block/genhd.c | 12 ++++++++++++ 3 files changed, 70 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index 66d4aa8799b7..7f986cafacd5 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -96,7 +96,10 @@ void blk_set_default_limits(struct queue_limits *lim) lim->max_segment_size = MAX_SEGMENT_SIZE; lim->max_sectors = BLK_DEF_MAX_SECTORS; lim->max_hw_sectors = INT_MAX; - lim->max_discard_sectors = SAFE_MAX_SECTORS; + lim->max_discard_sectors = 0; + lim->discard_granularity = 0; + lim->discard_alignment = 0; + lim->discard_misaligned = 0; lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); lim->alignment_offset = 0; @@ -488,6 +491,16 @@ void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) } EXPORT_SYMBOL(blk_queue_stack_limits); +static unsigned int lcm(unsigned int a, unsigned int b) +{ + if (a && b) + return (a * b) / gcd(a, b); + else if (b) + return b; + + return a; +} + /** * blk_stack_limits - adjust queue_limits for stacked devices * @t: the stacking driver limits (top) @@ -502,6 +515,10 @@ EXPORT_SYMBOL(blk_queue_stack_limits); int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t offset) { + int ret; + + ret = 0; + t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn); @@ -531,7 +548,13 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, if (offset && (offset & (b->physical_block_size - 1)) != b->alignment_offset) { t->misaligned = 1; - return -1; + ret = -1; + } + + if (offset && + (offset & (b->discard_granularity - 1)) != b->discard_alignment) { + t->discard_misaligned = 1; + ret = -1; } /* If top has no alignment offset, inherit from bottom */ @@ -539,23 +562,26 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->alignment_offset = b->alignment_offset & (b->physical_block_size - 1); + if (!t->discard_alignment) + t->discard_alignment = + b->discard_alignment & (b->discard_granularity - 1); + /* Top device aligned on logical block boundary? 
*/ if (t->alignment_offset & (t->logical_block_size - 1)) { t->misaligned = 1; - return -1; + ret = -1; } - /* Find lcm() of optimal I/O size */ - if (t->io_opt && b->io_opt) - t->io_opt = (t->io_opt * b->io_opt) / gcd(t->io_opt, b->io_opt); - else if (b->io_opt) - t->io_opt = b->io_opt; + /* Find lcm() of optimal I/O size and granularity */ + t->io_opt = lcm(t->io_opt, b->io_opt); + t->discard_granularity = lcm(t->discard_granularity, + b->discard_granularity); /* Verify that optimal I/O size is a multiple of io_min */ if (t->io_min && t->io_opt % t->io_min) - return -1; + ret = -1; - return 0; + return ret; } EXPORT_SYMBOL(blk_stack_limits); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 8a6d81afb284..3147145edc15 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -126,6 +126,16 @@ static ssize_t queue_io_opt_show(struct request_queue *q, char *page) return queue_var_show(queue_io_opt(q), page); } +static ssize_t queue_discard_granularity_show(struct request_queue *q, char *page) +{ + return queue_var_show(q->limits.discard_granularity, page); +} + +static ssize_t queue_discard_max_show(struct request_queue *q, char *page) +{ + return queue_var_show(q->limits.max_discard_sectors << 9, page); +} + static ssize_t queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) { @@ -293,6 +303,16 @@ static struct queue_sysfs_entry queue_io_opt_entry = { .show = queue_io_opt_show, }; +static struct queue_sysfs_entry queue_discard_granularity_entry = { + .attr = {.name = "discard_granularity", .mode = S_IRUGO }, + .show = queue_discard_granularity_show, +}; + +static struct queue_sysfs_entry queue_discard_max_entry = { + .attr = {.name = "discard_max_bytes", .mode = S_IRUGO }, + .show = queue_discard_max_show, +}; + static struct queue_sysfs_entry queue_nonrot_entry = { .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR }, .show = queue_nonrot_show, @@ -328,6 +348,8 @@ static struct attribute *default_attrs[] = { &queue_physical_block_size_entry.attr, &queue_io_min_entry.attr, &queue_io_opt_entry.attr, + &queue_discard_granularity_entry.attr, + &queue_discard_max_entry.attr, &queue_nonrot_entry.attr, &queue_nomerges_entry.attr, &queue_rq_affinity_entry.attr, diff --git a/block/genhd.c b/block/genhd.c index 517e4332cb37..b11a4ad7d571 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -861,12 +861,23 @@ static ssize_t disk_alignment_offset_show(struct device *dev, return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue)); } +static ssize_t disk_discard_alignment_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%u\n", queue_discard_alignment(disk->queue)); +} + static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL); +static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show, + NULL); static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); @@ -887,6 +898,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_ro.attr, &dev_attr_size.attr, &dev_attr_alignment_offset.attr, + 
&dev_attr_discard_alignment.attr, &dev_attr_capability.attr, &dev_attr_stat.attr, &dev_attr_inflight.attr, -- cgit v1.2.2 From ad5ebd2fa2557b04a653bb3c3377a47da8f9b8e9 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 11 Nov 2009 13:47:45 +0100 Subject: block: jiffies fixes Use HZ-independent calculation of milliseconds. Add jiffies.h where it was missing since functions or macros from it are used. Signed-off-by: Randy Dunlap Signed-off-by: Jens Axboe --- block/blk-settings.c | 3 ++- block/bsg.c | 3 ++- block/cfq-iosched.c | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index 7f986cafacd5..1ebc1fdb9144 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -8,6 +8,7 @@ #include #include /* for max_pfn/max_low_pfn */ #include +#include #include "blk.h" @@ -144,7 +145,7 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) q->nr_batching = BLK_BATCH_REQ; q->unplug_thresh = 4; /* hmm */ - q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */ + q->unplug_delay = msecs_to_jiffies(3); /* 3 milliseconds */ if (q->unplug_delay == 0) q->unplug_delay = 1; diff --git a/block/bsg.c b/block/bsg.c index 0676301f16d0..a9fd2d84b53a 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -197,7 +198,7 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, rq->cmd_len = hdr->request_len; rq->cmd_type = REQ_TYPE_BLOCK_PC; - rq->timeout = (hdr->timeout * HZ) / 1000; + rq->timeout = msecs_to_jiffies(hdr->timeout); if (!rq->timeout) rq->timeout = q->sg_timeout; if (!rq->timeout) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 829d87d3e00f..1bcbd8c79896 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.2 From 3586e917f2c7df769d173c4ec99554cb40a911e5 Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Thu, 26 Nov 2009 09:14:11 +0100 Subject: cfq: Make use of service count to estimate the rb_key offset For the moment, different workload cfq queues are put into different service trees. But CFQ still uses "busy_queues" to estimate rb_key offset when inserting a cfq queue into a service tree. I think this isn't appropriate, and it should make use of service tree count to do this estimation. This patch is for for-2.6.33 branch. Signed-off-by: Gui Jianfeng Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 1bcbd8c79896..467981e19d7a 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -600,11 +600,15 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq, static unsigned long cfq_slice_offset(struct cfq_data *cfqd, struct cfq_queue *cfqq) { + struct cfq_rb_root *service_tree; + + service_tree = service_tree_for(cfqq_prio(cfqq), cfqq_type(cfqq), cfqd); + /* * just an approximation, should be ok. 
*/ - return (cfqd->busy_queues - 1) * (cfq_prio_slice(cfqd, 1, 0) - - cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); + return service_tree->count * (cfq_prio_slice(cfqd, 1, 0) - + cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); } /* -- cgit v1.2.2 From 2d4dc890b5c8fabd818a8586607e6843c4375e62 Mon Sep 17 00:00:00 2001 From: Ilya Loginov Date: Thu, 26 Nov 2009 09:16:19 +0100 Subject: block: add helpers to run flush_dcache_page() against a bio and a request's pages Mtdblock driver doesn't call flush_dcache_page for pages in request. So, this causes problems on architectures where the icache doesn't fill from the dcache or with dcache aliases. The patch fixes this. The ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE symbol was introduced to avoid pointless empty cache-thrashing loops on architectures for which flush_dcache_page() is a no-op. Every architecture was provided with this flush pages on architectires where ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE is equal 1 or do nothing otherwise. See "fix mtd_blkdevs problem with caches on some architectures" discussion on LKML for more information. Signed-off-by: Ilya Loginov Cc: Ingo Molnar Cc: David Woodhouse Cc: Peter Horton Cc: "Ed L. Cashin" Signed-off-by: Jens Axboe --- block/blk-core.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 71da5111120c..718897e6d37f 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2358,6 +2358,25 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq, rq->rq_disk = bio->bi_bdev->bd_disk; } +#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE +/** + * rq_flush_dcache_pages - Helper function to flush all pages in a request + * @rq: the request to be flushed + * + * Description: + * Flush all pages in @rq. + */ +void rq_flush_dcache_pages(struct request *rq) +{ + struct req_iterator iter; + struct bio_vec *bvec; + + rq_for_each_segment(bvec, rq, iter) + flush_dcache_page(bvec->bv_page); +} +EXPORT_SYMBOL_GPL(rq_flush_dcache_pages); +#endif + /** * blk_lld_busy - Check if underlying low-level drivers of a device are busy * @q : the queue of the device being checked -- cgit v1.2.2 From c16632bab1a17e357cec66920ceb3f0630009360 Mon Sep 17 00:00:00 2001 From: Corrado Zoccolo Date: Thu, 26 Nov 2009 09:41:21 +0100 Subject: cfq-iosched: cleanup unreachable code cfq_should_idle returns false for no-idle queues that are not the last, so the control flow will never reach the removed code in a state that satisfies the if condition. The unreachable code was added to emulate previous cfq behaviour for non-NCQ rotational devices. My tests show that even without it, the performances and fairness are comparable with previous cfq, thanks to the fact that all seeky queues are grouped together, and that we idle at the end of the tree. Signed-off-by: Corrado Zoccolo Acked-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 467981e19d7a..c2ef5d17608c 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1269,19 +1269,6 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) cfq_mark_cfqq_wait_request(cfqq); sl = cfqd->cfq_slice_idle; - /* are we servicing noidle tree, and there are more queues? - * non-rotational or NCQ: no idle - * non-NCQ rotational : very small idle, to allow - * fair distribution of slice time for a process doing back-to-back - * seeks. 
- */ - if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD && - service_tree_for(cfqd->serving_prio, SYNC_NOIDLE_WORKLOAD, cfqd) - ->count > 0) { - if (blk_queue_nonrot(cfqd->queue) || cfqd->hw_tag) - return; - sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT)); - } mod_timer(&cfqd->idle_slice_timer, jiffies + sl); cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); -- cgit v1.2.2 From e459dd08f45d2aa68abb0c02f8ab045cf8a598b8 Mon Sep 17 00:00:00 2001 From: Corrado Zoccolo Date: Thu, 26 Nov 2009 10:02:57 +0100 Subject: cfq-iosched: fix ncq detection code CFQ's detection of queueing devices initially assumes a queuing device and detects if the queue depth reaches a certain threshold. However, it will reconsider this choice periodically. Unfortunately, if device is considered not queuing, CFQ will force a unit queue depth for some workloads, thus defeating the detection logic. This leads to poor performance on queuing hardware, since the idle window remains enabled. Given this premise, switching to hw_tag = 0 after we have proved at least once that the device is NCQ capable is not a good choice. The new detection code starts in an indeterminate state, in which CFQ behaves as if hw_tag = 1, and then, if for a long observation period we never saw large depth, we switch to hw_tag = 0, otherwise we stick to hw_tag = 1, without reconsidering it again. Signed-off-by: Corrado Zoccolo Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index c2ef5d17608c..47abd24617be 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -191,8 +191,14 @@ struct cfq_data { */ int rq_queued; int hw_tag; - int hw_tag_samples; - int rq_in_driver_peak; + /* + * hw_tag can be + * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection) + * 1 => NCQ is present (hw_tag_est_depth is the estimated max depth) + * 0 => no NCQ + */ + int hw_tag_est_depth; + unsigned int hw_tag_samples; /* * idle window management @@ -2518,8 +2524,11 @@ static void cfq_update_hw_tag(struct cfq_data *cfqd) { struct cfq_queue *cfqq = cfqd->active_queue; - if (rq_in_driver(cfqd) > cfqd->rq_in_driver_peak) - cfqd->rq_in_driver_peak = rq_in_driver(cfqd); + if (rq_in_driver(cfqd) > cfqd->hw_tag_est_depth) + cfqd->hw_tag_est_depth = rq_in_driver(cfqd); + + if (cfqd->hw_tag == 1) + return; if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN && rq_in_driver(cfqd) <= CFQ_HW_QUEUE_MIN) @@ -2538,13 +2547,10 @@ static void cfq_update_hw_tag(struct cfq_data *cfqd) if (cfqd->hw_tag_samples++ < 50) return; - if (cfqd->rq_in_driver_peak >= CFQ_HW_QUEUE_MIN) + if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN) cfqd->hw_tag = 1; else cfqd->hw_tag = 0; - - cfqd->hw_tag_samples = 0; - cfqd->rq_in_driver_peak = 0; } static void cfq_completed_request(struct request_queue *q, struct request *rq) @@ -2951,7 +2957,7 @@ static void *cfq_init_queue(struct request_queue *q) cfqd->cfq_slice_async_rq = cfq_slice_async_rq; cfqd->cfq_slice_idle = cfq_slice_idle; cfqd->cfq_latency = 1; - cfqd->hw_tag = 1; + cfqd->hw_tag = -1; cfqd->last_end_sync_rq = jiffies; return cfqd; } -- cgit v1.2.2 From e4a229196a7c676514c78f6783f8994f64bf681c Mon Sep 17 00:00:00 2001 From: Corrado Zoccolo Date: Thu, 26 Nov 2009 10:02:58 +0100 Subject: cfq-iosched: fix no-idle preemption logic An incoming no-idle queue should preempt the active no-idle queue only if the active queue is idling due to service tree empty. 
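Expressed as a single condition (an illustrative helper, not part of the patch; the fields and the cfqq_type() helper come from the surrounding CFQ code), the new rule amounts to:

	static bool noidle_preempt_ok(struct cfq_data *cfqd, struct cfq_queue *new_cfqq)
	{
		/* both queues on the sync-noidle workload, and the incoming queue is
		 * alone on that tree, i.e. the active queue idles on an empty tree */
		return cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
		       cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
		       new_cfqq->service_tree->count == 1;
	}
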
Previous code was buggy in two ways: * it relied on service_tree field to be set on the active queue, while it is not set when the code is idling for a new request * it didn't check for the service tree empty condition, so could lead to LIFO behaviour if multiple queues with depth > 1 were preempting each other on an non-NCQ device. Reported-by: Vivek Goyal Signed-off-by: Corrado Zoccolo Acked-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 47abd24617be..2c1086acddfa 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2392,8 +2392,9 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, if (cfq_class_idle(cfqq)) return true; - if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD - && new_cfqq->service_tree == cfqq->service_tree) + if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD && + cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD && + new_cfqq->service_tree->count == 1) return true; /* -- cgit v1.2.2 From 76280aff1c7e9ae761cac4b48591c43cd7d69159 Mon Sep 17 00:00:00 2001 From: Corrado Zoccolo Date: Thu, 26 Nov 2009 10:02:58 +0100 Subject: cfq-iosched: idling on deep seeky sync queues Seeky sync queues with large depth can gain unfairly big share of disk time, at the expense of other seeky queues. This patch ensures that idling will be enabled for queues with I/O depth at least 4, and small think time. The decision to enable idling is sticky, until an idle window times out without seeing a new request. The reasoning behind the decision is that, if an application is using large I/O depth, it is already optimized to make full utilization of the hardware, and therefore we reserve a slice of exclusive use for it. Reported-by: Vivek Goyal Signed-off-by: Corrado Zoccolo Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 2c1086acddfa..15f7238f527f 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -260,6 +260,7 @@ enum cfqq_state_flags { CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ CFQ_CFQQ_FLAG_sync, /* synchronous queue */ CFQ_CFQQ_FLAG_coop, /* cfqq is shared */ + CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */ }; #define CFQ_CFQQ_FNS(name) \ @@ -286,6 +287,7 @@ CFQ_CFQQ_FNS(prio_changed); CFQ_CFQQ_FNS(slice_new); CFQ_CFQQ_FNS(sync); CFQ_CFQQ_FNS(coop); +CFQ_CFQQ_FNS(deep); #undef CFQ_CFQQ_FNS #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) 
\ @@ -2350,8 +2352,12 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, enable_idle = old_idle = cfq_cfqq_idle_window(cfqq); + if (cfqq->queued[0] + cfqq->queued[1] >= 4) + cfq_mark_cfqq_deep(cfqq); + if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || - (sample_valid(cfqq->seek_samples) && CFQQ_SEEKY(cfqq))) + (!cfq_cfqq_deep(cfqq) && sample_valid(cfqq->seek_samples) + && CFQQ_SEEKY(cfqq))) enable_idle = 0; else if (sample_valid(cic->ttime_samples)) { if (cic->ttime_mean > cfqd->cfq_slice_idle) @@ -2849,6 +2855,11 @@ static void cfq_idle_slice_timer(unsigned long data) */ if (!RB_EMPTY_ROOT(&cfqq->sort_list)) goto out_kick; + + /* + * Queue depth flag is reset only when the idle didn't succeed + */ + cfq_clear_cfqq_deep(cfqq); } expire: cfq_slice_expired(cfqd, timed_out); -- cgit v1.2.2 From 8e550632cccae34e265cb066691945515eaa7fb5 Mon Sep 17 00:00:00 2001 From: Corrado Zoccolo Date: Thu, 26 Nov 2009 10:02:58 +0100 Subject: cfq-iosched: fix corner cases in idling logic Idling logic was disabled in some corner cases, leading to unfair share for noidle queues. * the idle timer was not armed if there were other requests in the driver. unfortunately, those requests could come from other workloads, or queues for which we don't enable idling. So we will check only pending requests from the active queue * rq_noidle check on no-idle queue could disable the end of tree idle if the last completed request was rq_noidle. Now, we will disable that idle only if all the queues served in the no-idle tree had rq_noidle requests. Reported-by: Vivek Goyal Signed-off-by: Corrado Zoccolo Acked-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 15f7238f527f..a5de31f76d3b 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -172,6 +172,7 @@ struct cfq_data { enum wl_prio_t serving_prio; enum wl_type_t serving_type; unsigned long workload_expires; + bool noidle_tree_requires_idle; /* * Each priority tree is sorted by next_request position. These @@ -1253,9 +1254,9 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) return; /* - * still requests with the driver, don't idle + * still active requests from this queue, don't idle */ - if (rq_in_driver(cfqd)) + if (cfqq->dispatched) return; /* @@ -1478,6 +1479,7 @@ static void choose_service_tree(struct cfq_data *cfqd) slice = max_t(unsigned, slice, CFQ_MIN_TT); cfqd->workload_expires = jiffies + slice; + cfqd->noidle_tree_requires_idle = false; } /* @@ -2597,17 +2599,27 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfq_clear_cfqq_slice_new(cfqq); } /* - * If there are no requests waiting in this queue, and - * there are other queues ready to issue requests, AND - * those other queues are issuing requests within our - * mean seek distance, give them a chance to run instead - * of idling. 
+ * Idling is not enabled on: + * - expired queues + * - idle-priority queues + * - async queues + * - queues with still some requests queued + * - when there is a close cooperator */ if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) cfq_slice_expired(cfqd, 1); - else if (cfqq_empty && !cfq_close_cooperator(cfqd, cfqq) && - sync && !rq_noidle(rq)) - cfq_arm_slice_timer(cfqd); + else if (sync && cfqq_empty && + !cfq_close_cooperator(cfqd, cfqq)) { + cfqd->noidle_tree_requires_idle |= !rq_noidle(rq); + /* + * Idling is enabled for SYNC_WORKLOAD. + * SYNC_NOIDLE_WORKLOAD idles at the end of the tree + * only if we processed at least one !rq_noidle request + */ + if (cfqd->serving_type == SYNC_WORKLOAD + || cfqd->noidle_tree_requires_idle) + cfq_arm_slice_timer(cfqd); + } } if (!rq_in_driver(cfqd)) -- cgit v1.2.2 From 464191c65b85a8ec68a6e1a6293af625287c807e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 30 Nov 2009 09:38:13 +0100 Subject: Revert "cfq: Make use of service count to estimate the rb_key offset" This reverts commit 3586e917f2c7df769d173c4ec99554cb40a911e5. Corrado Zoccolo correctly points out, that we need consistency of rb_key offset across groups. This means we cannot properly use the per-service_tree service count. Revert this change. Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index a5de31f76d3b..71446497d7b6 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -609,15 +609,11 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq, static unsigned long cfq_slice_offset(struct cfq_data *cfqd, struct cfq_queue *cfqq) { - struct cfq_rb_root *service_tree; - - service_tree = service_tree_for(cfqq_prio(cfqq), cfqq_type(cfqq), cfqd); - /* * just an approximation, should be ok. */ - return service_tree->count * (cfq_prio_slice(cfqd, 1, 0) - - cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); + return (cfqd->busy_queues - 1) * (cfq_prio_slice(cfqd, 1, 0) - + cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); } /* -- cgit v1.2.2 From 98262f2762f0067375f83824d81ea929e37e6bfe Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Thu, 3 Dec 2009 09:24:48 +0100 Subject: block: Allow devices to indicate whether discarded blocks are zeroed The discard ioctl is used by mkfs utilities to clear a block device prior to putting metadata down. However, not all devices return zeroed blocks after a discard. Some drives return stale data, potentially containing old superblocks. It is therefore important to know whether discarded blocks are properly zeroed. Both ATA and SCSI drives have configuration bits that indicate whether zeroes are returned after a discard operation. Implement a block level interface that allows this information to be bubbled up the stack and queried via a new block device ioctl. Signed-off-by: Martin K. 
Petersen Signed-off-by: Jens Axboe --- block/blk-settings.c | 2 ++ block/blk-sysfs.c | 11 +++++++++++ block/compat_ioctl.c | 2 ++ block/ioctl.c | 2 ++ 4 files changed, 17 insertions(+) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index 1ebc1fdb9144..dd1f1e0e196f 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -101,6 +101,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->discard_granularity = 0; lim->discard_alignment = 0; lim->discard_misaligned = 0; + lim->discard_zeroes_data = -1; lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); lim->alignment_offset = 0; @@ -544,6 +545,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->io_min = max(t->io_min, b->io_min); t->no_cluster |= b->no_cluster; + t->discard_zeroes_data &= b->discard_zeroes_data; /* Bottom device offset aligned? */ if (offset && diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 3147145edc15..8606c9543fdd 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -136,6 +136,11 @@ static ssize_t queue_discard_max_show(struct request_queue *q, char *page) return queue_var_show(q->limits.max_discard_sectors << 9, page); } +static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page) +{ + return queue_var_show(queue_discard_zeroes_data(q), page); +} + static ssize_t queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) { @@ -313,6 +318,11 @@ static struct queue_sysfs_entry queue_discard_max_entry = { .show = queue_discard_max_show, }; +static struct queue_sysfs_entry queue_discard_zeroes_data_entry = { + .attr = {.name = "discard_zeroes_data", .mode = S_IRUGO }, + .show = queue_discard_zeroes_data_show, +}; + static struct queue_sysfs_entry queue_nonrot_entry = { .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR }, .show = queue_nonrot_show, @@ -350,6 +360,7 @@ static struct attribute *default_attrs[] = { &queue_io_opt_entry.attr, &queue_discard_granularity_entry.attr, &queue_discard_max_entry.attr, + &queue_discard_zeroes_data_entry.attr, &queue_nonrot_entry.attr, &queue_nomerges_entry.attr, &queue_rq_affinity_entry.attr, diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 9bd086c1a4d5..4eb8e9ea4af5 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -747,6 +747,8 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) return compat_put_uint(arg, bdev_io_opt(bdev)); case BLKALIGNOFF: return compat_put_int(arg, bdev_alignment_offset(bdev)); + case BLKDISCARDZEROES: + return compat_put_uint(arg, bdev_discard_zeroes_data(bdev)); case BLKFLSBUF: case BLKROSET: case BLKDISCARD: diff --git a/block/ioctl.c b/block/ioctl.c index 1f4d1de12b09..be48ea51faee 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -280,6 +280,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, return put_uint(arg, bdev_io_opt(bdev)); case BLKALIGNOFF: return put_int(arg, bdev_alignment_offset(bdev)); + case BLKDISCARDZEROES: + return put_uint(arg, bdev_discard_zeroes_data(bdev)); case BLKSECTGET: return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev))); case BLKRASET: -- cgit v1.2.2 From 474b18ccc264c472abeec50f48469b6477202699 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 3 Dec 2009 12:58:05 +0100 Subject: cfq-iosched: no dispatch limit for single queue Since commit 2f5cb7381b737e24c8046fd4aeab571fb71315f5, each queue can send up to 4 * 4 requests 
if only one queue exists. I wonder why we have such limit. Device supports tag can send more requests. For example, AHCI can send 31 requests. Test (direct aio randread) shows the limits reduce about 4% disk thoughput. On the other hand, since we send one request one time, if other queue pop when current is sending more than cfq_quantum requests, current queue will stop send requests soon after one request, so sounds there is no big latency. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 71446497d7b6..f5b59e18ebd3 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1618,9 +1618,9 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) return false; /* - * Sole queue user, allow bigger slice + * Sole queue user, no limit */ - max_dispatch *= 4; + max_dispatch = -1; } /* -- cgit v1.2.2 From bf7919371025412978268efca4b09dd847acb395 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:37 -0500 Subject: blkio: Set must_dispatch only if we decided to not dispatch the request o must_dispatch flag should be set only if we decided not to run the queue and dispatch the request. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index f5b59e18ebd3..15b53616516a 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2490,9 +2490,9 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || cfqd->busy_queues > 1) { del_timer(&cfqd->idle_slice_timer); - __blk_run_queue(cfqd->queue); - } - cfq_mark_cfqq_must_dispatch(cfqq); + __blk_run_queue(cfqd->queue); + } else + cfq_mark_cfqq_must_dispatch(cfqq); } } else if (cfq_should_preempt(cfqd, cfqq, rq)) { /* -- cgit v1.2.2 From cdb16e8f739985b8a5c9f4569b026583bbcd01a5 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:38 -0500 Subject: blkio: Introduce the notion of cfq groups o This patch introduce the notion of cfq groups. Soon we will can have multiple groups of different weights in the system. o Various service trees (prioclass and workload type trees), will become per cfq group. So hierarchy looks as follows. cfq_groups | workload type | cfq queue o When an scheduling decision has to be taken, first we select the cfq group then workload with-in the group and then cfq queue with-in the workload type. o This patch just makes various workload service tree per cfq group and introduce the function to be able to choose a group for scheduling. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 108 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 75 insertions(+), 33 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 15b53616516a..a4d17265411e 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -132,6 +132,7 @@ struct cfq_queue { struct cfq_rb_root *service_tree; struct cfq_queue *new_cfqq; + struct cfq_group *cfqg; }; /* @@ -153,25 +154,30 @@ enum wl_type_t { SYNC_WORKLOAD = 2 }; +/* This is per cgroup per device grouping structure */ +struct cfq_group { + /* + * rr lists of queues with requests, onle rr for each priority class. 
+ * Counts are embedded in the cfq_rb_root + */ + struct cfq_rb_root service_trees[2][3]; + struct cfq_rb_root service_tree_idle; +}; /* * Per block device queue structure */ struct cfq_data { struct request_queue *queue; + struct cfq_group root_group; - /* - * rr lists of queues with requests, onle rr for each priority class. - * Counts are embedded in the cfq_rb_root - */ - struct cfq_rb_root service_trees[2][3]; - struct cfq_rb_root service_tree_idle; /* * The priority currently being served */ enum wl_prio_t serving_prio; enum wl_type_t serving_type; unsigned long workload_expires; + struct cfq_group *serving_group; bool noidle_tree_requires_idle; /* @@ -240,14 +246,15 @@ struct cfq_data { unsigned long last_end_sync_rq; }; -static struct cfq_rb_root *service_tree_for(enum wl_prio_t prio, +static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg, + enum wl_prio_t prio, enum wl_type_t type, struct cfq_data *cfqd) { if (prio == IDLE_WORKLOAD) - return &cfqd->service_tree_idle; + return &cfqg->service_tree_idle; - return &cfqd->service_trees[prio][type]; + return &cfqg->service_trees[prio][type]; } enum cfqq_state_flags { @@ -317,12 +324,14 @@ static enum wl_type_t cfqq_type(struct cfq_queue *cfqq) static inline int cfq_busy_queues_wl(enum wl_prio_t wl, struct cfq_data *cfqd) { + struct cfq_group *cfqg = &cfqd->root_group; + if (wl == IDLE_WORKLOAD) - return cfqd->service_tree_idle.count; + return cfqg->service_tree_idle.count; - return cfqd->service_trees[wl][ASYNC_WORKLOAD].count - + cfqd->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count - + cfqd->service_trees[wl][SYNC_WORKLOAD].count; + return cfqg->service_trees[wl][ASYNC_WORKLOAD].count + + cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count + + cfqg->service_trees[wl][SYNC_WORKLOAD].count; } static void cfq_dispatch_insert(struct request_queue *, struct request *); @@ -612,7 +621,7 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd, /* * just an approximation, should be ok. */ - return (cfqd->busy_queues - 1) * (cfq_prio_slice(cfqd, 1, 0) - + return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) - cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); } @@ -630,7 +639,8 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct cfq_rb_root *service_tree; int left; - service_tree = service_tree_for(cfqq_prio(cfqq), cfqq_type(cfqq), cfqd); + service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), + cfqq_type(cfqq), cfqd); if (cfq_class_idle(cfqq)) { rb_key = CFQ_IDLE_DELAY; parent = rb_last(&service_tree->rb); @@ -1066,7 +1076,8 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out) static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) { struct cfq_rb_root *service_tree = - service_tree_for(cfqd->serving_prio, cfqd->serving_type, cfqd); + service_tree_for(cfqd->serving_group, cfqd->serving_prio, + cfqd->serving_type, cfqd); if (RB_EMPTY_ROOT(&service_tree->rb)) return NULL; @@ -1218,7 +1229,8 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) * in their service tree. 
*/ if (!service_tree) - service_tree = service_tree_for(prio, cfqq_type(cfqq), cfqd); + service_tree = service_tree_for(cfqq->cfqg, prio, + cfqq_type(cfqq), cfqd); if (service_tree->count == 0) return true; @@ -1377,8 +1389,9 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) } } -static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, enum wl_prio_t prio, - bool prio_changed) +static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, + struct cfq_group *cfqg, enum wl_prio_t prio, + bool prio_changed) { struct cfq_queue *queue; int i; @@ -1392,10 +1405,10 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, enum wl_prio_t prio, * from SYNC_NOIDLE (first choice), or just SYNC * over ASYNC */ - if (service_tree_for(prio, cur_best, cfqd)->count) + if (service_tree_for(cfqg, prio, cur_best, cfqd)->count) return cur_best; cur_best = SYNC_WORKLOAD; - if (service_tree_for(prio, cur_best, cfqd)->count) + if (service_tree_for(cfqg, prio, cur_best, cfqd)->count) return cur_best; return ASYNC_WORKLOAD; @@ -1403,7 +1416,7 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, enum wl_prio_t prio, for (i = 0; i < 3; ++i) { /* otherwise, select the one with lowest rb_key */ - queue = cfq_rb_first(service_tree_for(prio, i, cfqd)); + queue = cfq_rb_first(service_tree_for(cfqg, prio, i, cfqd)); if (queue && (!key_valid || time_before(queue->rb_key, lowest_key))) { lowest_key = queue->rb_key; @@ -1415,12 +1428,13 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, enum wl_prio_t prio, return cur_best; } -static void choose_service_tree(struct cfq_data *cfqd) +static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) { enum wl_prio_t previous_prio = cfqd->serving_prio; bool prio_changed; unsigned slice; unsigned count; + struct cfq_rb_root *st; /* Choose next priority. RT > BE > IDLE */ if (cfq_busy_queues_wl(RT_WORKLOAD, cfqd)) @@ -1439,8 +1453,9 @@ static void choose_service_tree(struct cfq_data *cfqd) * expiration time */ prio_changed = (cfqd->serving_prio != previous_prio); - count = service_tree_for(cfqd->serving_prio, cfqd->serving_type, cfqd) - ->count; + st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type, + cfqd); + count = st->count; /* * If priority didn't change, check workload expiration, @@ -1452,9 +1467,10 @@ static void choose_service_tree(struct cfq_data *cfqd) /* otherwise select new workload type */ cfqd->serving_type = - cfq_choose_wl(cfqd, cfqd->serving_prio, prio_changed); - count = service_tree_for(cfqd->serving_prio, cfqd->serving_type, cfqd) - ->count; + cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio, prio_changed); + st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type, + cfqd); + count = st->count; /* * the workload slice is computed as a fraction of target latency @@ -1478,6 +1494,12 @@ static void choose_service_tree(struct cfq_data *cfqd) cfqd->noidle_tree_requires_idle = false; } +static void cfq_choose_cfqg(struct cfq_data *cfqd) +{ + cfqd->serving_group = &cfqd->root_group; + choose_service_tree(cfqd, &cfqd->root_group); +} + /* * Select a queue for service. If we have a current active queue, * check whether to continue servicing it, or retrieve and set a new one. 
@@ -1535,7 +1557,7 @@ new_queue: * service tree */ if (!new_cfqq) - choose_service_tree(cfqd); + cfq_choose_cfqg(cfqd); cfqq = cfq_set_active_queue(cfqd, new_cfqq); keep_queue: @@ -1564,13 +1586,15 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd) struct cfq_queue *cfqq; int dispatched = 0; int i, j; + struct cfq_group *cfqg = &cfqd->root_group; + for (i = 0; i < 2; ++i) for (j = 0; j < 3; ++j) - while ((cfqq = cfq_rb_first(&cfqd->service_trees[i][j])) + while ((cfqq = cfq_rb_first(&cfqg->service_trees[i][j])) != NULL) dispatched += __cfq_forced_dispatch_cfqq(cfqq); - while ((cfqq = cfq_rb_first(&cfqd->service_tree_idle)) != NULL) + while ((cfqq = cfq_rb_first(&cfqg->service_tree_idle)) != NULL) dispatched += __cfq_forced_dispatch_cfqq(cfqq); cfq_slice_expired(cfqd, 0); @@ -2041,14 +2065,26 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqq->pid = pid; } +static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) +{ + cfqq->cfqg = cfqg; +} + +static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) +{ + return &cfqd->root_group; +} + static struct cfq_queue * cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc, gfp_t gfp_mask) { struct cfq_queue *cfqq, *new_cfqq = NULL; struct cfq_io_context *cic; + struct cfq_group *cfqg; retry: + cfqg = cfq_get_cfqg(cfqd, 1); cic = cfq_cic_lookup(cfqd, ioc); /* cic always exists here */ cfqq = cic_to_cfqq(cic, is_sync); @@ -2079,6 +2115,7 @@ retry: if (cfqq) { cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); cfq_init_prio_data(cfqq, ioc); + cfq_link_cfqq_cfqg(cfqq, cfqg); cfq_log_cfqq(cfqd, cfqq, "alloced"); } else cfqq = &cfqd->oom_cfqq; @@ -2931,15 +2968,19 @@ static void *cfq_init_queue(struct request_queue *q) { struct cfq_data *cfqd; int i, j; + struct cfq_group *cfqg; cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); if (!cfqd) return NULL; + /* Init root group */ + cfqg = &cfqd->root_group; + for (i = 0; i < 2; ++i) for (j = 0; j < 3; ++j) - cfqd->service_trees[i][j] = CFQ_RB_ROOT; - cfqd->service_tree_idle = CFQ_RB_ROOT; + cfqg->service_trees[i][j] = CFQ_RB_ROOT; + cfqg->service_tree_idle = CFQ_RB_ROOT; /* * Not strictly needed (since RB_ROOT just clears the node and we @@ -2956,6 +2997,7 @@ static void *cfq_init_queue(struct request_queue *q) */ cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); atomic_inc(&cfqd->oom_cfqq.ref); + cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group); INIT_LIST_HEAD(&cfqd->cic_list); -- cgit v1.2.2 From 615f0259e6940293359a189f4881bb28c2fea40b Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:39 -0500 Subject: blkio: Implement macro to traverse each service tree in group o Implement a macro to traverse each service tree in the group. This avoids usage of double for loop and special condition for idle tree 4 times. o Macro is little twisted because of special handling of idle class service tree. 
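For illustration only (this open-coded helper is not part of the patch; it relies on the enum values BE_WORKLOAD=0, RT_WORKLOAD=1, IDLE_WORKLOAD=2 and ASYNC_WORKLOAD..SYNC_WORKLOAD defined in cfq-iosched.c), the macro visits the same seven trees as this explicit walk:

	static void visit_all_service_trees(struct cfq_group *cfqg,
					    void (*visit)(struct cfq_rb_root *st))
	{
		int i, j;

		/* BE and RT classes each have ASYNC, SYNC_NOIDLE and SYNC trees */
		for (i = BE_WORKLOAD; i <= RT_WORKLOAD; i++)
			for (j = ASYNC_WORKLOAD; j <= SYNC_WORKLOAD; j++)
				visit(&cfqg->service_trees[i][j]);

		/* the idle class keeps a single dedicated tree */
		visit(&cfqg->service_tree_idle);
	}
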
Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index a4d17265411e..fab2be0fa215 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -140,9 +140,9 @@ struct cfq_queue { * IDLE is handled separately, so it has negative index */ enum wl_prio_t { - IDLE_WORKLOAD = -1, BE_WORKLOAD = 0, - RT_WORKLOAD = 1 + RT_WORKLOAD = 1, + IDLE_WORKLOAD = 2, }; /* @@ -303,6 +303,17 @@ CFQ_CFQQ_FNS(deep); #define cfq_log(cfqd, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) +/* Traverses through cfq group service trees */ +#define for_each_cfqg_st(cfqg, i, j, st) \ + for (i = 0; i <= IDLE_WORKLOAD; i++) \ + for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\ + : &cfqg->service_tree_idle; \ + (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \ + (i == IDLE_WORKLOAD && j == 0); \ + j++, st = i < IDLE_WORKLOAD ? \ + &cfqg->service_trees[i][j]: NULL) \ + + static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq) { if (cfq_class_idle(cfqq)) @@ -565,6 +576,10 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, */ static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root) { + /* Service tree is empty */ + if (!root->count) + return NULL; + if (!root->left) root->left = rb_first(&root->rb); @@ -1587,18 +1602,14 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd) int dispatched = 0; int i, j; struct cfq_group *cfqg = &cfqd->root_group; + struct cfq_rb_root *st; - for (i = 0; i < 2; ++i) - for (j = 0; j < 3; ++j) - while ((cfqq = cfq_rb_first(&cfqg->service_trees[i][j])) - != NULL) - dispatched += __cfq_forced_dispatch_cfqq(cfqq); - - while ((cfqq = cfq_rb_first(&cfqg->service_tree_idle)) != NULL) - dispatched += __cfq_forced_dispatch_cfqq(cfqq); + for_each_cfqg_st(cfqg, i, j, st) { + while ((cfqq = cfq_rb_first(st)) != NULL) + dispatched += __cfq_forced_dispatch_cfqq(cfqq); + } cfq_slice_expired(cfqd, 0); - BUG_ON(cfqd->busy_queues); cfq_log(cfqd, "forced_dispatch=%d", dispatched); @@ -2969,6 +2980,7 @@ static void *cfq_init_queue(struct request_queue *q) struct cfq_data *cfqd; int i, j; struct cfq_group *cfqg; + struct cfq_rb_root *st; cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); if (!cfqd) @@ -2976,11 +2988,8 @@ static void *cfq_init_queue(struct request_queue *q) /* Init root group */ cfqg = &cfqd->root_group; - - for (i = 0; i < 2; ++i) - for (j = 0; j < 3; ++j) - cfqg->service_trees[i][j] = CFQ_RB_ROOT; - cfqg->service_tree_idle = CFQ_RB_ROOT; + for_each_cfqg_st(cfqg, i, j, st) + *st = CFQ_RB_ROOT; /* * Not strictly needed (since RB_ROOT just clears the node and we -- cgit v1.2.2 From f04a64246344ad50e4b4b4186174a0912d07f30b Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:40 -0500 Subject: blkio: Keep queue on service tree until we expire it o Currently cfqq deletes a queue from service tree if it is empty (even if we might idle on the queue). This patch keeps the queue on service tree hence associated group remains on the service tree until we decide that we are not going to idle on the queue and expire it. o This just helps in time accounting for queue/group and in implementation of rest of the patches. 
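Read together, the two halves of the change amount to the following rough sketch (the wrapper names are invented for illustration; the bodies mirror the hunks below):

	/* last request gone: the queue stays on its service tree, since we may
	 * still idle on it; only the sector-sorted prio tree position is dropped */
	static void sketch_on_queue_emptied(struct cfq_queue *cfqq)
	{
		if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {
			if (cfqq->p_root) {
				rb_erase(&cfqq->p_node, cfqq->p_root);
				cfqq->p_root = NULL;
			}
		}
	}

	/* the service tree (and with it the group) is only left at expiry time */
	static void sketch_on_queue_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq)
	{
		if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
			cfq_del_cfqq_rr(cfqd, cfqq);
	}
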
Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 70 +++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 49 insertions(+), 21 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index fab2be0fa215..7f5646ac9f5d 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -393,7 +393,7 @@ static int cfq_queue_empty(struct request_queue *q) { struct cfq_data *cfqd = q->elevator->elevator_data; - return !cfqd->busy_queues; + return !cfqd->rq_queued; } /* @@ -842,7 +842,6 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) static void cfq_del_rq_rb(struct request *rq) { struct cfq_queue *cfqq = RQ_CFQQ(rq); - struct cfq_data *cfqd = cfqq->cfqd; const int sync = rq_is_sync(rq); BUG_ON(!cfqq->queued[sync]); @@ -850,8 +849,17 @@ static void cfq_del_rq_rb(struct request *rq) elv_rb_del(&cfqq->sort_list, rq); - if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) - cfq_del_cfqq_rr(cfqd, cfqq); + if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) { + /* + * Queue will be deleted from service tree when we actually + * expire it later. Right now just remove it from prio tree + * as it is empty. + */ + if (cfqq->p_root) { + rb_erase(&cfqq->p_node, cfqq->p_root); + cfqq->p_root = NULL; + } + } } static void cfq_add_rq_rb(struct request *rq) @@ -1065,6 +1073,9 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); } + if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) + cfq_del_cfqq_rr(cfqd, cfqq); + cfq_resort_rr_list(cfqd, cfqq); if (cfqq == cfqd->active_queue) @@ -1094,11 +1105,30 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) service_tree_for(cfqd->serving_group, cfqd->serving_prio, cfqd->serving_type, cfqd); + if (!cfqd->rq_queued) + return NULL; + if (RB_EMPTY_ROOT(&service_tree->rb)) return NULL; return cfq_rb_first(service_tree); } +static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd) +{ + struct cfq_group *cfqg = &cfqd->root_group; + struct cfq_queue *cfqq; + int i, j; + struct cfq_rb_root *st; + + if (!cfqd->rq_queued) + return NULL; + + for_each_cfqg_st(cfqg, i, j, st) + if ((cfqq = cfq_rb_first(st)) != NULL) + return cfqq; + return NULL; +} + /* * Get and set a new active queue for service. */ @@ -1231,6 +1261,9 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) enum wl_prio_t prio = cfqq_prio(cfqq); struct cfq_rb_root *service_tree = cfqq->service_tree; + BUG_ON(!service_tree); + BUG_ON(!service_tree->count); + /* We never do for idle class queues. */ if (prio == IDLE_WORKLOAD) return false; @@ -1243,14 +1276,7 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) * Otherwise, we do only if they are the last ones * in their service tree. */ - if (!service_tree) - service_tree = service_tree_for(cfqq->cfqg, prio, - cfqq_type(cfqq), cfqd); - - if (service_tree->count == 0) - return true; - - return (service_tree->count == 1 && cfq_rb_first(service_tree) == cfqq); + return service_tree->count == 1; } static void cfq_arm_slice_timer(struct cfq_data *cfqd) @@ -1527,6 +1553,8 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) if (!cfqq) goto new_queue; + if (!cfqd->rq_queued) + return NULL; /* * The active queue has run out of time, expire it and select new. 
*/ @@ -1589,6 +1617,9 @@ static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq) } BUG_ON(!list_empty(&cfqq->fifo)); + + /* By default cfqq is not expired if it is empty. Do it explicitly */ + __cfq_slice_expired(cfqq->cfqd, cfqq, 0); return dispatched; } @@ -1600,14 +1631,9 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd) { struct cfq_queue *cfqq; int dispatched = 0; - int i, j; - struct cfq_group *cfqg = &cfqd->root_group; - struct cfq_rb_root *st; - for_each_cfqg_st(cfqg, i, j, st) { - while ((cfqq = cfq_rb_first(st)) != NULL) - dispatched += __cfq_forced_dispatch_cfqq(cfqq); - } + while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) + dispatched += __cfq_forced_dispatch_cfqq(cfqq); cfq_slice_expired(cfqd, 0); BUG_ON(cfqd->busy_queues); @@ -1776,13 +1802,13 @@ static void cfq_put_queue(struct cfq_queue *cfqq) cfq_log_cfqq(cfqd, cfqq, "put_queue"); BUG_ON(rb_first(&cfqq->sort_list)); BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); - BUG_ON(cfq_cfqq_on_rr(cfqq)); if (unlikely(cfqd->active_queue == cfqq)) { __cfq_slice_expired(cfqd, cfqq, 0); cfq_schedule_dispatch(cfqd); } + BUG_ON(cfq_cfqq_on_rr(cfqq)); kmem_cache_free(cfq_pool, cfqq); } @@ -2444,9 +2470,11 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, if (cfq_class_idle(cfqq)) return true; + /* Allow preemption only if we are idling on sync-noidle tree */ if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD && cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD && - new_cfqq->service_tree->count == 1) + new_cfqq->service_tree->count == 2 && + RB_EMPTY_ROOT(&cfqq->sort_list)) return true; /* -- cgit v1.2.2 From 1fa8f6d68b5c8ca0a608fd8d296c5f07ac788cd6 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:41 -0500 Subject: blkio: Introduce the root service tree for cfq groups o So far we just had one cfq_group in cfq_data. To create space for more than one cfq_group, we need to have a service tree of groups where all the groups can be queued if they have active cfq queues backlogged in these. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 137 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 134 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 7f5646ac9f5d..e1f822ac4690 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -66,6 +66,7 @@ static DEFINE_SPINLOCK(ioc_gone_lock); #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) #define sample_valid(samples) ((samples) > 80) +#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) /* * Most of our rbtree usage is for sorting with min extraction, so @@ -77,8 +78,9 @@ struct cfq_rb_root { struct rb_root rb; struct rb_node *left; unsigned count; + u64 min_vdisktime; }; -#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, 0, } +#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, 0, 0, } /* * Per process-grouping structure @@ -156,6 +158,16 @@ enum wl_type_t { /* This is per cgroup per device grouping structure */ struct cfq_group { + /* group service_tree member */ + struct rb_node rb_node; + + /* group service_tree key */ + u64 vdisktime; + bool on_st; + + /* number of cfqq currently on this group */ + int nr_cfqq; + /* * rr lists of queues with requests, onle rr for each priority class. 
* Counts are embedded in the cfq_rb_root @@ -169,6 +181,8 @@ struct cfq_group { */ struct cfq_data { struct request_queue *queue; + /* Root service tree for cfq_groups */ + struct cfq_rb_root grp_service_tree; struct cfq_group root_group; /* @@ -251,6 +265,9 @@ static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg, enum wl_type_t type, struct cfq_data *cfqd) { + if (!cfqg) + return NULL; + if (prio == IDLE_WORKLOAD) return &cfqg->service_tree_idle; @@ -589,6 +606,17 @@ static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root) return NULL; } +static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root) +{ + if (!root->left) + root->left = rb_first(&root->rb); + + if (root->left) + return rb_entry_cfqg(root->left); + + return NULL; +} + static void rb_erase_init(struct rb_node *n, struct rb_root *root) { rb_erase(n, root); @@ -640,6 +668,83 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd, cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); } +static inline s64 +cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg) +{ + return cfqg->vdisktime - st->min_vdisktime; +} + +static void +__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) +{ + struct rb_node **node = &st->rb.rb_node; + struct rb_node *parent = NULL; + struct cfq_group *__cfqg; + s64 key = cfqg_key(st, cfqg); + int left = 1; + + while (*node != NULL) { + parent = *node; + __cfqg = rb_entry_cfqg(parent); + + if (key < cfqg_key(st, __cfqg)) + node = &parent->rb_left; + else { + node = &parent->rb_right; + left = 0; + } + } + + if (left) + st->left = &cfqg->rb_node; + + rb_link_node(&cfqg->rb_node, parent, node); + rb_insert_color(&cfqg->rb_node, &st->rb); +} + +static void +cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) +{ + struct cfq_rb_root *st = &cfqd->grp_service_tree; + struct cfq_group *__cfqg; + struct rb_node *n; + + cfqg->nr_cfqq++; + if (cfqg->on_st) + return; + + /* + * Currently put the group at the end. Later implement something + * so that groups get lesser vtime based on their weights, so that + * if group does not loose all if it was not continously backlogged. + */ + n = rb_last(&st->rb); + if (n) { + __cfqg = rb_entry_cfqg(n); + cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY; + } else + cfqg->vdisktime = st->min_vdisktime; + + __cfq_group_service_tree_add(st, cfqg); + cfqg->on_st = true; +} + +static void +cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) +{ + struct cfq_rb_root *st = &cfqd->grp_service_tree; + + BUG_ON(cfqg->nr_cfqq < 1); + cfqg->nr_cfqq--; + /* If there are other cfq queues under this group, don't delete it */ + if (cfqg->nr_cfqq) + return; + + cfqg->on_st = false; + if (!RB_EMPTY_NODE(&cfqg->rb_node)) + cfq_rb_erase(&cfqg->rb_node, st); +} + /* * The cfqd->service_trees holds all pending cfq_queue's that have * requests waiting to be processed. 
It is sorted in the order that @@ -722,6 +827,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, rb_link_node(&cfqq->rb_node, parent, p); rb_insert_color(&cfqq->rb_node, &service_tree->rb); service_tree->count++; + cfq_group_service_tree_add(cfqd, cfqq->cfqg); } static struct cfq_queue * @@ -832,6 +938,7 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) cfqq->p_root = NULL; } + cfq_group_service_tree_del(cfqd, cfqq->cfqg); BUG_ON(!cfqd->busy_queues); cfqd->busy_queues--; } @@ -1108,6 +1215,9 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) if (!cfqd->rq_queued) return NULL; + /* There is nothing to dispatch */ + if (!service_tree) + return NULL; if (RB_EMPTY_ROOT(&service_tree->rb)) return NULL; return cfq_rb_first(service_tree); @@ -1477,6 +1587,12 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) unsigned count; struct cfq_rb_root *st; + if (!cfqg) { + cfqd->serving_prio = IDLE_WORKLOAD; + cfqd->workload_expires = jiffies + 1; + return; + } + /* Choose next priority. RT > BE > IDLE */ if (cfq_busy_queues_wl(RT_WORKLOAD, cfqd)) cfqd->serving_prio = RT_WORKLOAD; @@ -1535,10 +1651,21 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) cfqd->noidle_tree_requires_idle = false; } +static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) +{ + struct cfq_rb_root *st = &cfqd->grp_service_tree; + + if (RB_EMPTY_ROOT(&st->rb)) + return NULL; + return cfq_rb_first_group(st); +} + static void cfq_choose_cfqg(struct cfq_data *cfqd) { - cfqd->serving_group = &cfqd->root_group; - choose_service_tree(cfqd, &cfqd->root_group); + struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd); + + cfqd->serving_group = cfqg; + choose_service_tree(cfqd, cfqg); } /* @@ -3014,10 +3141,14 @@ static void *cfq_init_queue(struct request_queue *q) if (!cfqd) return NULL; + /* Init root service tree */ + cfqd->grp_service_tree = CFQ_RB_ROOT; + /* Init root group */ cfqg = &cfqd->root_group; for_each_cfqg_st(cfqg, i, j, st) *st = CFQ_RB_ROOT; + RB_CLEAR_NODE(&cfqg->rb_node); /* * Not strictly needed (since RB_ROOT just clears the node and we -- cgit v1.2.2 From 31e4c28d95e64f2d5d3c497a3ecf37c62de635b4 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:42 -0500 Subject: blkio: Introduce blkio controller cgroup interface o This is basic implementation of blkio controller cgroup interface. This is the common interface visible to user space and should be used by different IO control policies as we implement those. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/Kconfig | 13 ++++ block/Kconfig.iosched | 1 + block/Makefile | 1 + block/blk-cgroup.c | 177 ++++++++++++++++++++++++++++++++++++++++++++++++++ block/blk-cgroup.h | 58 +++++++++++++++++ 5 files changed, 250 insertions(+) create mode 100644 block/blk-cgroup.c create mode 100644 block/blk-cgroup.h (limited to 'block') diff --git a/block/Kconfig b/block/Kconfig index 9be0b56eaee1..6ba1a8e3388b 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -77,6 +77,19 @@ config BLK_DEV_INTEGRITY T10/SCSI Data Integrity Field or the T13/ATA External Path Protection. If in doubt, say N. +config BLK_CGROUP + bool + depends on CGROUPS + default n + ---help--- + Generic block IO controller cgroup interface. This is the common + cgroup interface which should be used by various IO controlling + policies. 
+ + Currently, CFQ IO scheduler uses it to recognize task groups and + control disk bandwidth allocation (proportional time slice allocation) + to such task groups. + endif # BLOCK config BLOCK_COMPAT diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 8bd105115a69..be0280deec29 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -23,6 +23,7 @@ config IOSCHED_DEADLINE config IOSCHED_CFQ tristate "CFQ I/O scheduler" + select BLK_CGROUP default y ---help--- The CFQ I/O scheduler tries to distribute bandwidth equally diff --git a/block/Makefile b/block/Makefile index 7914108952f2..cb2d515ebd6e 100644 --- a/block/Makefile +++ b/block/Makefile @@ -8,6 +8,7 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o +obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c new file mode 100644 index 000000000000..4f6afd76ec59 --- /dev/null +++ b/block/blk-cgroup.c @@ -0,0 +1,177 @@ +/* + * Common Block IO controller cgroup interface + * + * Based on ideas and code from CFQ, CFS and BFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2009 Vivek Goyal + * Nauman Rafique + */ +#include +#include "blk-cgroup.h" + +struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT }; + +struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) +{ + return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), + struct blkio_cgroup, css); +} + +void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, + struct blkio_group *blkg, void *key) +{ + unsigned long flags; + + spin_lock_irqsave(&blkcg->lock, flags); + rcu_assign_pointer(blkg->key, key); + hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); + spin_unlock_irqrestore(&blkcg->lock, flags); +} + +int blkiocg_del_blkio_group(struct blkio_group *blkg) +{ + /* Implemented later */ + return 0; +} + +/* called under rcu_read_lock(). 
*/ +struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) +{ + struct blkio_group *blkg; + struct hlist_node *n; + void *__key; + + hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { + __key = blkg->key; + if (__key == key) + return blkg; + } + + return NULL; +} + +#define SHOW_FUNCTION(__VAR) \ +static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup, \ + struct cftype *cftype) \ +{ \ + struct blkio_cgroup *blkcg; \ + \ + blkcg = cgroup_to_blkio_cgroup(cgroup); \ + return (u64)blkcg->__VAR; \ +} + +SHOW_FUNCTION(weight); +#undef SHOW_FUNCTION + +static int +blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) +{ + struct blkio_cgroup *blkcg; + + if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) + return -EINVAL; + + blkcg = cgroup_to_blkio_cgroup(cgroup); + blkcg->weight = (unsigned int)val; + return 0; +} + +struct cftype blkio_files[] = { + { + .name = "weight", + .read_u64 = blkiocg_weight_read, + .write_u64 = blkiocg_weight_write, + }, +}; + +static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup) +{ + return cgroup_add_files(cgroup, subsys, blkio_files, + ARRAY_SIZE(blkio_files)); +} + +static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) +{ + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); + + free_css_id(&blkio_subsys, &blkcg->css); + kfree(blkcg); +} + +static struct cgroup_subsys_state * +blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup) +{ + struct blkio_cgroup *blkcg, *parent_blkcg; + + if (!cgroup->parent) { + blkcg = &blkio_root_cgroup; + goto done; + } + + /* Currently we do not support hierarchy deeper than two level (0,1) */ + parent_blkcg = cgroup_to_blkio_cgroup(cgroup->parent); + if (css_depth(&parent_blkcg->css) > 0) + return ERR_PTR(-EINVAL); + + blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); + if (!blkcg) + return ERR_PTR(-ENOMEM); + + blkcg->weight = BLKIO_WEIGHT_DEFAULT; +done: + spin_lock_init(&blkcg->lock); + INIT_HLIST_HEAD(&blkcg->blkg_list); + + return &blkcg->css; +} + +/* + * We cannot support shared io contexts, as we have no mean to support + * two tasks with the same ioc in two different groups without major rework + * of the main cic data structures. For now we allow a task to change + * its cgroup only if it's the only owner of its ioc. 
+ */ +static int blkiocg_can_attach(struct cgroup_subsys *subsys, + struct cgroup *cgroup, struct task_struct *tsk, + bool threadgroup) +{ + struct io_context *ioc; + int ret = 0; + + /* task_lock() is needed to avoid races with exit_io_context() */ + task_lock(tsk); + ioc = tsk->io_context; + if (ioc && atomic_read(&ioc->nr_tasks) > 1) + ret = -EINVAL; + task_unlock(tsk); + + return ret; +} + +static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup, + struct cgroup *prev, struct task_struct *tsk, + bool threadgroup) +{ + struct io_context *ioc; + + task_lock(tsk); + ioc = tsk->io_context; + if (ioc) + ioc->cgroup_changed = 1; + task_unlock(tsk); +} + +struct cgroup_subsys blkio_subsys = { + .name = "blkio", + .create = blkiocg_create, + .can_attach = blkiocg_can_attach, + .attach = blkiocg_attach, + .destroy = blkiocg_destroy, + .populate = blkiocg_populate, + .subsys_id = blkio_subsys_id, + .use_id = 1, +}; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h new file mode 100644 index 000000000000..ba5703f69b42 --- /dev/null +++ b/block/blk-cgroup.h @@ -0,0 +1,58 @@ +#ifndef _BLK_CGROUP_H +#define _BLK_CGROUP_H +/* + * Common Block IO controller cgroup interface + * + * Based on ideas and code from CFQ, CFS and BFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2009 Vivek Goyal + * Nauman Rafique + */ + +#include + +struct blkio_cgroup { + struct cgroup_subsys_state css; + unsigned int weight; + spinlock_t lock; + struct hlist_head blkg_list; +}; + +struct blkio_group { + /* An rcu protected unique identifier for the group */ + void *key; + struct hlist_node blkcg_node; +}; + +#define BLKIO_WEIGHT_MIN 100 +#define BLKIO_WEIGHT_MAX 1000 +#define BLKIO_WEIGHT_DEFAULT 500 + +#ifdef CONFIG_BLK_CGROUP +extern struct blkio_cgroup blkio_root_cgroup; +extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); +extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, + struct blkio_group *blkg, void *key); +extern int blkiocg_del_blkio_group(struct blkio_group *blkg); +extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, + void *key); +#else +static inline struct blkio_cgroup * +cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } + +static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, + struct blkio_group *blkg, void *key) +{ +} + +static inline int +blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } + +static inline struct blkio_group * +blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } +#endif +#endif /* _BLK_CGROUP_H */ -- cgit v1.2.2 From 25bc6b07767fe77422312eda2af99c9477f76191 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:43 -0500 Subject: blkio: Introduce per cfq group weights and vdisktime calculations o Bring in the per cfq group weight and how vdisktime is calculated for the group. Also bring in the functionality of updating the min_vdisktime of the group service tree. 
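For a rough feel of that scaling, here is a standalone userspace sketch of the cfq_scale_slice() arithmetic added below (illustrative only, not kernel code; CFQ_SERVICE_SHIFT = 12 and BLKIO_WEIGHT_DEFAULT = 500 are the values introduced in this patch):

#include <stdio.h>

#define CFQ_SERVICE_SHIFT    12
#define BLKIO_WEIGHT_DEFAULT 500

/* Model of cfq_scale_slice(): charge a used slice against a group's weight. */
static unsigned long long scale_slice(unsigned long delta, unsigned int weight)
{
	unsigned long long d = (unsigned long long)delta << CFQ_SERVICE_SHIFT;

	return d * BLKIO_WEIGHT_DEFAULT / weight;
}

int main(void)
{
	/* Charge the same 100-jiffy slice to a weight-500 and a weight-1000 group. */
	printf("weight  500: vdisktime += %llu\n", scale_slice(100, 500));
	printf("weight 1000: vdisktime += %llu\n", scale_slice(100, 1000));
	/*
	 * The heavier group's vdisktime advances half as fast; since the group
	 * service tree always dispatches the group with the smallest vdisktime,
	 * it gets scheduled roughly twice as often.
	 */
	return 0;
}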
Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/Kconfig.iosched | 9 +++++++- block/cfq-iosched.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 69 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index be0280deec29..fa95fa770570 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -23,7 +23,6 @@ config IOSCHED_DEADLINE config IOSCHED_CFQ tristate "CFQ I/O scheduler" - select BLK_CGROUP default y ---help--- The CFQ I/O scheduler tries to distribute bandwidth equally @@ -33,6 +32,14 @@ config IOSCHED_CFQ This is the default I/O scheduler. +config CFQ_GROUP_IOSCHED + bool "CFQ Group Scheduling support" + depends on IOSCHED_CFQ && CGROUPS + select BLK_CGROUP + default n + ---help--- + Enable group IO scheduling in CFQ. + choice prompt "Default I/O scheduler" default DEFAULT_CFQ diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index e1f822ac4690..019f28eea9df 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -13,6 +13,7 @@ #include #include #include +#include "blk-cgroup.h" /* * tunables @@ -49,6 +50,7 @@ static const int cfq_hist_divisor = 4; #define CFQ_SLICE_SCALE (5) #define CFQ_HW_QUEUE_MIN (5) +#define CFQ_SERVICE_SHIFT 12 #define RQ_CIC(rq) \ ((struct cfq_io_context *) (rq)->elevator_private) @@ -79,6 +81,7 @@ struct cfq_rb_root { struct rb_node *left; unsigned count; u64 min_vdisktime; + struct rb_node *active; }; #define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, 0, 0, } @@ -163,6 +166,7 @@ struct cfq_group { /* group service_tree key */ u64 vdisktime; + unsigned int weight; bool on_st; /* number of cfqq currently on this group */ @@ -434,6 +438,51 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); } +static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg) +{ + u64 d = delta << CFQ_SERVICE_SHIFT; + + d = d * BLKIO_WEIGHT_DEFAULT; + do_div(d, cfqg->weight); + return d; +} + +static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime) +{ + s64 delta = (s64)(vdisktime - min_vdisktime); + if (delta > 0) + min_vdisktime = vdisktime; + + return min_vdisktime; +} + +static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime) +{ + s64 delta = (s64)(vdisktime - min_vdisktime); + if (delta < 0) + min_vdisktime = vdisktime; + + return min_vdisktime; +} + +static void update_min_vdisktime(struct cfq_rb_root *st) +{ + u64 vdisktime = st->min_vdisktime; + struct cfq_group *cfqg; + + if (st->active) { + cfqg = rb_entry_cfqg(st->active); + vdisktime = cfqg->vdisktime; + } + + if (st->left) { + cfqg = rb_entry_cfqg(st->left); + vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime); + } + + st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime); +} + /* * get averaged number of queues of RT/BE priority. 
* average is updated, with a formula that gives more weight to higher numbers, @@ -734,8 +783,12 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) { struct cfq_rb_root *st = &cfqd->grp_service_tree; + if (st->active == &cfqg->rb_node) + st->active = NULL; + BUG_ON(cfqg->nr_cfqq < 1); cfqg->nr_cfqq--; + /* If there are other cfq queues under this group, don't delete it */ if (cfqg->nr_cfqq) return; @@ -1654,10 +1707,14 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) { struct cfq_rb_root *st = &cfqd->grp_service_tree; + struct cfq_group *cfqg; if (RB_EMPTY_ROOT(&st->rb)) return NULL; - return cfq_rb_first_group(st); + cfqg = cfq_rb_first_group(st); + st->active = &cfqg->rb_node; + update_min_vdisktime(st); + return cfqg; } static void cfq_choose_cfqg(struct cfq_data *cfqd) @@ -3150,6 +3207,9 @@ static void *cfq_init_queue(struct request_queue *q) *st = CFQ_RB_ROOT; RB_CLEAR_NODE(&cfqg->rb_node); + /* Give preference to root group over other groups */ + cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT; + /* * Not strictly needed (since RB_ROOT just clears the node and we * zeroed cfqd on alloc), but better be safe in case someone decides -- cgit v1.2.2 From 58ff82f34cded3812af5b6c69b6aa626b6be2490 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:44 -0500 Subject: blkio: Implement per cfq group latency target and busy queue avg o So far we had 300ms soft target latency system wide. Now with the introduction of cfq groups, divide that latency by number of groups so that one can come up with group target latency which will be helpful in determining the workload slice with-in group and also the dynamic slice length of the cfq queue. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 65 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 20 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 019f28eea9df..84887e2eb210 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -82,6 +82,7 @@ struct cfq_rb_root { unsigned count; u64 min_vdisktime; struct rb_node *active; + unsigned total_weight; }; #define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, 0, 0, } @@ -172,6 +173,8 @@ struct cfq_group { /* number of cfqq currently on this group */ int nr_cfqq; + /* Per group busy queus average. Useful for workload slice calc. */ + unsigned int busy_queues_avg[2]; /* * rr lists of queues with requests, onle rr for each priority class. 
* Counts are embedded in the cfq_rb_root @@ -188,6 +191,8 @@ struct cfq_data { /* Root service tree for cfq_groups */ struct cfq_rb_root grp_service_tree; struct cfq_group root_group; + /* Number of active cfq groups on group service tree */ + int nr_groups; /* * The priority currently being served @@ -206,7 +211,6 @@ struct cfq_data { struct rb_root prio_trees[CFQ_PRIO_LISTS]; unsigned int busy_queues; - unsigned int busy_queues_avg[2]; int rq_in_driver[2]; int sync_flight; @@ -354,10 +358,10 @@ static enum wl_type_t cfqq_type(struct cfq_queue *cfqq) return SYNC_WORKLOAD; } -static inline int cfq_busy_queues_wl(enum wl_prio_t wl, struct cfq_data *cfqd) +static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl, + struct cfq_data *cfqd, + struct cfq_group *cfqg) { - struct cfq_group *cfqg = &cfqd->root_group; - if (wl == IDLE_WORKLOAD) return cfqg->service_tree_idle.count; @@ -489,18 +493,27 @@ static void update_min_vdisktime(struct cfq_rb_root *st) * to quickly follows sudden increases and decrease slowly */ -static inline unsigned cfq_get_avg_queues(struct cfq_data *cfqd, bool rt) +static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd, + struct cfq_group *cfqg, bool rt) { unsigned min_q, max_q; unsigned mult = cfq_hist_divisor - 1; unsigned round = cfq_hist_divisor / 2; - unsigned busy = cfq_busy_queues_wl(rt, cfqd); + unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg); - min_q = min(cfqd->busy_queues_avg[rt], busy); - max_q = max(cfqd->busy_queues_avg[rt], busy); - cfqd->busy_queues_avg[rt] = (mult * max_q + min_q + round) / + min_q = min(cfqg->busy_queues_avg[rt], busy); + max_q = max(cfqg->busy_queues_avg[rt], busy); + cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) / cfq_hist_divisor; - return cfqd->busy_queues_avg[rt]; + return cfqg->busy_queues_avg[rt]; +} + +static inline unsigned +cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg) +{ + struct cfq_rb_root *st = &cfqd->grp_service_tree; + + return cfq_target_latency * cfqg->weight / st->total_weight; } static inline void @@ -508,12 +521,17 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) { unsigned slice = cfq_prio_to_slice(cfqd, cfqq); if (cfqd->cfq_latency) { - /* interested queues (we consider only the ones with the same - * priority class) */ - unsigned iq = cfq_get_avg_queues(cfqd, cfq_class_rt(cfqq)); + /* + * interested queues (we consider only the ones with the same + * priority class in the cfq group) + */ + unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg, + cfq_class_rt(cfqq)); unsigned sync_slice = cfqd->cfq_slice[1]; unsigned expect_latency = sync_slice * iq; - if (expect_latency > cfq_target_latency) { + unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg); + + if (expect_latency > group_slice) { unsigned base_low_slice = 2 * cfqd->cfq_slice_idle; /* scale low_slice according to IO priority * and sync vs async */ @@ -521,7 +539,7 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) min(slice, base_low_slice * slice / sync_slice); /* the adapted slice value is scaled to fit all iqs * into the target latency */ - slice = max(slice * cfq_target_latency / expect_latency, + slice = max(slice * group_slice / expect_latency, low_slice); } } @@ -776,6 +794,8 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) __cfq_group_service_tree_add(st, cfqg); cfqg->on_st = true; + cfqd->nr_groups++; + st->total_weight += cfqg->weight; } static void @@ -794,6 +814,8 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, 
struct cfq_group *cfqg) return; cfqg->on_st = false; + cfqd->nr_groups--; + st->total_weight -= cfqg->weight; if (!RB_EMPTY_NODE(&cfqg->rb_node)) cfq_rb_erase(&cfqg->rb_node, st); } @@ -1639,6 +1661,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) unsigned slice; unsigned count; struct cfq_rb_root *st; + unsigned group_slice; if (!cfqg) { cfqd->serving_prio = IDLE_WORKLOAD; @@ -1647,9 +1670,9 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) } /* Choose next priority. RT > BE > IDLE */ - if (cfq_busy_queues_wl(RT_WORKLOAD, cfqd)) + if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg)) cfqd->serving_prio = RT_WORKLOAD; - else if (cfq_busy_queues_wl(BE_WORKLOAD, cfqd)) + else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg)) cfqd->serving_prio = BE_WORKLOAD; else { cfqd->serving_prio = IDLE_WORKLOAD; @@ -1687,9 +1710,11 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) * proportional to the number of queues in that workload, over * all the queues in the same priority class */ - slice = cfq_target_latency * count / - max_t(unsigned, cfqd->busy_queues_avg[cfqd->serving_prio], - cfq_busy_queues_wl(cfqd->serving_prio, cfqd)); + group_slice = cfq_group_slice(cfqd, cfqg); + + slice = group_slice * count / + max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio], + cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg)); if (cfqd->serving_type == ASYNC_WORKLOAD) /* async workload slice is scaled down according to -- cgit v1.2.2 From dae739ebc4c590630039533a5bbd05865966094f Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:45 -0500 Subject: blkio: Group time used accounting and workload context save restore o This patch introduces the functionality to do the accounting of group time when a queue expires. This time used decides which is the group to go next. o Also introduce the functionlity to save and restore the workload type context with-in group. It might happen that once we expire the cfq queue and group, a different group will schedule in and we will lose the context of the workload type. Hence save and restore it upon queue expiry. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 84887e2eb210..55d2a21f7f06 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -115,6 +115,10 @@ struct cfq_queue { /* fifo list of requests in sort_list */ struct list_head fifo; + /* time when queue got scheduled in to dispatch first request. */ + unsigned long dispatch_start; + /* time when first request from queue completed and slice started. 
*/ + unsigned long slice_start; unsigned long slice_end; long slice_resid; unsigned int slice_dispatch; @@ -181,6 +185,10 @@ struct cfq_group { */ struct cfq_rb_root service_trees[2][3]; struct cfq_rb_root service_tree_idle; + + unsigned long saved_workload_slice; + enum wl_type_t saved_workload; + enum wl_prio_t saved_serving_prio; }; /* @@ -543,6 +551,7 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) low_slice); } } + cfqq->slice_start = jiffies; cfqq->slice_end = jiffies + slice; cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies); } @@ -818,6 +827,58 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) st->total_weight -= cfqg->weight; if (!RB_EMPTY_NODE(&cfqg->rb_node)) cfq_rb_erase(&cfqg->rb_node, st); + cfqg->saved_workload_slice = 0; +} + +static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) +{ + unsigned int slice_used, allocated_slice; + + /* + * Queue got expired before even a single request completed or + * got expired immediately after first request completion. + */ + if (!cfqq->slice_start || cfqq->slice_start == jiffies) { + /* + * Also charge the seek time incurred to the group, otherwise + * if there are mutiple queues in the group, each can dispatch + * a single request on seeky media and cause lots of seek time + * and group will never know it. + */ + slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start), + 1); + } else { + slice_used = jiffies - cfqq->slice_start; + allocated_slice = cfqq->slice_end - cfqq->slice_start; + if (slice_used > allocated_slice) + slice_used = allocated_slice; + } + + cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u", slice_used); + return slice_used; +} + +static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, + struct cfq_queue *cfqq) +{ + struct cfq_rb_root *st = &cfqd->grp_service_tree; + unsigned int used_sl; + + used_sl = cfq_cfqq_slice_usage(cfqq); + + /* Can't update vdisktime while group is on service tree */ + cfq_rb_erase(&cfqg->rb_node, st); + cfqg->vdisktime += cfq_scale_slice(used_sl, cfqg); + __cfq_group_service_tree_add(st, cfqg); + + /* This group is being expired. 
Save the context */ + if (time_after(cfqd->workload_expires, jiffies)) { + cfqg->saved_workload_slice = cfqd->workload_expires + - jiffies; + cfqg->saved_workload = cfqd->serving_type; + cfqg->saved_serving_prio = cfqd->serving_prio; + } else + cfqg->saved_workload_slice = 0; } /* @@ -833,6 +894,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, unsigned long rb_key; struct cfq_rb_root *service_tree; int left; + int new_cfqq = 1; service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), cfqq_type(cfqq), cfqd); @@ -861,6 +923,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, } if (!RB_EMPTY_NODE(&cfqq->rb_node)) { + new_cfqq = 0; /* * same position, nothing more to do */ @@ -902,6 +965,8 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, rb_link_node(&cfqq->rb_node, parent, p); rb_insert_color(&cfqq->rb_node, &service_tree->rb); service_tree->count++; + if (add_front || !new_cfqq) + return; cfq_group_service_tree_add(cfqd, cfqq->cfqg); } @@ -1218,6 +1283,8 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd, { if (cfqq) { cfq_log_cfqq(cfqd, cfqq, "set_active"); + cfqq->slice_start = 0; + cfqq->dispatch_start = jiffies; cfqq->slice_end = 0; cfqq->slice_dispatch = 0; @@ -1255,6 +1322,8 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); } + cfq_group_served(cfqd, cfqq->cfqg, cfqq); + if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) cfq_del_cfqq_rr(cfqd, cfqq); @@ -1263,6 +1332,9 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (cfqq == cfqd->active_queue) cfqd->active_queue = NULL; + if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active) + cfqd->grp_service_tree.active = NULL; + if (cfqd->active_cic) { put_io_context(cfqd->active_cic->ioc); cfqd->active_cic = NULL; @@ -1747,6 +1819,13 @@ static void cfq_choose_cfqg(struct cfq_data *cfqd) struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd); cfqd->serving_group = cfqg; + + /* Restore the workload type data */ + if (cfqg->saved_workload_slice) { + cfqd->workload_expires = jiffies + cfqg->saved_workload_slice; + cfqd->serving_type = cfqg->saved_workload; + cfqd->serving_prio = cfqg->saved_serving_prio; + } choose_service_tree(cfqd, cfqg); } -- cgit v1.2.2 From 25fb5169d4c9d4255107abbb7c08ab712434efc8 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:46 -0500 Subject: blkio: Dynamic cfq group creation based on cgroup tasks belongs to o Determine the cgroup IO submitting task belongs to and create the cfq group if it does not exist already. o Also link cfqq and associated cfq group. o Currently all async IO is mapped to root group. 
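For instance (an illustrative scenario, not part of the patch): two tasks issuing synchronous reads from different blkio cgroups each get a cfq_group created on their first request and have their sync cfqqs linked to it, while any async queues they use are still linked to the root group, per the last point above.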
Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 111 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 100 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 55d2a21f7f06..a877eeee80af 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -189,6 +189,10 @@ struct cfq_group { unsigned long saved_workload_slice; enum wl_type_t saved_workload; enum wl_prio_t saved_serving_prio; + struct blkio_group blkg; +#ifdef CONFIG_CFQ_GROUP_IOSCHED + struct hlist_node cfqd_node; +#endif }; /* @@ -274,8 +278,13 @@ struct cfq_data { struct cfq_queue oom_cfqq; unsigned long last_end_sync_rq; + + /* List of cfq groups being managed on this device*/ + struct hlist_head cfqg_list; }; +static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); + static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg, enum wl_prio_t prio, enum wl_type_t type, @@ -881,6 +890,89 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, cfqg->saved_workload_slice = 0; } +#ifdef CONFIG_CFQ_GROUP_IOSCHED +static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) +{ + if (blkg) + return container_of(blkg, struct cfq_group, blkg); + return NULL; +} + +static struct cfq_group * +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) +{ + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); + struct cfq_group *cfqg = NULL; + void *key = cfqd; + int i, j; + struct cfq_rb_root *st; + + /* Do we need to take this reference */ + if (!css_tryget(&blkcg->css)) + return NULL;; + + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); + if (cfqg || !create) + goto done; + + cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); + if (!cfqg) + goto done; + + cfqg->weight = blkcg->weight; + for_each_cfqg_st(cfqg, i, j, st) + *st = CFQ_RB_ROOT; + RB_CLEAR_NODE(&cfqg->rb_node); + + /* Add group onto cgroup list */ + blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd); + + /* Add group on cfqd list */ + hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); + +done: + css_put(&blkcg->css); + return cfqg; +} + +/* + * Search for the cfq group current task belongs to. If create = 1, then also + * create the cfq group if it does not exist. request_queue lock must be held. + */ +static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) +{ + struct cgroup *cgroup; + struct cfq_group *cfqg = NULL; + + rcu_read_lock(); + cgroup = task_cgroup(current, blkio_subsys_id); + cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create); + if (!cfqg && create) + cfqg = &cfqd->root_group; + rcu_read_unlock(); + return cfqg; +} + +static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) +{ + /* Currently, all async queues are mapped to root group */ + if (!cfq_cfqq_sync(cfqq)) + cfqg = &cfqq->cfqd->root_group; + + cfqq->cfqg = cfqg; +} +#else /* GROUP_IOSCHED */ +static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) +{ + return &cfqd->root_group; +} +static inline void +cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { + cfqq->cfqg = cfqg; +} + +#endif /* GROUP_IOSCHED */ + /* * The cfqd->service_trees holds all pending cfq_queue's that have * requests waiting to be processed. 
It is sorted in the order that @@ -1372,7 +1464,7 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd) { - struct cfq_group *cfqg = &cfqd->root_group; + struct cfq_group *cfqg; struct cfq_queue *cfqq; int i, j; struct cfq_rb_root *st; @@ -1380,6 +1472,10 @@ static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd) if (!cfqd->rq_queued) return NULL; + cfqg = cfq_get_next_cfqg(cfqd); + if (!cfqg) + return NULL; + for_each_cfqg_st(cfqg, i, j, st) if ((cfqq = cfq_rb_first(st)) != NULL) return cfqq; @@ -2390,16 +2486,6 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqq->pid = pid; } -static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) -{ - cfqq->cfqg = cfqg; -} - -static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) -{ - return &cfqd->root_group; -} - static struct cfq_queue * cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc, gfp_t gfp_mask) @@ -3314,6 +3400,9 @@ static void *cfq_init_queue(struct request_queue *q) /* Give preference to root group over other groups */ cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT; +#ifdef CONFIG_CFQ_GROUP_IOSCHED + blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd); +#endif /* * Not strictly needed (since RB_ROOT just clears the node and we * zeroed cfqd on alloc), but better be safe in case someone decides -- cgit v1.2.2 From b1c3576961847da26c91b1e97f226bb66be5fa3f Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:47 -0500 Subject: blkio: Take care of cgroup deletion and cfq group reference counting o One can choose to change elevator or delete a cgroup. Implement group reference counting so that both elevator exit and cgroup deletion can take place gracefully. Signed-off-by: Vivek Goyal Signed-off-by: Nauman Rafique Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 66 +++++++++++++++++++++++++++++++++++-- block/blk-cgroup.h | 1 + block/cfq-iosched.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 160 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 4f6afd76ec59..0426ab692fd5 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -13,6 +13,8 @@ #include #include "blk-cgroup.h" +extern void cfq_unlink_blkio_group(void *, struct blkio_group *); + struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT }; struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) @@ -28,14 +30,43 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, spin_lock_irqsave(&blkcg->lock, flags); rcu_assign_pointer(blkg->key, key); + blkg->blkcg_id = css_id(&blkcg->css); hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); spin_unlock_irqrestore(&blkcg->lock, flags); } +static void __blkiocg_del_blkio_group(struct blkio_group *blkg) +{ + hlist_del_init_rcu(&blkg->blkcg_node); + blkg->blkcg_id = 0; +} + +/* + * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1 + * indicating that blk_group was unhashed by the time we got to it. 
+ */ int blkiocg_del_blkio_group(struct blkio_group *blkg) { - /* Implemented later */ - return 0; + struct blkio_cgroup *blkcg; + unsigned long flags; + struct cgroup_subsys_state *css; + int ret = 1; + + rcu_read_lock(); + css = css_lookup(&blkio_subsys, blkg->blkcg_id); + if (!css) + goto out; + + blkcg = container_of(css, struct blkio_cgroup, css); + spin_lock_irqsave(&blkcg->lock, flags); + if (!hlist_unhashed(&blkg->blkcg_node)) { + __blkiocg_del_blkio_group(blkg); + ret = 0; + } + spin_unlock_irqrestore(&blkcg->lock, flags); +out: + rcu_read_unlock(); + return ret; } /* called under rcu_read_lock(). */ @@ -97,8 +128,39 @@ static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup) static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) { struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); + unsigned long flags; + struct blkio_group *blkg; + void *key; + rcu_read_lock(); +remove_entry: + spin_lock_irqsave(&blkcg->lock, flags); + + if (hlist_empty(&blkcg->blkg_list)) { + spin_unlock_irqrestore(&blkcg->lock, flags); + goto done; + } + + blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group, + blkcg_node); + key = rcu_dereference(blkg->key); + __blkiocg_del_blkio_group(blkg); + + spin_unlock_irqrestore(&blkcg->lock, flags); + + /* + * This blkio_group is being unlinked as associated cgroup is going + * away. Let all the IO controlling policies know about this event. + * + * Currently this is static call to one io controlling policy. Once + * we have more policies in place, we need some dynamic registration + * of callback function. + */ + cfq_unlink_blkio_group(key, blkg); + goto remove_entry; +done: free_css_id(&blkio_subsys, &blkcg->css); + rcu_read_unlock(); kfree(blkcg); } diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index ba5703f69b42..cd50a2f8733e 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -26,6 +26,7 @@ struct blkio_group { /* An rcu protected unique identifier for the group */ void *key; struct hlist_node blkcg_node; + unsigned short blkcg_id; }; #define BLKIO_WEIGHT_MIN 100 diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index a877eeee80af..8bc31a50a57f 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -192,6 +192,7 @@ struct cfq_group { struct blkio_group blkg; #ifdef CONFIG_CFQ_GROUP_IOSCHED struct hlist_node cfqd_node; + atomic_t ref; #endif }; @@ -924,6 +925,14 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) *st = CFQ_RB_ROOT; RB_CLEAR_NODE(&cfqg->rb_node); + /* + * Take the initial reference that will be released on destroy + * This can be thought of a joint reference by cgroup and + * elevator which will be dropped by either elevator exit + * or cgroup deletion path depending on who is exiting first. 
+ */ + atomic_set(&cfqg->ref, 1); + /* Add group onto cgroup list */ blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd); @@ -960,7 +969,77 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) cfqg = &cfqq->cfqd->root_group; cfqq->cfqg = cfqg; + /* cfqq reference on cfqg */ + atomic_inc(&cfqq->cfqg->ref); +} + +static void cfq_put_cfqg(struct cfq_group *cfqg) +{ + struct cfq_rb_root *st; + int i, j; + + BUG_ON(atomic_read(&cfqg->ref) <= 0); + if (!atomic_dec_and_test(&cfqg->ref)) + return; + for_each_cfqg_st(cfqg, i, j, st) + BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL); + kfree(cfqg); +} + +static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg) +{ + /* Something wrong if we are trying to remove same group twice */ + BUG_ON(hlist_unhashed(&cfqg->cfqd_node)); + + hlist_del_init(&cfqg->cfqd_node); + + /* + * Put the reference taken at the time of creation so that when all + * queues are gone, group can be destroyed. + */ + cfq_put_cfqg(cfqg); +} + +static void cfq_release_cfq_groups(struct cfq_data *cfqd) +{ + struct hlist_node *pos, *n; + struct cfq_group *cfqg; + + hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) { + /* + * If cgroup removal path got to blk_group first and removed + * it from cgroup list, then it will take care of destroying + * cfqg also. + */ + if (!blkiocg_del_blkio_group(&cfqg->blkg)) + cfq_destroy_cfqg(cfqd, cfqg); + } } + +/* + * Blk cgroup controller notification saying that blkio_group object is being + * delinked as associated cgroup object is going away. That also means that + * no new IO will come in this group. So get rid of this group as soon as + * any pending IO in the group is finished. + * + * This function is called under rcu_read_lock(). key is the rcu protected + * pointer. That means "key" is a valid cfq_data pointer as long as we are rcu + * read lock. + * + * "key" was fetched from blkio_group under blkio_cgroup->lock. That means + * it should not be NULL as even if elevator was exiting, cgroup deltion + * path got to it first. + */ +void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg) +{ + unsigned long flags; + struct cfq_data *cfqd = key; + + spin_lock_irqsave(cfqd->queue->queue_lock, flags); + cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg)); + spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); +} + #else /* GROUP_IOSCHED */ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) { @@ -971,6 +1050,9 @@ cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { cfqq->cfqg = cfqg; } +static void cfq_release_cfq_groups(struct cfq_data *cfqd) {} +static inline void cfq_put_cfqg(struct cfq_group *cfqg) {} + #endif /* GROUP_IOSCHED */ /* @@ -2172,11 +2254,13 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) * task holds one reference to the queue, dropped when task exits. each rq * in-flight on this queue also holds a reference, dropped when rq is freed. * + * Each cfq queue took a reference on the parent group. Drop it now. * queue lock must be held here. 
*/ static void cfq_put_queue(struct cfq_queue *cfqq) { struct cfq_data *cfqd = cfqq->cfqd; + struct cfq_group *cfqg; BUG_ON(atomic_read(&cfqq->ref) <= 0); @@ -2186,6 +2270,7 @@ static void cfq_put_queue(struct cfq_queue *cfqq) cfq_log_cfqq(cfqd, cfqq, "put_queue"); BUG_ON(rb_first(&cfqq->sort_list)); BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); + cfqg = cfqq->cfqg; if (unlikely(cfqd->active_queue == cfqq)) { __cfq_slice_expired(cfqd, cfqq, 0); @@ -2194,6 +2279,7 @@ static void cfq_put_queue(struct cfq_queue *cfqq) BUG_ON(cfq_cfqq_on_rr(cfqq)); kmem_cache_free(cfq_pool, cfqq); + cfq_put_cfqg(cfqg); } /* @@ -3369,11 +3455,15 @@ static void cfq_exit_queue(struct elevator_queue *e) } cfq_put_async_queues(cfqd); + cfq_release_cfq_groups(cfqd); + blkiocg_del_blkio_group(&cfqd->root_group.blkg); spin_unlock_irq(q->queue_lock); cfq_shutdown_timer_wq(cfqd); + /* Wait for cfqg->blkg->key accessors to exit their grace periods. */ + synchronize_rcu(); kfree(cfqd); } @@ -3401,6 +3491,11 @@ static void *cfq_init_queue(struct request_queue *q) cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT; #ifdef CONFIG_CFQ_GROUP_IOSCHED + /* + * Take a reference to root group which we never drop. This is just + * to make sure that cfq_put_cfqg() does not try to kfree root group + */ + atomic_set(&cfqg->ref, 1); blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd); #endif /* -- cgit v1.2.2 From 2868ef7b39490e6b41c2c61cd9a5cd891e778b54 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:48 -0500 Subject: blkio: Some debugging aids for CFQ o Some debugging aids for CFQ. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/Kconfig | 9 +++++++++ block/Kconfig.iosched | 9 +++++++++ block/blk-cgroup.c | 4 ++++ block/blk-cgroup.h | 13 +++++++++++++ block/cfq-iosched.c | 19 ++++++++++++++++++- 5 files changed, 53 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/Kconfig b/block/Kconfig index 6ba1a8e3388b..e20fbde0875c 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -90,6 +90,15 @@ config BLK_CGROUP control disk bandwidth allocation (proportional time slice allocation) to such task groups. +config DEBUG_BLK_CGROUP + bool + depends on BLK_CGROUP + default n + ---help--- + Enable some debugging help. Currently it stores the cgroup path + in the blk group which can be used by cfq for tracing various + group related activity. + endif # BLOCK config BLOCK_COMPAT diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index fa95fa770570..b71abfb0d726 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -40,6 +40,15 @@ config CFQ_GROUP_IOSCHED ---help--- Enable group IO scheduling in CFQ. +config DEBUG_CFQ_IOSCHED + bool "Debug CFQ Scheduling" + depends on CFQ_GROUP_IOSCHED + select DEBUG_BLK_CGROUP + default n + ---help--- + Enable CFQ IO scheduling debugging in CFQ. Currently it makes + blktrace output more verbose. + choice prompt "Default I/O scheduler" default DEFAULT_CFQ diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 0426ab692fd5..6bc99a3865b0 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -33,6 +33,10 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, blkg->blkcg_id = css_id(&blkcg->css); hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); spin_unlock_irqrestore(&blkcg->lock, flags); +#ifdef CONFIG_DEBUG_BLK_CGROUP + /* Need to take css reference ? 
*/ + cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); +#endif } static void __blkiocg_del_blkio_group(struct blkio_group *blkg) diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index cd50a2f8733e..3573199b298b 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -27,12 +27,25 @@ struct blkio_group { void *key; struct hlist_node blkcg_node; unsigned short blkcg_id; +#ifdef CONFIG_DEBUG_BLK_CGROUP + /* Store cgroup path */ + char path[128]; +#endif }; #define BLKIO_WEIGHT_MIN 100 #define BLKIO_WEIGHT_MAX 1000 #define BLKIO_WEIGHT_DEFAULT 500 +#ifdef CONFIG_DEBUG_BLK_CGROUP +static inline char *blkg_path(struct blkio_group *blkg) +{ + return blkg->path; +} +#else +static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } +#endif + #ifdef CONFIG_BLK_CGROUP extern struct blkio_cgroup blkio_root_cgroup; extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 8bc31a50a57f..662d4e55b3c2 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -341,8 +341,21 @@ CFQ_CFQQ_FNS(coop); CFQ_CFQQ_FNS(deep); #undef CFQ_CFQQ_FNS +#ifdef CONFIG_DEBUG_CFQ_IOSCHED +#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ + blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ + cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ + blkg_path(&(cfqq)->cfqg->blkg), ##args); + +#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \ + blk_add_trace_msg((cfqd)->queue, "%s " fmt, \ + blkg_path(&(cfqg)->blkg), ##args); \ + +#else #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) +#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0); +#endif #define cfq_log(cfqd, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) @@ -832,6 +845,7 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) if (cfqg->nr_cfqq) return; + cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); cfqg->on_st = false; cfqd->nr_groups--; st->total_weight -= cfqg->weight; @@ -889,6 +903,9 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, cfqg->saved_serving_prio = cfqd->serving_prio; } else cfqg->saved_workload_slice = 0; + + cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, + st->min_vdisktime); } #ifdef CONFIG_CFQ_GROUP_IOSCHED @@ -3102,7 +3119,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) unsigned long now; now = jiffies; - cfq_log_cfqq(cfqd, cfqq, "complete"); + cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", !!rq_noidle(rq)); cfq_update_hw_tag(cfqd); -- cgit v1.2.2 From 220841906fccafaf4094e87bdb6d252e20cf8c7c Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:49 -0500 Subject: blkio: Export disk time and sectors used by a group to user space o Export disk time and sector used by a group to user space through cgroup interface. o Also export a "dequeue" interface to cgroup which keeps track of how many a times a group was deleted from service tree. Helps in debugging. 
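With the seq_file format added below, each of these files reports one line per block device the group has done I/O on, as MAJOR:MINOR followed by the value. For example (hypothetical numbers), a group that has only run on device 8:0 might show "8:0 2340" in blkio.time, i.e. 2340 jiffies of disk time charged to that group on that device; blkio.sectors and the debug-only blkio.dequeue use the same layout.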
Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++- block/blk-cgroup.h | 22 ++++++++++++++++-- block/cfq-iosched.c | 19 +++++++++++++--- 3 files changed, 99 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 6bc99a3865b0..4ef78d35cbd2 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -11,6 +11,8 @@ * Nauman Rafique */ #include +#include +#include #include "blk-cgroup.h" extern void cfq_unlink_blkio_group(void *, struct blkio_group *); @@ -23,8 +25,15 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) struct blkio_cgroup, css); } +void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, + unsigned long time, unsigned long sectors) +{ + blkg->time += time; + blkg->sectors += sectors; +} + void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key) + struct blkio_group *blkg, void *key, dev_t dev) { unsigned long flags; @@ -37,6 +46,7 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, /* Need to take css reference ? */ cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); #endif + blkg->dev = dev; } static void __blkiocg_del_blkio_group(struct blkio_group *blkg) @@ -115,12 +125,64 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) return 0; } +#define SHOW_FUNCTION_PER_GROUP(__VAR) \ +static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \ + struct cftype *cftype, struct seq_file *m) \ +{ \ + struct blkio_cgroup *blkcg; \ + struct blkio_group *blkg; \ + struct hlist_node *n; \ + \ + if (!cgroup_lock_live_group(cgroup)) \ + return -ENODEV; \ + \ + blkcg = cgroup_to_blkio_cgroup(cgroup); \ + rcu_read_lock(); \ + hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\ + if (blkg->dev) \ + seq_printf(m, "%u:%u %lu\n", MAJOR(blkg->dev), \ + MINOR(blkg->dev), blkg->__VAR); \ + } \ + rcu_read_unlock(); \ + cgroup_unlock(); \ + return 0; \ +} + +SHOW_FUNCTION_PER_GROUP(time); +SHOW_FUNCTION_PER_GROUP(sectors); +#ifdef CONFIG_DEBUG_BLK_CGROUP +SHOW_FUNCTION_PER_GROUP(dequeue); +#endif +#undef SHOW_FUNCTION_PER_GROUP + +#ifdef CONFIG_DEBUG_BLK_CGROUP +void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg, + unsigned long dequeue) +{ + blkg->dequeue += dequeue; +} +#endif + struct cftype blkio_files[] = { { .name = "weight", .read_u64 = blkiocg_weight_read, .write_u64 = blkiocg_weight_write, }, + { + .name = "time", + .read_seq_string = blkiocg_time_read, + }, + { + .name = "sectors", + .read_seq_string = blkiocg_sectors_read, + }, +#ifdef CONFIG_DEBUG_BLK_CGROUP + { + .name = "dequeue", + .read_seq_string = blkiocg_dequeue_read, + }, +#endif }; static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup) diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 3573199b298b..b24ab71db826 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -30,7 +30,15 @@ struct blkio_group { #ifdef CONFIG_DEBUG_BLK_CGROUP /* Store cgroup path */ char path[128]; + /* How many times this group has been removed from service tree */ + unsigned long dequeue; #endif + /* The device MKDEV(major, minor), this group has been created for */ + dev_t dev; + + /* total disk time and nr sectors dispatched by this group */ + unsigned long time; + unsigned long sectors; }; #define BLKIO_WEIGHT_MIN 100 @@ -42,24 +50,30 @@ static inline char *blkg_path(struct blkio_group *blkg) { return blkg->path; } +void 
blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg, + unsigned long dequeue); #else static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } +static inline void blkiocg_update_blkio_group_dequeue_stats( + struct blkio_group *blkg, unsigned long dequeue) {} #endif #ifdef CONFIG_BLK_CGROUP extern struct blkio_cgroup blkio_root_cgroup; extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key); + struct blkio_group *blkg, void *key, dev_t dev); extern int blkiocg_del_blkio_group(struct blkio_group *blkg); extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key); +void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, + unsigned long time, unsigned long sectors); #else static inline struct blkio_cgroup * cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key) + struct blkio_group *blkg, void *key, dev_t dev) { } @@ -68,5 +82,9 @@ blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } static inline struct blkio_group * blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } +static inline void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, + unsigned long time, unsigned long sectors) +{ +} #endif #endif /* _BLK_CGROUP_H */ diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 662d4e55b3c2..7d345e772d88 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -143,6 +143,8 @@ struct cfq_queue { struct cfq_rb_root *service_tree; struct cfq_queue *new_cfqq; struct cfq_group *cfqg; + /* Sectors dispatched in current dispatch round */ + unsigned long nr_sectors; }; /* @@ -852,6 +854,7 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) if (!RB_EMPTY_NODE(&cfqg->rb_node)) cfq_rb_erase(&cfqg->rb_node, st); cfqg->saved_workload_slice = 0; + blkiocg_update_blkio_group_dequeue_stats(&cfqg->blkg, 1); } static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) @@ -878,7 +881,8 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) slice_used = allocated_slice; } - cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u", slice_used); + cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u sect=%lu", slice_used, + cfqq->nr_sectors); return slice_used; } @@ -906,6 +910,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, st->min_vdisktime); + blkiocg_update_blkio_group_stats(&cfqg->blkg, used_sl, + cfqq->nr_sectors); } #ifdef CONFIG_CFQ_GROUP_IOSCHED @@ -924,6 +930,8 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) void *key = cfqd; int i, j; struct cfq_rb_root *st; + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; + unsigned int major, minor; /* Do we need to take this reference */ if (!css_tryget(&blkcg->css)) @@ -951,7 +959,9 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) atomic_set(&cfqg->ref, 1); /* Add group onto cgroup list */ - blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd); + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, + MKDEV(major, minor)); /* Add group on cfqd list */ hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); @@ -1478,6 +1488,7 @@ static void 
__cfq_set_active_queue(struct cfq_data *cfqd, cfqq->dispatch_start = jiffies; cfqq->slice_end = 0; cfqq->slice_dispatch = 0; + cfqq->nr_sectors = 0; cfq_clear_cfqq_wait_request(cfqq); cfq_clear_cfqq_must_dispatch(cfqq); @@ -1801,6 +1812,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) if (cfq_cfqq_sync(cfqq)) cfqd->sync_flight++; + cfqq->nr_sectors += blk_rq_sectors(rq); } /* @@ -3513,7 +3525,8 @@ static void *cfq_init_queue(struct request_queue *q) * to make sure that cfq_put_cfqg() does not try to kfree root group */ atomic_set(&cfqg->ref, 1); - blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd); + blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd, + 0); #endif /* * Not strictly needed (since RB_ROOT just clears the node and we -- cgit v1.2.2 From 8682e1f15f26dae9a9e8af794d179055fbd81166 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:50 -0500 Subject: blkio: Provide some isolation between groups o Do not allow following three operations across groups for isolation. - selection of co-operating queues - preemtpions across groups - request merging across groups. o Async queues are currently global and not per group. Allow preemption of an async queue if a sync queue in other group gets backlogged. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 7d345e772d88..3a62ce95daec 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1461,6 +1461,9 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq, struct cfq_io_context *cic; struct cfq_queue *cfqq; + /* Deny merge if bio and rq don't belong to same cfq group */ + if ((RQ_CFQQ(rq))->cfqg != cfq_get_cfqg(cfqd, 0)) + return false; /* * Disallow merge of a sync bio into an async request. */ @@ -1698,6 +1701,10 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, if (!cfqq) return NULL; + /* If new queue belongs to different cfq_group, don't choose it */ + if (cur_cfqq->cfqg != cfqq->cfqg) + return NULL; + /* * It only makes sense to merge sync queues. */ @@ -2950,22 +2957,12 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, if (!cfqq) return false; - if (cfq_slice_used(cfqq)) - return true; - if (cfq_class_idle(new_cfqq)) return false; if (cfq_class_idle(cfqq)) return true; - /* Allow preemption only if we are idling on sync-noidle tree */ - if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD && - cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD && - new_cfqq->service_tree->count == 2 && - RB_EMPTY_ROOT(&cfqq->sort_list)) - return true; - /* * if the new request is sync, but the currently running queue is * not, let the sync request have priority. @@ -2973,6 +2970,19 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq)) return true; + if (new_cfqq->cfqg != cfqq->cfqg) + return false; + + if (cfq_slice_used(cfqq)) + return true; + + /* Allow preemption only if we are idling on sync-noidle tree */ + if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD && + cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD && + new_cfqq->service_tree->count == 2 && + RB_EMPTY_ROOT(&cfqq->sort_list)) + return true; + /* * So both queues are sync. Let the new request get disk time if * it's a metadata request and the current queue is doing regular IO. 
-- cgit v1.2.2 From 24610333d578478d354144ab4709a203684afc5f Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:51 -0500 Subject: blkio: Drop the reference to queue once the task changes cgroup o If a task changes cgroup, drop reference to the cfqq associated with io context and set cfqq pointer stored in ioc to NULL so that upon next request arrival we will allocate a new queue in new group. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 3a62ce95daec..3d99e45789bd 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2608,6 +2608,41 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqq->pid = pid; } +#ifdef CONFIG_CFQ_GROUP_IOSCHED +static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic) +{ + struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1); + struct cfq_data *cfqd = cic->key; + unsigned long flags; + struct request_queue *q; + + if (unlikely(!cfqd)) + return; + + q = cfqd->queue; + + spin_lock_irqsave(q->queue_lock, flags); + + if (sync_cfqq) { + /* + * Drop reference to sync queue. A new sync queue will be + * assigned in new group upon arrival of a fresh request. + */ + cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup"); + cic_set_cfqq(cic, NULL, 1); + cfq_put_queue(sync_cfqq); + } + + spin_unlock_irqrestore(q->queue_lock, flags); +} + +static void cfq_ioc_set_cgroup(struct io_context *ioc) +{ + call_for_each_cic(ioc, changed_cgroup); + ioc->cgroup_changed = 0; +} +#endif /* CONFIG_CFQ_GROUP_IOSCHED */ + static struct cfq_queue * cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc, gfp_t gfp_mask) @@ -2840,6 +2875,10 @@ out: if (unlikely(ioc->ioprio_changed)) cfq_ioc_set_ioprio(ioc); +#ifdef CONFIG_CFQ_GROUP_IOSCHED + if (unlikely(ioc->cgroup_changed)) + cfq_ioc_set_cgroup(ioc); +#endif return cic; err_free: cfq_cic_free(cic); -- cgit v1.2.2 From f8d461d692c341add957fb973fb5ee1f62039dc7 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:52 -0500 Subject: blkio: Propagate cgroup weight updation to cfq groups o Propagate blkio cgroup weight updation to associated cfq groups. 
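For example (hypothetical values), lowering an existing cgroup's blkio.weight from 1000 to 200 now walks that cgroup's blkg_list under blkcg->lock and updates the weight of every cfq_group already attached to it, instead of only affecting groups created after the write.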
Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 7 +++++++ block/cfq-iosched.c | 6 ++++++ 2 files changed, 13 insertions(+) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 4ef78d35cbd2..179ddfaebc5c 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -16,6 +16,7 @@ #include "blk-cgroup.h" extern void cfq_unlink_blkio_group(void *, struct blkio_group *); +extern void cfq_update_blkio_group_weight(struct blkio_group *, unsigned int); struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT }; @@ -116,12 +117,18 @@ static int blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) { struct blkio_cgroup *blkcg; + struct blkio_group *blkg; + struct hlist_node *n; if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) return -EINVAL; blkcg = cgroup_to_blkio_cgroup(cgroup); + spin_lock_irq(&blkcg->lock); blkcg->weight = (unsigned int)val; + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) + cfq_update_blkio_group_weight(blkg, blkcg->weight); + spin_unlock_irq(&blkcg->lock); return 0; } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 3d99e45789bd..f7364621613a 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -922,6 +922,12 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) return NULL; } +void +cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight) +{ + cfqg_of_blkg(blkg)->weight = weight; +} + static struct cfq_group * cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) { -- cgit v1.2.2 From f75edf2dc828802d358393be80a6c89e919f8273 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:53 -0500 Subject: blkio: Wait for cfq queue to get backlogged if group is empty o If a queue consumes its slice and then gets deleted from service tree, its associated group will also get deleted from service tree if this was the only queue in the group. That will make group loose its share. o For the queues on which we have idling on and if these have used their slice, wait a bit for these queues to get backlogged again and then expire these queues so that group does not loose its share. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index f7364621613a..1cc10489eaf0 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -117,6 +117,7 @@ struct cfq_queue { /* time when queue got scheduled in to dispatch first request. */ unsigned long dispatch_start; + unsigned int allocated_slice; /* time when first request from queue completed and slice started. */ unsigned long slice_start; unsigned long slice_end; @@ -314,6 +315,8 @@ enum cfqq_state_flags { CFQ_CFQQ_FLAG_sync, /* synchronous queue */ CFQ_CFQQ_FLAG_coop, /* cfqq is shared */ CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */ + CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */ + CFQ_CFQQ_FLAG_wait_busy_done, /* Got new request. 
Expire the queue */ }; #define CFQ_CFQQ_FNS(name) \ @@ -341,6 +344,8 @@ CFQ_CFQQ_FNS(slice_new); CFQ_CFQQ_FNS(sync); CFQ_CFQQ_FNS(coop); CFQ_CFQQ_FNS(deep); +CFQ_CFQQ_FNS(wait_busy); +CFQ_CFQQ_FNS(wait_busy_done); #undef CFQ_CFQQ_FNS #ifdef CONFIG_DEBUG_CFQ_IOSCHED @@ -578,6 +583,7 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) } cfqq->slice_start = jiffies; cfqq->slice_end = jiffies + slice; + cfqq->allocated_slice = slice; cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies); } @@ -859,7 +865,7 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) { - unsigned int slice_used, allocated_slice; + unsigned int slice_used; /* * Queue got expired before even a single request completed or @@ -876,9 +882,8 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) 1); } else { slice_used = jiffies - cfqq->slice_start; - allocated_slice = cfqq->slice_end - cfqq->slice_start; - if (slice_used > allocated_slice) - slice_used = allocated_slice; + if (slice_used > cfqq->allocated_slice) + slice_used = cfqq->allocated_slice; } cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u sect=%lu", slice_used, @@ -1495,6 +1500,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd, cfq_log_cfqq(cfqd, cfqq, "set_active"); cfqq->slice_start = 0; cfqq->dispatch_start = jiffies; + cfqq->allocated_slice = 0; cfqq->slice_end = 0; cfqq->slice_dispatch = 0; cfqq->nr_sectors = 0; @@ -1524,6 +1530,8 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, del_timer(&cfqd->idle_slice_timer); cfq_clear_cfqq_wait_request(cfqq); + cfq_clear_cfqq_wait_busy(cfqq); + cfq_clear_cfqq_wait_busy_done(cfqq); /* * store what was left of this slice, if the queue idled/timed out @@ -2066,7 +2074,8 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) /* * The active queue has run out of time, expire it and select new. */ - if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) + if ((cfq_slice_used(cfqq) || cfq_cfqq_wait_busy_done(cfqq)) + && !cfq_cfqq_must_dispatch(cfqq)) goto expire; /* @@ -3096,6 +3105,10 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); if (cfqq == cfqd->active_queue) { + if (cfq_cfqq_wait_busy(cfqq)) { + cfq_clear_cfqq_wait_busy(cfqq); + cfq_mark_cfqq_wait_busy_done(cfqq); + } /* * Remember that we saw a request from this process, but * don't start queuing just yet. Otherwise we risk seeing lots @@ -3214,6 +3227,17 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfq_set_prio_slice(cfqd, cfqq); cfq_clear_cfqq_slice_new(cfqq); } + + /* + * If this queue consumed its slice and this is last queue + * in the group, wait for next request before we expire + * the queue + */ + if (cfq_slice_used(cfqq) && cfqq->cfqg->nr_cfqq == 1) { + cfqq->slice_end = jiffies + cfqd->cfq_slice_idle; + cfq_mark_cfqq_wait_busy(cfqq); + } + /* * Idling is not enabled on: * - expired queues -- cgit v1.2.2 From f26bd1f0a3a31bc5e16d285f5e1b00a56abf6238 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:54 -0500 Subject: blkio: Determine async workload length based on total number of queues o Async queues are not per group. Instead these are system wide and maintained in root group. Hence their workload slice length should be calculated based on total number of queues in the system and not just queues in the root group. 
o As root group's default weight is 1000, make sure to charge async queue more in terms of vtime so that it does not get more time on disk because root group has higher weight. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 1cc10489eaf0..b9e483d9031e 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -408,6 +408,13 @@ static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl, + cfqg->service_trees[wl][SYNC_WORKLOAD].count; } +static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, + struct cfq_group *cfqg) +{ + return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count + + cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count; +} + static void cfq_dispatch_insert(struct request_queue *, struct request *); static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool, struct io_context *, gfp_t); @@ -895,13 +902,19 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, struct cfq_queue *cfqq) { struct cfq_rb_root *st = &cfqd->grp_service_tree; - unsigned int used_sl; + unsigned int used_sl, charge_sl; + int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) + - cfqg->service_tree_idle.count; + + BUG_ON(nr_sync < 0); + used_sl = charge_sl = cfq_cfqq_slice_usage(cfqq); - used_sl = cfq_cfqq_slice_usage(cfqq); + if (!cfq_cfqq_sync(cfqq) && !nr_sync) + charge_sl = cfqq->allocated_slice; /* Can't update vdisktime while group is on service tree */ cfq_rb_erase(&cfqg->rb_node, st); - cfqg->vdisktime += cfq_scale_slice(used_sl, cfqg); + cfqg->vdisktime += cfq_scale_slice(charge_sl, cfqg); __cfq_group_service_tree_add(st, cfqg); /* This group is being expired. Save the context */ @@ -2016,11 +2029,24 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio], cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg)); - if (cfqd->serving_type == ASYNC_WORKLOAD) + if (cfqd->serving_type == ASYNC_WORKLOAD) { + unsigned int tmp; + + /* + * Async queues are currently system wide. Just taking + * proportion of queues with-in same group will lead to higher + * async ratio system wide as generally root group is going + * to have higher weight. A more accurate thing would be to + * calculate system wide asnc/sync ratio. + */ + tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg); + tmp = tmp/cfqd->busy_queues; + slice = min_t(unsigned, slice, tmp); + /* async workload slice is scaled down according to * the sync/async slice ratio. */ slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1]; - else + } else /* sync workload slice is at least 2 * cfq_slice_idle */ slice = max(slice, 2 * cfqd->cfq_slice_idle); -- cgit v1.2.2 From ae30c286553c91c49af5cbc0265a05a6543d0c52 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:55 -0500 Subject: blkio: Implement group_isolation tunable o If a group is running only a random reader, then it will not have enough traffic to keep disk busy and we will reduce overall throughput. This should result in better latencies for random reader though. If we don't idle on random reader service tree, then this random reader will experience large latencies if there are other groups present in system with sequential readers running in these. 
o One solution suggested by corrado is that by default keep the random readers or sync-noidle workload in root group so that during one dispatch round we idle only once on sync-noidle tree. This means that all the sync-idle workload queues will be in their respective group and we will see service differentiation in those but not on sync-noidle workload. o Provide a tunable group_isolation. If set, this will make sure that even sync-noidle queues go in their respective group and we wait on these. This provides stronger isolation between groups but at the expense of throughput if group does not have enough traffic to keep the disk busy. o By default group_isolation = 0 Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index b9e483d9031e..063dcbb714e7 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -144,6 +144,7 @@ struct cfq_queue { struct cfq_rb_root *service_tree; struct cfq_queue *new_cfqq; struct cfq_group *cfqg; + struct cfq_group *orig_cfqg; /* Sectors dispatched in current dispatch round */ unsigned long nr_sectors; }; @@ -273,6 +274,7 @@ struct cfq_data { unsigned int cfq_slice_async_rq; unsigned int cfq_slice_idle; unsigned int cfq_latency; + unsigned int cfq_group_isolation; struct list_head cic_list; @@ -1120,6 +1122,33 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct cfq_rb_root *service_tree; int left; int new_cfqq = 1; + int group_changed = 0; + +#ifdef CONFIG_CFQ_GROUP_IOSCHED + if (!cfqd->cfq_group_isolation + && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD + && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) { + /* Move this cfq to root group */ + cfq_log_cfqq(cfqd, cfqq, "moving to root group"); + if (!RB_EMPTY_NODE(&cfqq->rb_node)) + cfq_group_service_tree_del(cfqd, cfqq->cfqg); + cfqq->orig_cfqg = cfqq->cfqg; + cfqq->cfqg = &cfqd->root_group; + atomic_inc(&cfqd->root_group.ref); + group_changed = 1; + } else if (!cfqd->cfq_group_isolation + && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) { + /* cfqq is sequential now needs to go to its original group */ + BUG_ON(cfqq->cfqg != &cfqd->root_group); + if (!RB_EMPTY_NODE(&cfqq->rb_node)) + cfq_group_service_tree_del(cfqd, cfqq->cfqg); + cfq_put_cfqg(cfqq->cfqg); + cfqq->cfqg = cfqq->orig_cfqg; + cfqq->orig_cfqg = NULL; + group_changed = 1; + cfq_log_cfqq(cfqd, cfqq, "moved to origin group"); + } +#endif service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), cfqq_type(cfqq), cfqd); @@ -1190,7 +1219,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, rb_link_node(&cfqq->rb_node, parent, p); rb_insert_color(&cfqq->rb_node, &service_tree->rb); service_tree->count++; - if (add_front || !new_cfqq) + if ((add_front || !new_cfqq) && !group_changed) return; cfq_group_service_tree_add(cfqd, cfqq->cfqg); } @@ -2357,6 +2386,8 @@ static void cfq_put_queue(struct cfq_queue *cfqq) BUG_ON(cfq_cfqq_on_rr(cfqq)); kmem_cache_free(cfq_pool, cfqq); cfq_put_cfqg(cfqg); + if (cfqq->orig_cfqg) + cfq_put_cfqg(cfqq->orig_cfqg); } /* @@ -3670,6 +3701,7 @@ static void *cfq_init_queue(struct request_queue *q) cfqd->cfq_slice_async_rq = cfq_slice_async_rq; cfqd->cfq_slice_idle = cfq_slice_idle; cfqd->cfq_latency = 1; + cfqd->cfq_group_isolation = 0; cfqd->hw_tag = -1; cfqd->last_end_sync_rq = jiffies; return cfqd; @@ -3740,6 +3772,7 @@ SHOW_FUNCTION(cfq_slice_sync_show, 
cfqd->cfq_slice[1], 1); SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); +SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0); #undef SHOW_FUNCTION #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ @@ -3772,6 +3805,7 @@ STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0); STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); +STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0); #undef STORE_FUNCTION #define CFQ_ATTR(name) \ @@ -3788,6 +3822,7 @@ static struct elv_fs_entry cfq_attrs[] = { CFQ_ATTR(slice_async_rq), CFQ_ATTR(slice_idle), CFQ_ATTR(low_latency), + CFQ_ATTR(group_isolation), __ATTR_NULL }; -- cgit v1.2.2 From c04645e592d4dd60c58def40c913699d4c806727 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:56 -0500 Subject: blkio: Wait on sync-noidle queue even if rq_noidle = 1 o rq_noidle() is supposed to tell cfq that do not expect a request after this one, hence don't idle. But this does not seem to work very well. For example for direct random readers, rq_noidle = 1 but there is next request coming after this. Not idling, leads to a group not getting its share even if group_isolation=1. o The right solution for this issue is to scan the higher layers and set right flag (WRITE_SYNC or WRITE_ODIRECT). For the time being, this single line fix helps. This should not have any significant impact when we are not using cgroups. I will later figure out IO paths in higher layer and fix it. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 063dcbb714e7..08b057b1b3b2 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3314,7 +3314,8 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) * only if we processed at least one !rq_noidle request */ if (cfqd->serving_type == SYNC_WORKLOAD - || cfqd->noidle_tree_requires_idle) + || cfqd->noidle_tree_requires_idle + || cfqq->cfqg->nr_cfqq == 1) cfq_arm_slice_timer(cfqd); } } -- cgit v1.2.2 From 2f5ea47712489a9d2d3cb832eb06062e4e64e0ec Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 3 Dec 2009 21:06:43 +0100 Subject: cfq-iosched: fix compile problem with !CONFIG_CGROUP Signed-off-by: Jens Axboe --- block/blk-cgroup.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'block') diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index b24ab71db826..257dc6a956b6 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -15,6 +15,8 @@ #include +#ifdef CONFIG_BLK_CGROUP + struct blkio_cgroup { struct cgroup_subsys_state css; unsigned int weight; @@ -41,6 +43,13 @@ struct blkio_group { unsigned long sectors; }; +#else + +struct blkio_group { +}; + +#endif + #define BLKIO_WEIGHT_MIN 100 #define BLKIO_WEIGHT_MAX 1000 #define BLKIO_WEIGHT_DEFAULT 500 @@ -69,6 +78,7 @@ extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, unsigned long time, unsigned long sectors); #else +struct cgroup; static inline struct blkio_cgroup * cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } -- cgit v1.2.2 From 
f2eecb91522686edf8199947b77f435a4031d92f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 4 Dec 2009 10:06:35 +0100 Subject: cfq-iosched: move IO controller declerations to a header file They should not be declared inside some other file that's not related to CFQ. Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 4 +--- block/cfq-iosched.c | 1 + block/cfq-iosched.h | 7 +++++++ 3 files changed, 9 insertions(+), 3 deletions(-) create mode 100644 block/cfq-iosched.h (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 179ddfaebc5c..73a5525cc0a2 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -14,9 +14,7 @@ #include #include #include "blk-cgroup.h" - -extern void cfq_unlink_blkio_group(void *, struct blkio_group *); -extern void cfq_update_blkio_group_weight(struct blkio_group *, unsigned int); +#include "cfq-iosched.h" struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT }; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 08b057b1b3b2..43ec3340cb58 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -14,6 +14,7 @@ #include #include #include "blk-cgroup.h" +#include "cfq-iosched.h" /* * tunables diff --git a/block/cfq-iosched.h b/block/cfq-iosched.h new file mode 100644 index 000000000000..ef7b4798a349 --- /dev/null +++ b/block/cfq-iosched.h @@ -0,0 +1,7 @@ +#ifndef CFQ_IOSCHED_H +#define CFQ_IOSCHED_H + +void cfq_unlink_blkio_group(void *, struct blkio_group *); +void cfq_update_blkio_group_weight(struct blkio_group *, unsigned int); + +#endif -- cgit v1.2.2 From 237e5bc4e51813e9d8ba9da0f63e7acc608882d7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 4 Dec 2009 10:07:38 +0100 Subject: io controller: quick fix for blk-cgroup and modular CFQ It's currently not an allowed configuration, so express that in Kconfig. Signed-off-by: Jens Axboe --- block/Kconfig.iosched | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index b71abfb0d726..5368d74207e8 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -34,7 +34,7 @@ config IOSCHED_CFQ config CFQ_GROUP_IOSCHED bool "CFQ Group Scheduling support" - depends on IOSCHED_CFQ && CGROUPS + depends on IOSCHED_CFQ=y && CGROUPS select BLK_CGROUP default n ---help--- -- cgit v1.2.2 From 3c764b7a654668dd04905841d6024f7b6aa843a5 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 4 Dec 2009 13:12:06 +0100 Subject: cfq-iosched: make nonrot check logic consistent cfq_arm_slice_timer() has logic to disable idle window for SSD device. The same thing should be done at cfq_select_queue() too, otherwise we will still see idle window. This makes the nonrot check logic consistent in cfq. Tests in a intel SSD with low_latency knob close, below patch can triple disk thoughput for muti-thread sequential read. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 43ec3340cb58..b00ca4c86e25 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1796,7 +1796,8 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) return false; /* We do for queues that were marked with idle window flag. 
*/ - if (cfq_cfqq_idle_window(cfqq)) + if (cfq_cfqq_idle_window(cfqq) && + !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)) return true; /* -- cgit v1.2.2 From af901ca181d92aac3a7dc265144a9081a86d8f39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Goddard=20Rosa?= Date: Sat, 14 Nov 2009 13:09:05 -0200 Subject: tree-wide: fix assorted typos all over the place MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit That is "success", "unknown", "through", "performance", "[re|un]mapping" , "access", "default", "reasonable", "[con]currently", "temperature" , "channel", "[un]used", "application", "example","hierarchy", "therefore" , "[over|under]flow", "contiguous", "threshold", "enough" and others. Signed-off-by: André Goddard Rosa Signed-off-by: Jiri Kosina --- block/blk-iopoll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c index ca564202ed7a..58916afbbda5 100644 --- a/block/blk-iopoll.c +++ b/block/blk-iopoll.c @@ -28,7 +28,7 @@ static DEFINE_PER_CPU(struct list_head, blk_cpu_iopoll); * Description: * Add this blk_iopoll structure to the pending poll list and trigger the * raise of the blk iopoll softirq. The driver must already have gotten a - * succesful return from blk_iopoll_sched_prep() before calling this. + * successful return from blk_iopoll_sched_prep() before calling this. **/ void blk_iopoll_sched(struct blk_iopoll *iop) { -- cgit v1.2.2 From 61cc74fbb87af6aa551a06a370590c9bc07e29d9 Mon Sep 17 00:00:00 2001 From: Louis Rilling Date: Fri, 4 Dec 2009 14:52:41 +0100 Subject: block: Fix io_context leak after clone with CLONE_IO With CLONE_IO, copy_io() increments both ioc->refcount and ioc->nr_tasks. However exit_io_context() only decrements ioc->refcount if ioc->nr_tasks reaches 0. Always call put_io_context() in exit_io_context(). Signed-off-by: Louis Rilling Signed-off-by: Jens Axboe --- block/blk-ioc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index d4ed6000147d..dcd041290b28 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -80,8 +80,8 @@ void exit_io_context(void) ioc->aic->exit(ioc->aic); cfq_exit(ioc); - put_io_context(ioc); } + put_io_context(ioc); } struct io_context *alloc_io_context(gfp_t gfp_flags, int node) -- cgit v1.2.2 From b69f2292063d2caf37ca9aec7d63ded203701bf3 Mon Sep 17 00:00:00 2001 From: Louis Rilling Date: Fri, 4 Dec 2009 14:52:42 +0100 Subject: block: Fix io_context leak after failure of clone with CLONE_IO With CLONE_IO, parent's io_context->nr_tasks is incremented, but never decremented whenever copy_process() fails afterwards, which prevents exit_io_context() from calling IO schedulers exit functions. Give a task_struct to exit_io_context(), and call exit_io_context() instead of put_io_context() in copy_process() cleanup path. 
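The diffstat below only covers block/, so the matching fork.c change is not shown here. Roughly, the copy_process() error path is expected to switch to the new helper along these lines (a sketch of the caller side, not the literal hunk; the label name is the existing cleanup label in kernel/fork.c):

	bad_fork_cleanup_io:
		if (p->io_context)
			/* undoes copy_io(): drops nr_tasks and, when it was the
			 * last task, the io_context reference as well */
			exit_io_context(p);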
Signed-off-by: Louis Rilling Signed-off-by: Jens Axboe --- block/blk-ioc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index dcd041290b28..cbdabb0dd6d7 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -66,14 +66,14 @@ static void cfq_exit(struct io_context *ioc) } /* Called by the exitting task */ -void exit_io_context(void) +void exit_io_context(struct task_struct *task) { struct io_context *ioc; - task_lock(current); - ioc = current->io_context; - current->io_context = NULL; - task_unlock(current); + task_lock(task); + ioc = task->io_context; + task->io_context = NULL; + task_unlock(task); if (atomic_dec_and_test(&ioc->nr_tasks)) { if (ioc->aic && ioc->aic->exit) -- cgit v1.2.2 From 9d6a986c0b276085f7944cd8ad65f4f82aff7536 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 4 Dec 2009 10:36:41 -0500 Subject: blkio: Export some symbols from blkio as its user CFQ can be a module o blkio controller is inside the kernel and cfq makes use of interfaces exported by blkio. CFQ can be a module too, hence export symbols used by CFQ. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 22 ++++++++++++++++++++++ block/blk-cgroup.h | 3 +++ block/cfq-iosched.c | 4 ++-- 3 files changed, 27 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 73a5525cc0a2..4d4a277b2905 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -13,16 +13,33 @@ #include #include #include +#include #include "blk-cgroup.h" #include "cfq-iosched.h" struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT }; +EXPORT_SYMBOL_GPL(blkio_root_cgroup); + +bool blkiocg_css_tryget(struct blkio_cgroup *blkcg) +{ + if (!css_tryget(&blkcg->css)) + return false; + return true; +} +EXPORT_SYMBOL_GPL(blkiocg_css_tryget); + +void blkiocg_css_put(struct blkio_cgroup *blkcg) +{ + css_put(&blkcg->css); +} +EXPORT_SYMBOL_GPL(blkiocg_css_put); struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), struct blkio_cgroup, css); } +EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, unsigned long time, unsigned long sectors) @@ -30,6 +47,7 @@ void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, blkg->time += time; blkg->sectors += sectors; } +EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_stats); void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, struct blkio_group *blkg, void *key, dev_t dev) @@ -47,6 +65,7 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, #endif blkg->dev = dev; } +EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group); static void __blkiocg_del_blkio_group(struct blkio_group *blkg) { @@ -81,6 +100,7 @@ out: rcu_read_unlock(); return ret; } +EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group); /* called under rcu_read_lock(). 
*/ struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) @@ -97,6 +117,7 @@ struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) return NULL; } +EXPORT_SYMBOL_GPL(blkiocg_lookup_group); #define SHOW_FUNCTION(__VAR) \ static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup, \ @@ -166,6 +187,7 @@ void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg, { blkg->dequeue += dequeue; } +EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_dequeue_stats); #endif struct cftype blkio_files[] = { diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 257dc6a956b6..4f89b967467f 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -43,6 +43,9 @@ struct blkio_group { unsigned long sectors; }; +extern bool blkiocg_css_tryget(struct blkio_cgroup *blkcg); +extern void blkiocg_css_put(struct blkio_cgroup *blkcg); + #else struct blkio_group { diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index b00ca4c86e25..7f3f343b0c65 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -961,7 +961,7 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) unsigned int major, minor; /* Do we need to take this reference */ - if (!css_tryget(&blkcg->css)) + if (!blkiocg_css_tryget(blkcg)) return NULL;; cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); @@ -994,7 +994,7 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); done: - css_put(&blkcg->css); + blkiocg_css_put(blkcg); return cfqg; } -- cgit v1.2.2 From 3e2520668970aab5a764044a298e987aafc1f63d Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 4 Dec 2009 10:36:42 -0500 Subject: blkio: Implement dynamic io controlling policy registration o One of the goals of block IO controller is that it should be able to support mulitple io control policies, some of which be operational at higher level in storage hierarchy. o To begin with, we had one io controlling policy implemented by CFQ, and I hard coded the CFQ functions called by blkio. This created issues when CFQ is compiled as module. o This patch implements a basic dynamic io controlling policy registration functionality in blkio. This is similar to elevator functionality where ioschedulers register the functions dynamically. o Now in future, when more IO controlling policies are implemented, these can dynakically register with block IO controller. 
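As a usage sketch of the interface this patch introduces, a hypothetical second policy would plug in much like an elevator does. The foo_* names below are purely illustrative; struct blkio_policy_type, the two ops and blkio_policy_register()/blkio_policy_unregister() are the ones added by the patch:

	#include <linux/module.h>
	#include "blk-cgroup.h"		/* blkio_policy_type, register/unregister */

	static void foo_unlink_group(void *key, struct blkio_group *blkg)
	{
		/* tear down this policy's per-group state for blkg */
	}

	static void foo_update_group_weight(struct blkio_group *blkg,
						unsigned int weight)
	{
		/* react to a blkio.weight change for this group */
	}

	static struct blkio_policy_type blkio_policy_foo = {
		.ops = {
			.blkio_unlink_group_fn		= foo_unlink_group,
			.blkio_update_group_weight_fn	= foo_update_group_weight,
		},
	};

	static int __init foo_init(void)
	{
		blkio_policy_register(&blkio_policy_foo);
		return 0;
	}

	static void __exit foo_exit(void)
	{
		blkio_policy_unregister(&blkio_policy_foo);
	}

	module_init(foo_init);
	module_exit(foo_exit);

Unregistration has to happen before the policy's module text goes away, since blkiocg_destroy() and the weight-update path walk blkio_list under blkio_list_lock and call into these ops.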
Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 36 ++++++++++++++++++++++++++++++++---- block/blk-cgroup.h | 24 ++++++++++++++++++++++++ block/cfq-iosched.c | 14 +++++++++++++- block/cfq-iosched.h | 7 ------- 4 files changed, 69 insertions(+), 12 deletions(-) delete mode 100644 block/cfq-iosched.h (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 4d4a277b2905..3ad497f4eed6 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -15,7 +15,9 @@ #include #include #include "blk-cgroup.h" -#include "cfq-iosched.h" + +static DEFINE_SPINLOCK(blkio_list_lock); +static LIST_HEAD(blkio_list); struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT }; EXPORT_SYMBOL_GPL(blkio_root_cgroup); @@ -138,6 +140,7 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) struct blkio_cgroup *blkcg; struct blkio_group *blkg; struct hlist_node *n; + struct blkio_policy_type *blkiop; if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) return -EINVAL; @@ -145,8 +148,13 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) blkcg = cgroup_to_blkio_cgroup(cgroup); spin_lock_irq(&blkcg->lock); blkcg->weight = (unsigned int)val; - hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) - cfq_update_blkio_group_weight(blkg, blkcg->weight); + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { + spin_lock(&blkio_list_lock); + list_for_each_entry(blkiop, &blkio_list, list) + blkiop->ops.blkio_update_group_weight_fn(blkg, + blkcg->weight); + spin_unlock(&blkio_list_lock); + } spin_unlock_irq(&blkcg->lock); return 0; } @@ -224,6 +232,7 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) unsigned long flags; struct blkio_group *blkg; void *key; + struct blkio_policy_type *blkiop; rcu_read_lock(); remove_entry: @@ -249,7 +258,10 @@ remove_entry: * we have more policies in place, we need some dynamic registration * of callback function. 
*/ - cfq_unlink_blkio_group(key, blkg); + spin_lock(&blkio_list_lock); + list_for_each_entry(blkiop, &blkio_list, list) + blkiop->ops.blkio_unlink_group_fn(key, blkg); + spin_unlock(&blkio_list_lock); goto remove_entry; done: free_css_id(&blkio_subsys, &blkcg->css); @@ -330,3 +342,19 @@ struct cgroup_subsys blkio_subsys = { .subsys_id = blkio_subsys_id, .use_id = 1, }; + +void blkio_policy_register(struct blkio_policy_type *blkiop) +{ + spin_lock(&blkio_list_lock); + list_add_tail(&blkiop->list, &blkio_list); + spin_unlock(&blkio_list_lock); +} +EXPORT_SYMBOL_GPL(blkio_policy_register); + +void blkio_policy_unregister(struct blkio_policy_type *blkiop) +{ + spin_lock(&blkio_list_lock); + list_del_init(&blkiop->list); + spin_unlock(&blkio_list_lock); +} +EXPORT_SYMBOL_GPL(blkio_policy_unregister); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 4f89b967467f..4d316df863b4 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -46,11 +46,35 @@ struct blkio_group { extern bool blkiocg_css_tryget(struct blkio_cgroup *blkcg); extern void blkiocg_css_put(struct blkio_cgroup *blkcg); +typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); +typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg, + unsigned int weight); + +struct blkio_policy_ops { + blkio_unlink_group_fn *blkio_unlink_group_fn; + blkio_update_group_weight_fn *blkio_update_group_weight_fn; +}; + +struct blkio_policy_type { + struct list_head list; + struct blkio_policy_ops ops; +}; + +/* Blkio controller policy registration */ +extern void blkio_policy_register(struct blkio_policy_type *); +extern void blkio_policy_unregister(struct blkio_policy_type *); + #else struct blkio_group { }; +struct blkio_policy_type { +}; + +static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { } +static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { } + #endif #define BLKIO_WEIGHT_MIN 100 diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 7f3f343b0c65..78f4829895bd 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -14,7 +14,6 @@ #include #include #include "blk-cgroup.h" -#include "cfq-iosched.h" /* * tunables @@ -3855,6 +3854,17 @@ static struct elevator_type iosched_cfq = { .elevator_owner = THIS_MODULE, }; +#ifdef CONFIG_CFQ_GROUP_IOSCHED +static struct blkio_policy_type blkio_policy_cfq = { + .ops = { + .blkio_unlink_group_fn = cfq_unlink_blkio_group, + .blkio_update_group_weight_fn = cfq_update_blkio_group_weight, + }, +}; +#else +static struct blkio_policy_type blkio_policy_cfq; +#endif + static int __init cfq_init(void) { /* @@ -3869,6 +3879,7 @@ static int __init cfq_init(void) return -ENOMEM; elv_register(&iosched_cfq); + blkio_policy_register(&blkio_policy_cfq); return 0; } @@ -3876,6 +3887,7 @@ static int __init cfq_init(void) static void __exit cfq_exit(void) { DECLARE_COMPLETION_ONSTACK(all_gone); + blkio_policy_unregister(&blkio_policy_cfq); elv_unregister(&iosched_cfq); ioc_gone = &all_gone; /* ioc_gone's update must be visible before reading ioc_count */ diff --git a/block/cfq-iosched.h b/block/cfq-iosched.h deleted file mode 100644 index ef7b4798a349..000000000000 --- a/block/cfq-iosched.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef CFQ_IOSCHED_H -#define CFQ_IOSCHED_H - -void cfq_unlink_blkio_group(void *, struct blkio_group *); -void cfq_update_blkio_group_weight(struct blkio_group *, unsigned int); - -#endif -- cgit v1.2.2 From 846954b0a32f4ae953e082eabd178e7a98dd2efd Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 
4 Dec 2009 10:36:43 -0500 Subject: blkio: Allow CFQ group IO scheduling even when CFQ is a module o Now issues of blkio controller and CFQ in module mode should be fixed. Enable the cfq group scheduling support in module mode. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/Kconfig.iosched | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 5368d74207e8..b71abfb0d726 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -34,7 +34,7 @@ config IOSCHED_CFQ config CFQ_GROUP_IOSCHED bool "CFQ Group Scheduling support" - depends on IOSCHED_CFQ=y && CGROUPS + depends on IOSCHED_CFQ && CGROUPS select BLK_CGROUP default n ---help--- -- cgit v1.2.2 From bb729bc98c0f3e6a898d8730df3e2830bf68751a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 6 Dec 2009 09:54:19 +0100 Subject: cfq-iosched: use call_rcu() instead of doing grace period stall on queue exit After the merge of the IO controller patches, booting on my megaraid box ran much slower. Vivek Goyal traced it down to megaraid discovery creating tons of devices, each suffering a grace period when they later kill that queue (if no device is found). So lets use call_rcu() to batch these deferred frees, instead of taking the grace period hit for each one. Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 78f4829895bd..3815f9789b6a 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -287,6 +287,7 @@ struct cfq_data { /* List of cfq groups being managed on this device*/ struct hlist_head cfqg_list; + struct rcu_head rcu; }; static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); @@ -3601,6 +3602,11 @@ static void cfq_put_async_queues(struct cfq_data *cfqd) cfq_put_queue(cfqd->async_idle_cfqq); } +static void cfq_cfqd_free(struct rcu_head *head) +{ + kfree(container_of(head, struct cfq_data, rcu)); +} + static void cfq_exit_queue(struct elevator_queue *e) { struct cfq_data *cfqd = e->elevator_data; @@ -3630,8 +3636,7 @@ static void cfq_exit_queue(struct elevator_queue *e) cfq_shutdown_timer_wq(cfqd); /* Wait for cfqg->blkg->key accessors to exit their grace periods. */ - synchronize_rcu(); - kfree(cfqd); + call_rcu(&cfqd->rcu, cfq_cfqd_free); } static void *cfq_init_queue(struct request_queue *q) @@ -3706,6 +3711,7 @@ static void *cfq_init_queue(struct request_queue *q) cfqd->cfq_group_isolation = 0; cfqd->hw_tag = -1; cfqd->last_end_sync_rq = jiffies; + INIT_RCU_HEAD(&cfqd->rcu); return cfqd; } -- cgit v1.2.2 From accee7854b378a8ab5995d8f5dc5d8abc3b3d23a Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 7 Dec 2009 19:29:39 +1100 Subject: block: include linux/err.h to use ERR_PTR Signed-off-by: Stephen Rothwell Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 3ad497f4eed6..1fa2654db0a6 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "blk-cgroup.h" static DEFINE_SPINLOCK(blkio_list_lock); -- cgit v1.2.2 From 878eaddd05d251cefa9632c2b8046833c5eead66 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 7 Dec 2009 19:37:15 +0100 Subject: cfq-iosched: Do not access cfqq after freeing it Fix a crash during boot reported by Jeff Moyer. Fix the issue of accessing cfqq after freeing it. 
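The pattern behind the crash is a plain read-after-free: cfq_put_queue() freed cfqq with kmem_cache_free() and then looked at cfqq->orig_cfqg. Condensed, the fixed sequence (same logic as the hunk that follows) snapshots everything needed after the free into locals first:

	struct cfq_group *cfqg = cfqq->cfqg;
	struct cfq_group *orig_cfqg = cfqq->orig_cfqg;	/* read while cfqq is valid */

	kmem_cache_free(cfq_pool, cfqq);	/* cfqq must not be touched after this */

	cfq_put_cfqg(cfqg);
	if (orig_cfqg)				/* use the saved copy, not cfqq */
		cfq_put_cfqg(orig_cfqg);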
Reported-by: Jeff Moyer Signed-off-by: Vivek Goyal Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 3815f9789b6a..cfb0b2f5f63d 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2368,7 +2368,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) static void cfq_put_queue(struct cfq_queue *cfqq) { struct cfq_data *cfqd = cfqq->cfqd; - struct cfq_group *cfqg; + struct cfq_group *cfqg, *orig_cfqg; BUG_ON(atomic_read(&cfqq->ref) <= 0); @@ -2379,6 +2379,7 @@ static void cfq_put_queue(struct cfq_queue *cfqq) BUG_ON(rb_first(&cfqq->sort_list)); BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); cfqg = cfqq->cfqg; + orig_cfqg = cfqq->orig_cfqg; if (unlikely(cfqd->active_queue == cfqq)) { __cfq_slice_expired(cfqd, cfqq, 0); @@ -2388,8 +2389,8 @@ static void cfq_put_queue(struct cfq_queue *cfqq) BUG_ON(cfq_cfqq_on_rr(cfqq)); kmem_cache_free(cfq_pool, cfqq); cfq_put_cfqg(cfqg); - if (cfqq->orig_cfqg) - cfq_put_cfqg(cfqq->orig_cfqg); + if (orig_cfqg) + cfq_put_cfqg(orig_cfqg); } /* -- cgit v1.2.2 From 573412b29586e58477adb70e022193a337763319 Mon Sep 17 00:00:00 2001 From: Corrado Zoccolo Date: Sun, 6 Dec 2009 11:48:52 +0100 Subject: cfq-iosched: reduce write depth only if sync was delayed The introduction of ramp-up formula for async queue depths has slowed down dirty page reclaim, by reducing async write performance. This patch makes sure the formula kicks in only when sync request was recently delayed. Signed-off-by: Corrado Zoccolo Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index cfb0b2f5f63d..5009af490a0c 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -283,7 +283,7 @@ struct cfq_data { */ struct cfq_queue oom_cfqq; - unsigned long last_end_sync_rq; + unsigned long last_delayed_sync; /* List of cfq groups being managed on this device*/ struct hlist_head cfqg_list; @@ -2264,7 +2264,7 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) * based on the last sync IO we serviced */ if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) { - unsigned long last_sync = jiffies - cfqd->last_end_sync_rq; + unsigned long last_sync = jiffies - cfqd->last_delayed_sync; unsigned int depth; depth = last_sync / cfqd->cfq_slice[1]; @@ -3273,7 +3273,8 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) if (sync) { RQ_CIC(rq)->last_end_request = now; - cfqd->last_end_sync_rq = now; + if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now)) + cfqd->last_delayed_sync = now; } /* @@ -3711,7 +3712,7 @@ static void *cfq_init_queue(struct request_queue *q) cfqd->cfq_latency = 1; cfqd->cfq_group_isolation = 0; cfqd->hw_tag = -1; - cfqd->last_end_sync_rq = jiffies; + cfqd->last_delayed_sync = jiffies - HZ; INIT_RCU_HEAD(&cfqd->rcu); return cfqd; } -- cgit v1.2.2 From b9d8f4c73b1af4cfd53f819bf84c2bce31232275 Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Tue, 8 Dec 2009 08:54:17 +0100 Subject: cfq: Optimization for close cooperating queue searching It doesn't make any sense to try to find out a close cooperating queue if current cfqq is the only one in the group. 
Signed-off-by: Gui Jianfeng Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 5009af490a0c..b19cd684bf12 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1749,6 +1749,12 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, if (CFQQ_SEEKY(cur_cfqq)) return NULL; + /* + * Don't search priority tree if it's the only queue in the group. + */ + if (cur_cfqq->cfqg->nr_cfqq == 1) + return NULL; + /* * We should notice if some of the queues are cooperating, eg * working closely on the same area of the disk. In that case, -- cgit v1.2.2 From c244bb50a9baa2ec47a458bbafb36b5e559ed5fa Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Tue, 8 Dec 2009 17:52:57 -0500 Subject: cfq-iosched: Get rid of cfqq wait_busy_done flag o Get rid of wait_busy_done flag. This flag only tells we were doing wait busy on a queue and that queue got request so expire it. That information can easily be obtained by (cfq_cfqq_wait_busy() && queue_is_not_empty). So remove this flag and keep code simple. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index b19cd684bf12..f41fdb5f3e0c 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -319,7 +319,6 @@ enum cfqq_state_flags { CFQ_CFQQ_FLAG_coop, /* cfqq is shared */ CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */ CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */ - CFQ_CFQQ_FLAG_wait_busy_done, /* Got new request. Expire the queue */ }; #define CFQ_CFQQ_FNS(name) \ @@ -348,7 +347,6 @@ CFQ_CFQQ_FNS(sync); CFQ_CFQQ_FNS(coop); CFQ_CFQQ_FNS(deep); CFQ_CFQQ_FNS(wait_busy); -CFQ_CFQQ_FNS(wait_busy_done); #undef CFQ_CFQQ_FNS #ifdef CONFIG_DEBUG_CFQ_IOSCHED @@ -1574,7 +1572,6 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfq_clear_cfqq_wait_request(cfqq); cfq_clear_cfqq_wait_busy(cfqq); - cfq_clear_cfqq_wait_busy_done(cfqq); /* * store what was left of this slice, if the queue idled/timed out @@ -2134,11 +2131,17 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) if (!cfqd->rq_queued) return NULL; + + /* + * We were waiting for group to get backlogged. Expire the queue + */ + if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list)) + goto expire; + /* * The active queue has run out of time, expire it and select new. */ - if ((cfq_slice_used(cfqq) || cfq_cfqq_wait_busy_done(cfqq)) - && !cfq_cfqq_must_dispatch(cfqq)) + if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) goto expire; /* @@ -3171,10 +3174,6 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); if (cfqq == cfqd->active_queue) { - if (cfq_cfqq_wait_busy(cfqq)) { - cfq_clear_cfqq_wait_busy(cfqq); - cfq_mark_cfqq_wait_busy_done(cfqq); - } /* * Remember that we saw a request from this process, but * don't start queuing just yet. Otherwise we risk seeing lots -- cgit v1.2.2 From 7667aa0630407bc07dc38dcc79d29cc0a65553c1 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Tue, 8 Dec 2009 17:52:58 -0500 Subject: cfq-iosched: Take care of corner cases of group losing share due to deletion If there is a sequential reader running in a group, we wait for next request to come in that group after slice expiry and once new request is in, we expire the queue. 
Otherwise we delete the group from service tree and group looses its fair share. So far I was marking a queue as wait_busy if it had consumed its slice and it was last queue in the group. But this condition did not cover following two cases. 1.If a request completed and slice has not expired yet. Next request comes in and is dispatched to disk. Now select_queue() hits and slice has expired. This group will be deleted. Because request is still in the disk, this queue will never get a chance to wait_busy. 2.If request completed and slice has not expired yet. Before next request comes in (delay due to think time), select_queue() hits and expires the queue hence group. This queue never got a chance to wait busy. Gui was hitting the boundary condition 1 and not getting fairness numbers proportional to weight. This patch puts the checks for above two conditions and improves the fairness numbers for sequential workload on rotational media. Check in select_queue() takes care of case 1 and additional check in should_wait_busy() takes care of case 2. Reported-by: Gui Jianfeng Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 48 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index f41fdb5f3e0c..98b15b98b85d 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2141,8 +2141,22 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) /* * The active queue has run out of time, expire it and select new. */ - if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) - goto expire; + if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) { + /* + * If slice had not expired at the completion of last request + * we might not have turned on wait_busy flag. Don't expire + * the queue yet. Allow the group to get backlogged. + * + * The very fact that we have used the slice, that means we + * have been idling all along on this queue and it should be + * ok to wait for this request to complete. + */ + if (cfqq->cfqg->nr_cfqq == 1 && cfqq->dispatched + && cfq_should_idle(cfqd, cfqq)) + goto keep_queue; + else + goto expire; + } /* * The active queue has requests and isn't expired, allow it to @@ -3256,6 +3270,35 @@ static void cfq_update_hw_tag(struct cfq_data *cfqd) cfqd->hw_tag = 0; } +static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + struct cfq_io_context *cic = cfqd->active_cic; + + /* If there are other queues in the group, don't wait */ + if (cfqq->cfqg->nr_cfqq > 1) + return false; + + if (cfq_slice_used(cfqq)) + return true; + + /* if slice left is less than think time, wait busy */ + if (cic && sample_valid(cic->ttime_samples) + && (cfqq->slice_end - jiffies < cic->ttime_mean)) + return true; + + /* + * If think times is less than a jiffy than ttime_mean=0 and above + * will not be true. It might happen that slice has not expired yet + * but will expire soon (4-5 ns) during select_queue(). To cover the + * case where think time is less than a jiffy, mark the queue wait + * busy if only 1 jiffy is left in the slice. 
+ */ + if (cfqq->slice_end - jiffies == 1) + return true; + + return false; +} + static void cfq_completed_request(struct request_queue *q, struct request *rq) { struct cfq_queue *cfqq = RQ_CFQQ(rq); @@ -3295,11 +3338,10 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) } /* - * If this queue consumed its slice and this is last queue - * in the group, wait for next request before we expire - * the queue + * Should we wait for next request to come in before we expire + * the queue. */ - if (cfq_slice_used(cfqq) && cfqq->cfqg->nr_cfqq == 1) { + if (cfq_should_wait_busy(cfqd, cfqq)) { cfqq->slice_end = jiffies + cfqd->cfq_slice_idle; cfq_mark_cfqq_wait_busy(cfqq); } -- cgit v1.2.2 From edc71131c4dc6cc73e2a24aa0a7a79cfce738f12 Mon Sep 17 00:00:00 2001 From: Corrado Zoccolo Date: Wed, 9 Dec 2009 20:56:04 +0100 Subject: cfq-iosched: commenting non-obvious initialization Added a comment to explain the initialization of last_delayed_sync. Signed-off-by: Corrado Zoccolo Acked-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 98b15b98b85d..69ecee7f4ad4 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3759,6 +3759,10 @@ static void *cfq_init_queue(struct request_queue *q) cfqd->cfq_latency = 1; cfqd->cfq_group_isolation = 0; cfqd->hw_tag = -1; + /* + * we optimistically start assuming sync ops weren't delayed in last + * second, in order to have larger depth for async operations. + */ cfqd->last_delayed_sync = jiffies - HZ; INIT_RCU_HEAD(&cfqd->rcu); return cfqd; -- cgit v1.2.2 From 554554f60ad619e1efab01897208bc320b81d9da Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Thu, 10 Dec 2009 09:38:39 +0100 Subject: cfq: Remove wait_request flag when idle time is being deleted Remove wait_request flag when idle time is being deleted, otherwise it'll hit this path every time when a request is enqueued. Signed-off-by: Gui Jianfeng Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 69ecee7f4ad4..96f59ae5b6e9 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3202,6 +3202,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || cfqd->busy_queues > 1) { del_timer(&cfqd->idle_slice_timer); + cfq_clear_cfqq_wait_request(cfqq); __blk_run_queue(cfqd->queue); } else cfq_mark_cfqq_must_dispatch(cfqq); -- cgit v1.2.2 From 82bbbf28db4beefcd8b897800153e21378270cd1 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 10 Dec 2009 19:25:41 +0100 Subject: Fix a CFQ crash in "for-2.6.33" branch of block tree I think my previous patch introduced a bug which can lead to CFQ hitting BUG_ON(). The offending commit in for-2.6.33 branch is. commit 7667aa0630407bc07dc38dcc79d29cc0a65553c1 Author: Vivek Goyal Date: Tue Dec 8 17:52:58 2009 -0500 cfq-iosched: Take care of corner cases of group losing share due to deletion While doing some stress testing on my box, I enountered following. login: [ 3165.148841] BUG: scheduling while atomic: swapper/0/0x10000100 [ 3165.149821] Modules linked in: cfq_iosched dm_multipath qla2xxx igb scsi_transport_fc dm_snapshot [last unloaded: scsi_wait_scan] [ 3165.149821] Pid: 0, comm: swapper Not tainted 2.6.32-block-for-33-merged-new #3 [ 3165.149821] Call Trace: [ 3165.149821] [] __schedule_bug+0x5c/0x60 [ 3165.149821] [] ? 
__wake_up+0x44/0x4d [ 3165.149821] [] schedule+0xe3/0x7bc [ 3165.149821] [] ? cpumask_next+0x1d/0x1f [ 3165.149821] [] ? cfq_dispatch_requests+0x6ba/0x93e [cfq_iosched] [ 3165.149821] [] __cond_resched+0x2a/0x35 [ 3165.149821] [] ? cfq_dispatch_requests+0x6ba/0x93e [cfq_iosched] [ 3165.149821] [] _cond_resched+0x2c/0x37 [ 3165.149821] [] is_valid_bugaddr+0x16/0x2f [ 3165.149821] [] report_bug+0x18/0xac [ 3165.149821] [] die+0x39/0x63 [ 3165.149821] [] do_trap+0x11a/0x129 [ 3165.149821] [] do_invalid_op+0x96/0x9f [ 3165.149821] [] ? cfq_dispatch_requests+0x6ba/0x93e [cfq_iosched] [ 3165.149821] [] ? enqueue_task+0x5c/0x67 [ 3165.149821] [] ? task_rq_unlock+0x11/0x13 [ 3165.149821] [] ? try_to_wake_up+0x292/0x2a4 [ 3165.149821] [] invalid_op+0x15/0x20 [ 3165.149821] [] ? cfq_dispatch_requests+0x6ba/0x93e [cfq_iosched] [ 3165.149821] [] ? virt_to_head_page+0xe/0x2f [ 3165.149821] [] blk_peek_request+0x191/0x1a7 [ 3165.149821] [] ? kobject_get+0x1a/0x21 [ 3165.149821] [] scsi_request_fn+0x82/0x3df [ 3165.149821] [] ? bio_fs_destructor+0x15/0x17 [ 3165.149821] [] ? virt_to_head_page+0xe/0x2f [ 3165.149821] [] __blk_run_queue+0x42/0x71 [ 3165.149821] [] blk_run_queue+0x26/0x3a [ 3165.149821] [] scsi_run_queue+0x2de/0x375 [ 3165.149821] [] ? put_device+0x17/0x19 [ 3165.149821] [] scsi_next_command+0x3b/0x4b [ 3165.149821] [] scsi_io_completion+0x1c9/0x3f5 [ 3165.149821] [] scsi_finish_command+0xb5/0xbe I think I have hit following BUG_ON() in cfq_dispatch_request(). BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list)); Please find attached the patch to fix it. I have done some stress testing with it and have not seen it happening again. o We should wait on a queue even after slice expiry only if it is empty. If queue is not empty then continue to expire it. o If we decide to keep the queue then make cfqq=NULL. Otherwise select_queue() will return a valid cfqq and cfq_dispatch_request() can hit following BUG_ON(). BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list)) Reviewed-by: Jeff Moyer Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 96f59ae5b6e9..f3f62394b986 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2151,10 +2151,11 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) * have been idling all along on this queue and it should be * ok to wait for this request to complete. */ - if (cfqq->cfqg->nr_cfqq == 1 && cfqq->dispatched - && cfq_should_idle(cfqd, cfqq)) + if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list) + && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { + cfqq = NULL; goto keep_queue; - else + } else goto expire; } -- cgit v1.2.2 From 66ae291978177d5c012015f12b8fbc76dc7d0965 Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Tue, 15 Dec 2009 10:08:45 +0100 Subject: cfq: set workload as expired if it doesn't have any slice left When a group is resumed, if it doesn't have workload slice left, we should set workload_expires as expired. Otherwise, we might start from where we left in previous group by error. Thanks the idea from Corrado. 
Signed-off-by: Gui Jianfeng Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index f3f62394b986..e2f80463ed0d 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2113,7 +2113,9 @@ static void cfq_choose_cfqg(struct cfq_data *cfqd) cfqd->workload_expires = jiffies + cfqg->saved_workload_slice; cfqd->serving_type = cfqg->saved_workload; cfqd->serving_prio = cfqg->saved_serving_prio; - } + } else + cfqd->workload_expires = jiffies - 1; + choose_service_tree(cfqd, cfqg); } -- cgit v1.2.2 From b568be627a7270eba575bc4406a606e1545f91bb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 16 Dec 2009 09:16:41 +0100 Subject: block: temporarily disable discard granularity Commit 86b37281411cf1e9bc0a6b5406c45edb7bd9ea5d adds a check for misaligned stacking offsets, but it's buggy since the defaults are 0. Hence all dm devices that pass in a non-zero starting offset will be marked as misaligned amd dm will complain. A real fix is coming, in the mean time disable the discard granularity check so that users don't worry about dm reporting about misaligned devices. Signed-off-by: Jens Axboe --- block/blk-settings.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index dd1f1e0e196f..6ae118d6e193 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -554,11 +554,18 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, ret = -1; } + /* + * Temporarily disable discard granularity. It's currently buggy + * since we default to 0 for discard_granularity, hence this + * "failure" will always trigger for non-zero offsets. + */ +#if 0 if (offset && (offset & (b->discard_granularity - 1)) != b->discard_alignment) { t->discard_misaligned = 1; ret = -1; } +#endif /* If top has no alignment offset, inherit from bottom */ if (!t->alignment_offset) -- cgit v1.2.2 From 1db32c40600437c5e049796bd32f49f61244c6ef Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 16 Dec 2009 17:52:57 -0500 Subject: cfq-iosched: Remove the check for same cfq group from allow_merge o allow_merge() already checks if submitting task is pointing to same cfqq as rq has been queued in. If everything is fine, we should not be having a task in one cgroup and having a pointer to cfqq in other cgroup. Well I guess in some situations it can happen and that is, when a random IO queue has been moved into root cgroup for group_isolation=0. In this case, tasks's cgroup/group is different from where actually cfqq is, but this is intentional and in this case merging should be allowed. The second situation is where due to close cooperator patches, multiple processes can be sharing a cfqq. If everything implemented right, we should not end up in a situation where tasks from different processes in different groups are sharing the same cfqq as we allow merging of cooperating queues only if they are in same group. 
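For reference, what cfq_allow_merge() keeps doing after this removal is the per-task queue comparison, which is the check the changelog says already covers the well-behaved cases (condensed from the surrounding function, not a literal hunk; cic, cfqq, cfqd, bio and rq come from that function's context):

	cic = cfq_cic_lookup(cfqd, current->io_context);
	if (!cic)
		return false;

	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
	/* only merge a bio into a request queued on the same cfqq */
	return cfqq == RQ_CFQQ(rq);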
Signed-off-by: Vivek Goyal Reviewed-by: Gui Jianfeng Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index e2f80463ed0d..a0e5347767d9 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1513,9 +1513,6 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq, struct cfq_io_context *cic; struct cfq_queue *cfqq; - /* Deny merge if bio and rq don't belong to same cfq group */ - if ((RQ_CFQQ(rq))->cfqg != cfq_get_cfqg(cfqd, 0)) - return false; /* * Disallow merge of a sync bio into an async request. */ -- cgit v1.2.2 From fb104db41e6e006c85ce1097f372cd1e10c1755c Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 16 Dec 2009 17:52:58 -0500 Subject: cfq-iosched: Get rid of nr_groups o Currently code does not seem to be using cfqd->nr_groups. Get rid of it. Signed-off-by: Vivek Goyal Reviewed-by: Gui Jianfeng Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index a0e5347767d9..d9bfa09e68c1 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -208,8 +208,6 @@ struct cfq_data { /* Root service tree for cfq_groups */ struct cfq_rb_root grp_service_tree; struct cfq_group root_group; - /* Number of active cfq groups on group service tree */ - int nr_groups; /* * The priority currently being served @@ -842,7 +840,6 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) __cfq_group_service_tree_add(st, cfqg); cfqg->on_st = true; - cfqd->nr_groups++; st->total_weight += cfqg->weight; } @@ -863,7 +860,6 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); cfqg->on_st = false; - cfqd->nr_groups--; st->total_weight -= cfqg->weight; if (!RB_EMPTY_NODE(&cfqg->rb_node)) cfq_rb_erase(&cfqg->rb_node, st); -- cgit v1.2.2 From 65b32a573eefa1cdd3cbe5ea59326308e6c3b9ad Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 16 Dec 2009 17:52:59 -0500 Subject: cfq-iosched: Remove prio_change logic for workload selection o CFQ now internally divides cfq queues in therr workload categories. sync-idle, sync-noidle and async. Which workload to run depends primarily on rb_key offset across three service trees. Which is a combination of mulitiple things including what time queue got queued on the service tree. There is one exception though. That is if we switched the prio class, say we served some RT tasks and again started serving BE class, then with-in BE class we always started with sync-noidle workload irrespective of rb_key offset in service trees. This can provide better latencies for sync-noidle workload in the presence of RT tasks. o This patch gets rid of that exception and which workload to run with-in class always depends on lowest rb_key across service trees. The reason being that now we have multiple BE class groups and if we always switch to sync-noidle workload with-in group, we can potentially starve a sync-idle workload with-in group. Same is true for async workload which will be in root group. Also the workload-switching with-in group will become very unpredictable as it now depends whether some RT workload was running in the system or not. 
Signed-off-by: Vivek Goyal Reviewed-by: Gui Jianfeng Acked-by: Corrado Zoccolo Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 48 ++++++++++++------------------------------------ 1 file changed, 12 insertions(+), 36 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index d9bfa09e68c1..8df4fe58f4e7 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -292,8 +292,7 @@ static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg, enum wl_prio_t prio, - enum wl_type_t type, - struct cfq_data *cfqd) + enum wl_type_t type) { if (!cfqg) return NULL; @@ -1146,7 +1145,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, #endif service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), - cfqq_type(cfqq), cfqd); + cfqq_type(cfqq)); if (cfq_class_idle(cfqq)) { rb_key = CFQ_IDLE_DELAY; parent = rb_last(&service_tree->rb); @@ -1609,7 +1608,7 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) { struct cfq_rb_root *service_tree = service_tree_for(cfqd->serving_group, cfqd->serving_prio, - cfqd->serving_type, cfqd); + cfqd->serving_type); if (!cfqd->rq_queued) return NULL; @@ -1956,8 +1955,7 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) } static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, - struct cfq_group *cfqg, enum wl_prio_t prio, - bool prio_changed) + struct cfq_group *cfqg, enum wl_prio_t prio) { struct cfq_queue *queue; int i; @@ -1965,24 +1963,9 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, unsigned long lowest_key = 0; enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD; - if (prio_changed) { - /* - * When priorities switched, we prefer starting - * from SYNC_NOIDLE (first choice), or just SYNC - * over ASYNC - */ - if (service_tree_for(cfqg, prio, cur_best, cfqd)->count) - return cur_best; - cur_best = SYNC_WORKLOAD; - if (service_tree_for(cfqg, prio, cur_best, cfqd)->count) - return cur_best; - - return ASYNC_WORKLOAD; - } - - for (i = 0; i < 3; ++i) { - /* otherwise, select the one with lowest rb_key */ - queue = cfq_rb_first(service_tree_for(cfqg, prio, i, cfqd)); + for (i = 0; i <= SYNC_WORKLOAD; ++i) { + /* select the one with lowest rb_key */ + queue = cfq_rb_first(service_tree_for(cfqg, prio, i)); if (queue && (!key_valid || time_before(queue->rb_key, lowest_key))) { lowest_key = queue->rb_key; @@ -1996,8 +1979,6 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) { - enum wl_prio_t previous_prio = cfqd->serving_prio; - bool prio_changed; unsigned slice; unsigned count; struct cfq_rb_root *st; @@ -2025,24 +2006,19 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload * expiration time */ - prio_changed = (cfqd->serving_prio != previous_prio); - st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type, - cfqd); + st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type); count = st->count; /* - * If priority didn't change, check workload expiration, - * and that we still have other queues ready + * check workload expiration, and that we still have other queues ready */ - if (!prio_changed && count && - !time_after(jiffies, cfqd->workload_expires)) + if (count && !time_after(jiffies, cfqd->workload_expires)) return; /* otherwise select new workload type */ 
cfqd->serving_type = - cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio, prio_changed); - st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type, - cfqd); + cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio); + st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type); count = st->count; /* -- cgit v1.2.2 From 9504e0864b58b4a304820dcf3755f1da80d5e72f Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Mon, 21 Dec 2009 15:55:51 +0100 Subject: block: Fix topology stacking for data and discard alignment The stacking code incorrectly scaled up the data offset in some cases causing misaligned devices to report alignment. Rewrite the stacking algorithm to remedy this and apply the same alignment principles to the discard handling. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-settings.c | 87 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 50 insertions(+), 37 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index 6ae118d6e193..e14fcbcedbfa 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -517,9 +517,8 @@ static unsigned int lcm(unsigned int a, unsigned int b) int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t offset) { - int ret; - - ret = 0; + sector_t alignment; + unsigned int top, bottom, granularity; t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); @@ -537,6 +536,19 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size); + granularity = max(b->physical_block_size, b->io_min); + alignment = b->alignment_offset - (offset & (granularity - 1)); + + if (t->alignment_offset != alignment) { + + top = max(t->physical_block_size, t->io_min) + + t->alignment_offset; + bottom = granularity + alignment; + + if (max(top, bottom) & (min(top, bottom) - 1)) + t->misaligned = 1; + } + t->logical_block_size = max(t->logical_block_size, b->logical_block_size); @@ -544,54 +556,55 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, b->physical_block_size); t->io_min = max(t->io_min, b->io_min); + t->io_opt = lcm(t->io_opt, b->io_opt); + t->no_cluster |= b->no_cluster; t->discard_zeroes_data &= b->discard_zeroes_data; - /* Bottom device offset aligned? */ - if (offset && - (offset & (b->physical_block_size - 1)) != b->alignment_offset) { + if (t->physical_block_size & (t->logical_block_size - 1)) { + t->physical_block_size = t->logical_block_size; t->misaligned = 1; - ret = -1; } - /* - * Temporarily disable discard granularity. It's currently buggy - * since we default to 0 for discard_granularity, hence this - * "failure" will always trigger for non-zero offsets. 
- */ -#if 0 - if (offset && - (offset & (b->discard_granularity - 1)) != b->discard_alignment) { - t->discard_misaligned = 1; - ret = -1; + if (t->io_min & (t->physical_block_size - 1)) { + t->io_min = t->physical_block_size; + t->misaligned = 1; } -#endif - /* If top has no alignment offset, inherit from bottom */ - if (!t->alignment_offset) - t->alignment_offset = - b->alignment_offset & (b->physical_block_size - 1); + if (t->io_opt & (t->physical_block_size - 1)) { + t->io_opt = 0; + t->misaligned = 1; + } - if (!t->discard_alignment) - t->discard_alignment = - b->discard_alignment & (b->discard_granularity - 1); + t->alignment_offset = lcm(t->alignment_offset, alignment) + & (max(t->physical_block_size, t->io_min) - 1); - /* Top device aligned on logical block boundary? */ - if (t->alignment_offset & (t->logical_block_size - 1)) { + if (t->alignment_offset & (t->logical_block_size - 1)) t->misaligned = 1; - ret = -1; - } - /* Find lcm() of optimal I/O size and granularity */ - t->io_opt = lcm(t->io_opt, b->io_opt); - t->discard_granularity = lcm(t->discard_granularity, - b->discard_granularity); + /* Discard alignment and granularity */ + if (b->discard_granularity) { + + alignment = b->discard_alignment - + (offset & (b->discard_granularity - 1)); + + if (t->discard_granularity != 0 && + t->discard_alignment != alignment) { + top = t->discard_granularity + t->discard_alignment; + bottom = b->discard_granularity + alignment; - /* Verify that optimal I/O size is a multiple of io_min */ - if (t->io_min && t->io_opt % t->io_min) - ret = -1; + /* Verify that top and bottom intervals line up */ + if (max(top, bottom) & (min(top, bottom) - 1)) + t->discard_misaligned = 1; + } + + t->discard_granularity = max(t->discard_granularity, + b->discard_granularity); + t->discard_alignment = lcm(t->discard_alignment, alignment) & + (t->discard_granularity - 1); + } - return ret; + return t->misaligned ? -1 : 0; } EXPORT_SYMBOL(blk_stack_limits); -- cgit v1.2.2 From 2f7a2d89a8b5915d89ad9ebeb0065db7d5831cea Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 28 Dec 2009 13:18:44 +0100 Subject: cfq-iosched: don't regard requests with long distance as close seek_mean could be very big sometimes, using it as close criteria is meaningless as this doen't improve any performance. So if it's big, let's fallback to default value. Reviewed-by: Corrado Zoccolo Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 8df4fe58f4e7..918c7fd9aeb1 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1667,13 +1667,17 @@ static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd, #define CFQQ_SEEKY(cfqq) ((cfqq)->seek_mean > CFQQ_SEEK_THR) static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq, - struct request *rq) + struct request *rq, bool for_preempt) { sector_t sdist = cfqq->seek_mean; if (!sample_valid(cfqq->seek_samples)) sdist = CFQQ_SEEK_THR; + /* if seek_mean is big, using it as close criteria is meaningless */ + if (sdist > CFQQ_SEEK_THR && !for_preempt) + sdist = CFQQ_SEEK_THR; + return cfq_dist_from_last(cfqd, rq) <= sdist; } @@ -1701,7 +1705,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, * will contain the closest sector. 
*/ __cfqq = rb_entry(parent, struct cfq_queue, p_node); - if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) + if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq, false)) return __cfqq; if (blk_rq_pos(__cfqq->next_rq) < sector) @@ -1712,7 +1716,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, return NULL; __cfqq = rb_entry(node, struct cfq_queue, p_node); - if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) + if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq, false)) return __cfqq; return NULL; @@ -3112,7 +3116,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, * if this request is as-good as one we would expect from the * current cfqq, let it preempt */ - if (cfq_rq_close(cfqd, cfqq, rq)) + if (cfq_rq_close(cfqd, cfqq, rq, true)) return true; return false; -- cgit v1.2.2 From 81744ee44ab2845c16ffd7d6f762f7b4a49a4750 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Tue, 29 Dec 2009 08:35:35 +0100 Subject: block: Fix incorrect alignment offset reporting and update documentation queue_sector_alignment_offset returned the wrong value which caused partitions to report an incorrect alignment_offset. Since offset alignment calculation is needed several places it has been split into a separate helper function. The topology stacking function has been updated accordingly. Furthermore, comments have been added to clarify how the stacking function works. Signed-off-by: Martin K. Petersen Tested-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-settings.c | 44 +++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index e14fcbcedbfa..d52d4adc440b 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -505,20 +505,30 @@ static unsigned int lcm(unsigned int a, unsigned int b) /** * blk_stack_limits - adjust queue_limits for stacked devices - * @t: the stacking driver limits (top) - * @b: the underlying queue limits (bottom) + * @t: the stacking driver limits (top device) + * @b: the underlying queue limits (bottom, component device) * @offset: offset to beginning of data within component device * * Description: - * Merges two queue_limit structs. Returns 0 if alignment didn't - * change. Returns -1 if adding the bottom device caused - * misalignment. + * This function is used by stacking drivers like MD and DM to ensure + * that all component devices have compatible block sizes and + * alignments. The stacking driver must provide a queue_limits + * struct (top) and then iteratively call the stacking function for + * all component (bottom) devices. The stacking function will + * attempt to combine the values and ensure proper alignment. + * + * Returns 0 if the top and bottom queue_limits are compatible. The + * top device's block sizes and alignment offsets may be adjusted to + * ensure alignment with the bottom device. If no compatible sizes + * and alignments exist, -1 is returned and the resulting top + * queue_limits will have the misaligned flag set to indicate that + * the alignment_offset is undefined. 
*/ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t offset) { sector_t alignment; - unsigned int top, bottom, granularity; + unsigned int top, bottom; t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); @@ -536,15 +546,18 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size); - granularity = max(b->physical_block_size, b->io_min); - alignment = b->alignment_offset - (offset & (granularity - 1)); + alignment = queue_limit_alignment_offset(b, offset); + /* Bottom device has different alignment. Check that it is + * compatible with the current top alignment. + */ if (t->alignment_offset != alignment) { top = max(t->physical_block_size, t->io_min) + t->alignment_offset; - bottom = granularity + alignment; + bottom = max(b->physical_block_size, b->io_min) + alignment; + /* Verify that top and bottom intervals line up */ if (max(top, bottom) & (min(top, bottom) - 1)) t->misaligned = 1; } @@ -561,32 +574,39 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->no_cluster |= b->no_cluster; t->discard_zeroes_data &= b->discard_zeroes_data; + /* Physical block size a multiple of the logical block size? */ if (t->physical_block_size & (t->logical_block_size - 1)) { t->physical_block_size = t->logical_block_size; t->misaligned = 1; } + /* Minimum I/O a multiple of the physical block size? */ if (t->io_min & (t->physical_block_size - 1)) { t->io_min = t->physical_block_size; t->misaligned = 1; } + /* Optimal I/O a multiple of the physical block size? */ if (t->io_opt & (t->physical_block_size - 1)) { t->io_opt = 0; t->misaligned = 1; } + /* Find lowest common alignment_offset */ t->alignment_offset = lcm(t->alignment_offset, alignment) & (max(t->physical_block_size, t->io_min) - 1); + /* Verify that new alignment_offset is on a logical block boundary */ if (t->alignment_offset & (t->logical_block_size - 1)) t->misaligned = 1; /* Discard alignment and granularity */ if (b->discard_granularity) { + unsigned int granularity = b->discard_granularity; + offset &= granularity - 1; - alignment = b->discard_alignment - - (offset & (b->discard_granularity - 1)); + alignment = (granularity + b->discard_alignment - offset) + & (granularity - 1); if (t->discard_granularity != 0 && t->discard_alignment != alignment) { @@ -598,6 +618,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->discard_misaligned = 1; } + t->max_discard_sectors = min_not_zero(t->max_discard_sectors, + b->max_discard_sectors); t->discard_granularity = max(t->discard_granularity, b->discard_granularity); t->discard_alignment = lcm(t->discard_alignment, alignment) & -- cgit v1.2.2 From e79e95db5cffb2e01170d510686489c40937faa1 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Tue, 29 Dec 2009 08:53:54 +0100 Subject: block: Honor the gfp_mask for alloc_page() in blkdev_issue_discard() Signed-off-by: OGAWA Hirofumi Signed-off-by: Jens Axboe --- block/blk-barrier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-barrier.c b/block/blk-barrier.c index 8873b9b439ff..8618d8996fea 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -402,7 +402,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, * our current implementations need. If we'll ever need * more the interface will need revisiting. 
*/ - page = alloc_page(GFP_KERNEL | __GFP_ZERO); + page = alloc_page(gfp_mask | __GFP_ZERO); if (!page) goto out_free_bio; if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size) -- cgit v1.2.2 From fe0b393f2c0a0d23a9bc9ed7dc51a1ee511098bd Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Mon, 11 Jan 2010 03:21:47 -0500 Subject: block: Correct handling of bottom device misaligment The top device misalignment flag would not be set if the added bottom device was already misaligned as opposed to causing a stacking failure. Also massage the reporting so that an error is only returned if adding the bottom device caused the misalignment. I.e. don't return an error if the top is already flagged as misaligned. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-settings.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index d52d4adc440b..127f82551855 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -528,7 +528,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t offset) { sector_t alignment; - unsigned int top, bottom; + unsigned int top, bottom, ret = 0; t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); @@ -546,6 +546,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size); + t->misaligned |= b->misaligned; + alignment = queue_limit_alignment_offset(b, offset); /* Bottom device has different alignment. Check that it is @@ -558,8 +560,10 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, bottom = max(b->physical_block_size, b->io_min) + alignment; /* Verify that top and bottom intervals line up */ - if (max(top, bottom) & (min(top, bottom) - 1)) + if (max(top, bottom) & (min(top, bottom) - 1)) { t->misaligned = 1; + ret = -1; + } } t->logical_block_size = max(t->logical_block_size, @@ -578,18 +582,21 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, if (t->physical_block_size & (t->logical_block_size - 1)) { t->physical_block_size = t->logical_block_size; t->misaligned = 1; + ret = -1; } /* Minimum I/O a multiple of the physical block size? */ if (t->io_min & (t->physical_block_size - 1)) { t->io_min = t->physical_block_size; t->misaligned = 1; + ret = -1; } /* Optimal I/O a multiple of the physical block size? */ if (t->io_opt & (t->physical_block_size - 1)) { t->io_opt = 0; t->misaligned = 1; + ret = -1; } /* Find lowest common alignment_offset */ @@ -597,8 +604,10 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, & (max(t->physical_block_size, t->io_min) - 1); /* Verify that new alignment_offset is on a logical block boundary */ - if (t->alignment_offset & (t->logical_block_size - 1)) + if (t->alignment_offset & (t->logical_block_size - 1)) { t->misaligned = 1; + ret = -1; + } /* Discard alignment and granularity */ if (b->discard_granularity) { @@ -626,7 +635,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, (t->discard_granularity - 1); } - return t->misaligned ? -1 : 0; + return ret; } EXPORT_SYMBOL(blk_stack_limits); -- cgit v1.2.2 From dd3d145d49c5816b79acc6761ebbd842bc50b0ee Mon Sep 17 00:00:00 2001 From: "Martin K. 
Petersen" Date: Mon, 11 Jan 2010 03:21:48 -0500 Subject: block: Fix discard alignment calculation and printing Discard alignment reporting for partitions was incorrect. Update to match the algorithm used elsewhere. The alignment can be negative (misaligned). Fix format string accordingly. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index b11a4ad7d571..d13ba76a169c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -867,7 +867,7 @@ static ssize_t disk_discard_alignment_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%u\n", queue_discard_alignment(disk->queue)); + return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue)); } static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); -- cgit v1.2.2 From 17be8c245054b9c7786545af3ba3ca4e54cd4ad9 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Mon, 11 Jan 2010 03:21:49 -0500 Subject: block: bdev_stack_limits wrapper DM does not want to know about partition offsets. Add a partition-aware wrapper that DM can use when stacking block devices. Signed-off-by: Martin K. Petersen Acked-by: Mike Snitzer Reviewed-by: Alasdair G Kergon Signed-off-by: Jens Axboe --- block/blk-settings.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index 127f82551855..5eeb9e0d256e 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -639,6 +639,28 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, } EXPORT_SYMBOL(blk_stack_limits); +/** + * bdev_stack_limits - adjust queue limits for stacked drivers + * @t: the stacking driver limits (top device) + * @bdev: the component block_device (bottom) + * @start: first data sector within component device + * + * Description: + * Merges queue limits for a top device and a block_device. Returns + * 0 if alignment didn't change. Returns -1 if adding the bottom + * device caused misalignment. + */ +int bdev_stack_limits(struct queue_limits *t, struct block_device *bdev, + sector_t start) +{ + struct request_queue *bq = bdev_get_queue(bdev); + + start += get_start_sect(bdev); + + return blk_stack_limits(t, &bq->limits, start << 9); +} +EXPORT_SYMBOL(bdev_stack_limits); + /** * disk_stack_limits - adjust queue limits for stacked drivers * @disk: MD/DM gendisk (top) -- cgit v1.2.2 From ce289321b7dc1eb108e3df0dec872b7429ef49f7 Mon Sep 17 00:00:00 2001 From: Kirill Afonshin Date: Fri, 8 Jan 2010 22:09:59 +0300 Subject: block: removed unused as_io_context It isn't used anymore, since AS was deleted. Signed-off-by: Jens Axboe --- block/blk-ioc.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index cbdabb0dd6d7..98e6bf61b0ac 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -39,8 +39,6 @@ int put_io_context(struct io_context *ioc) if (atomic_long_dec_and_test(&ioc->refcount)) { rcu_read_lock(); - if (ioc->aic && ioc->aic->dtor) - ioc->aic->dtor(ioc->aic); cfq_dtor(ioc); rcu_read_unlock(); @@ -76,8 +74,6 @@ void exit_io_context(struct task_struct *task) task_unlock(task); if (atomic_dec_and_test(&ioc->nr_tasks)) { - if (ioc->aic && ioc->aic->exit) - ioc->aic->exit(ioc->aic); cfq_exit(ioc); } @@ -97,7 +93,6 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node) ret->ioprio = 0; ret->last_waited = jiffies; /* doesn't matter... 
*/ ret->nr_batch_requests = 0; /* because this is 0 */ - ret->aic = NULL; INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); INIT_HLIST_HEAD(&ret->cic_list); ret->ioc_data = NULL; -- cgit v1.2.2 From e03a72e13648ac6277bf2bab6b8324d51f89c0fa Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Mon, 11 Jan 2010 03:21:51 -0500 Subject: block: Stop using byte offsets All callers of the stacking functions use 512-byte sector units rather than byte offsets. Simplify the code so the stacking functions take sectors when specifying data offsets. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-settings.c | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index 5eeb9e0d256e..78549c723783 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -507,7 +507,7 @@ static unsigned int lcm(unsigned int a, unsigned int b) * blk_stack_limits - adjust queue_limits for stacked devices * @t: the stacking driver limits (top device) * @b: the underlying queue limits (bottom, component device) - * @offset: offset to beginning of data within component device + * @start: first data sector within component device * * Description: * This function is used by stacking drivers like MD and DM to ensure @@ -525,10 +525,9 @@ static unsigned int lcm(unsigned int a, unsigned int b) * the alignment_offset is undefined. */ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, - sector_t offset) + sector_t start) { - sector_t alignment; - unsigned int top, bottom, ret = 0; + unsigned int top, bottom, alignment, ret = 0; t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); @@ -548,7 +547,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->misaligned |= b->misaligned; - alignment = queue_limit_alignment_offset(b, offset); + alignment = queue_limit_alignment_offset(b, start); /* Bottom device has different alignment. Check that it is * compatible with the current top alignment. @@ -611,11 +610,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, /* Discard alignment and granularity */ if (b->discard_granularity) { - unsigned int granularity = b->discard_granularity; - offset &= granularity - 1; - - alignment = (granularity + b->discard_alignment - offset) - & (granularity - 1); + alignment = queue_limit_discard_alignment(b, start); if (t->discard_granularity != 0 && t->discard_alignment != alignment) { @@ -657,7 +652,7 @@ int bdev_stack_limits(struct queue_limits *t, struct block_device *bdev, start += get_start_sect(bdev); - return blk_stack_limits(t, &bq->limits, start << 9); + return blk_stack_limits(t, &bq->limits, start); } EXPORT_SYMBOL(bdev_stack_limits); @@ -668,9 +663,8 @@ EXPORT_SYMBOL(bdev_stack_limits); * @offset: offset to beginning of data within component device * * Description: - * Merges the limits for two queues. Returns 0 if alignment - * didn't change. Returns -1 if adding the bottom device caused - * misalignment. + * Merges the limits for a top level gendisk and a bottom level + * block_device. 
*/ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, sector_t offset) @@ -678,9 +672,7 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, struct request_queue *t = disk->queue; struct request_queue *b = bdev_get_queue(bdev); - offset += get_start_sect(bdev) << 9; - - if (blk_stack_limits(&t->limits, &b->limits, offset) < 0) { + if (bdev_stack_limits(&t->limits, bdev, offset >> 9) < 0) { char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE]; disk_name(disk, 0, top); -- cgit v1.2.2 From 875feb63b9567442be73efbcc9a8470e376d6423 Mon Sep 17 00:00:00 2001 From: Divyesh Shah Date: Wed, 6 Jan 2010 18:58:20 -0800 Subject: cfq-iosched: Respect ioprio_class when preempting In cfq_should_preempt(), we currently allow some cases where a non-RT request can preempt an ongoing RT cfqq timeslice. This should not happen. Examples include: o A sync_noidle wl type non-RT request pre-empting a sync_noidle wl type cfqq on which we are idling. o Once we have per-cgroup async queues, a non-RT sync request pre-empting a RT async cfqq. Signed-off-by: Divyesh Shah Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 918c7fd9aeb1..ee130f14d1fc 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3076,6 +3076,12 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, if (cfq_class_idle(cfqq)) return true; + /* + * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice. + */ + if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq)) + return false; + /* * if the new request is sync, but the currently running queue is * not, let the sync request have priority. -- cgit v1.2.2 From 488991e28e55b4fbca8067edf0259f69d1a6f92c Mon Sep 17 00:00:00 2001 From: "Alan D. Brunelle" Date: Fri, 29 Jan 2010 09:04:08 +0100 Subject: block: Added in stricter no merge semantics for block I/O Updated 'nomerges' tunable to accept a value of '2' - indicating that _no_ merges at all are to be attempted (not even the simple one-hit cache). The following table illustrates the additional benefit - 5 minute runs of a random I/O load were applied to a dozen devices on a 16-way x86_64 system. nomerges Throughput %System Improvement (tput / %sys) -------- ------------ ----------- ------------------------- 0 12.45 MB/sec 0.669365609 1 12.50 MB/sec 0.641519199 0.40% / 2.71% 2 12.52 MB/sec 0.639849750 0.56% / 2.96% Signed-off-by: Alan D. 
Brunelle Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 11 +++++++---- block/elevator.c | 11 ++++++++++- 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 8606c9543fdd..e85442415db3 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -189,7 +189,8 @@ static ssize_t queue_nonrot_store(struct request_queue *q, const char *page, static ssize_t queue_nomerges_show(struct request_queue *q, char *page) { - return queue_var_show(blk_queue_nomerges(q), page); + return queue_var_show((blk_queue_nomerges(q) << 1) | + blk_queue_noxmerges(q), page); } static ssize_t queue_nomerges_store(struct request_queue *q, const char *page, @@ -199,10 +200,12 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page, ssize_t ret = queue_var_store(&nm, page, count); spin_lock_irq(q->queue_lock); - if (nm) + queue_flag_clear(QUEUE_FLAG_NOMERGES, q); + queue_flag_clear(QUEUE_FLAG_NOXMERGES, q); + if (nm == 2) queue_flag_set(QUEUE_FLAG_NOMERGES, q); - else - queue_flag_clear(QUEUE_FLAG_NOMERGES, q); + else if (nm) + queue_flag_set(QUEUE_FLAG_NOXMERGES, q); spin_unlock_irq(q->queue_lock); return ret; diff --git a/block/elevator.c b/block/elevator.c index 9ad5ccc4c5ee..ee3a883840f2 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -473,6 +473,15 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) struct request *__rq; int ret; + /* + * Levels of merges: + * nomerges: No merges at all attempted + * noxmerges: Only simple one-hit cache try + * merges: All merge tries attempted + */ + if (blk_queue_nomerges(q)) + return ELEVATOR_NO_MERGE; + /* * First try one-hit cache. */ @@ -484,7 +493,7 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) } } - if (blk_queue_nomerges(q)) + if (blk_queue_noxmerges(q)) return ELEVATOR_NO_MERGE; /* -- cgit v1.2.2 From bcf4dd43424cdfd8195f3955300a579fe58e9911 Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Mon, 1 Feb 2010 09:58:54 +0100 Subject: blk-cgroup: Fix potential deadlock in blk-cgroup I triggered a lockdep warning as following. ======================================================= [ INFO: possible circular locking dependency detected ] 2.6.33-rc2 #1 ------------------------------------------------------- test_io_control/7357 is trying to acquire lock: (blkio_list_lock){+.+...}, at: [] blkiocg_weight_write+0x82/0x9e but task is already holding lock: (&(&blkcg->lock)->rlock){......}, at: [] blkiocg_weight_write+0x3b/0x9e which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #2 (&(&blkcg->lock)->rlock){......}: [] validate_chain+0x8bc/0xb9c [] __lock_acquire+0x723/0x789 [] lock_acquire+0x90/0xa7 [] _raw_spin_lock_irqsave+0x27/0x5a [] blkiocg_add_blkio_group+0x1a/0x6d [] cfq_get_queue+0x225/0x3de [] cfq_set_request+0x217/0x42d [] elv_set_request+0x17/0x26 [] get_request+0x203/0x2c5 [] get_request_wait+0x18/0x10e [] __make_request+0x2ba/0x375 [] generic_make_request+0x28d/0x30f [] submit_bio+0x8a/0x8f [] submit_bh+0xf0/0x10f [] ll_rw_block+0xc0/0xf9 [] ext3_find_entry+0x319/0x544 [ext3] [] ext3_lookup+0x2c/0xb9 [ext3] [] do_lookup+0xd3/0x172 [] link_path_walk+0x5fb/0x95c [] path_walk+0x3c/0x81 [] do_path_lookup+0x21/0x8a [] do_filp_open+0xf0/0x978 [] open_exec+0x1b/0xb7 [] do_execve+0xbb/0x266 [] sys_execve+0x24/0x4a [] ptregs_execve+0x12/0x18 -> #1 (&(&q->__queue_lock)->rlock){..-.-.}: [] validate_chain+0x8bc/0xb9c [] __lock_acquire+0x723/0x789 [] lock_acquire+0x90/0xa7 [] _raw_spin_lock_irqsave+0x27/0x5a [] cfq_unlink_blkio_group+0x17/0x41 [] blkiocg_destroy+0x72/0xc7 [] cgroup_diput+0x4a/0xb2 [] dentry_iput+0x93/0xb7 [] d_kill+0x1c/0x36 [] dput+0xf5/0xfe [] do_rmdir+0x95/0xbe [] sys_rmdir+0x10/0x12 [] sysenter_do_call+0x12/0x32 -> #0 (blkio_list_lock){+.+...}: [] validate_chain+0x61c/0xb9c [] __lock_acquire+0x723/0x789 [] lock_acquire+0x90/0xa7 [] _raw_spin_lock+0x1e/0x4e [] blkiocg_weight_write+0x82/0x9e [] cgroup_file_write+0xc6/0x1c0 [] vfs_write+0x8c/0x116 [] sys_write+0x3b/0x60 [] sysenter_do_call+0x12/0x32 other info that might help us debug this: 1 lock held by test_io_control/7357: #0: (&(&blkcg->lock)->rlock){......}, at: [] blkiocg_weight_write+0x3b/0x9e stack backtrace: Pid: 7357, comm: test_io_control Not tainted 2.6.33-rc2 #1 Call Trace: [] print_circular_bug+0x91/0x9d [] validate_chain+0x61c/0xb9c [] __lock_acquire+0x723/0x789 [] lock_acquire+0x90/0xa7 [] ? blkiocg_weight_write+0x82/0x9e [] _raw_spin_lock+0x1e/0x4e [] ? blkiocg_weight_write+0x82/0x9e [] blkiocg_weight_write+0x82/0x9e [] cgroup_file_write+0xc6/0x1c0 [] ? trace_hardirqs_off+0xb/0xd [] ? cpu_clock+0x2e/0x44 [] ? security_file_permission+0xf/0x11 [] ? rw_verify_area+0x8a/0xad [] ? cgroup_file_write+0x0/0x1c0 [] vfs_write+0x8c/0x116 [] sys_write+0x3b/0x60 [] sysenter_do_call+0x12/0x32 To prevent deadlock, we should take locks as following sequence: blkio_list_lock -> queue_lock -> blkcg_lock. The following patch should fix this bug. Signed-off-by: Gui Jianfeng Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 1fa2654db0a6..e7dbbaf5fb3e 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -147,16 +147,16 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) return -EINVAL; blkcg = cgroup_to_blkio_cgroup(cgroup); + spin_lock(&blkio_list_lock); spin_lock_irq(&blkcg->lock); blkcg->weight = (unsigned int)val; hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { - spin_lock(&blkio_list_lock); list_for_each_entry(blkiop, &blkio_list, list) blkiop->ops.blkio_update_group_weight_fn(blkg, blkcg->weight); - spin_unlock(&blkio_list_lock); } spin_unlock_irq(&blkcg->lock); + spin_unlock(&blkio_list_lock); return 0; } -- cgit v1.2.2 From 1efe8fe1c2240acc476bed77740883df63373862 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Tue, 2 Feb 2010 20:45:46 +0100 Subject: cfq-iosched: Do not idle on async queues Few weeks back, Shaohua Li had posted similar patch. 
I am reposting it with more test results. This patch does two things. - Do not idle on async queues. - It also changes the write queue depth CFQ drives (cfq_may_dispatch()). Currently, we seem to driving queue depth of 1 always for WRITES. This is true even if there is only one write queue in the system and all the logic of infinite queue depth in case of single busy queue as well as slowly increasing queue depth based on last delayed sync request does not seem to be kicking in at all. This patch will allow deeper WRITE queue depths (subjected to the other WRITE queue depth contstraints like cfq_quantum and last delayed sync request). Shaohua Li had reported getting more out of his SSD. For me, I have got one Lun exported from an HP EVA and when pure buffered writes are on, I can get more out of the system. Following are test results of pure buffered writes (with end_fsync=1) with vanilla and patched kernel. These results are average of 3 sets of run with increasing number of threads. AVERAGE[bufwfs][vanilla] ------- job Set NR ReadBW(KB/s) MaxClat(us) WriteBW(KB/s) MaxClat(us) --- --- -- ------------ ----------- ------------- ----------- bufwfs 3 1 0 0 95349 474141 bufwfs 3 2 0 0 100282 806926 bufwfs 3 4 0 0 109989 2.7301e+06 bufwfs 3 8 0 0 116642 3762231 bufwfs 3 16 0 0 118230 6902970 AVERAGE[bufwfs] [patched kernel] ------- bufwfs 3 1 0 0 270722 404352 bufwfs 3 2 0 0 206770 1.06552e+06 bufwfs 3 4 0 0 195277 1.62283e+06 bufwfs 3 8 0 0 260960 2.62979e+06 bufwfs 3 16 0 0 299260 1.70731e+06 I also ran buffered writes along with some sequential reads and some buffered reads going on in the system on a SATA disk because the potential risk could be that we should not be driving queue depth higher in presence of sync IO going to keep the max clat low. With some random and sequential reads going on in the system on one SATA disk I did not see any significant increase in max clat. So it looks like other WRITE queue depth control logic is doing its job. Here are the results. AVERAGE[brr, bsr, bufw together] [vanilla] ------- job Set NR ReadBW(KB/s) MaxClat(us) WriteBW(KB/s) MaxClat(us) --- --- -- ------------ ----------- ------------- ----------- brr 3 1 850 546345 0 0 bsr 3 1 14650 729543 0 0 bufw 3 1 0 0 23908 8274517 brr 3 2 981.333 579395 0 0 bsr 3 2 14149.7 1175689 0 0 bufw 3 2 0 0 21921 1.28108e+07 brr 3 4 898.333 1.75527e+06 0 0 bsr 3 4 12230.7 1.40072e+06 0 0 bufw 3 4 0 0 19722.3 2.4901e+07 brr 3 8 900 3160594 0 0 bsr 3 8 9282.33 1.91314e+06 0 0 bufw 3 8 0 0 18789.3 23890622 AVERAGE[brr, bsr, bufw mixed] [patched kernel] ------- job Set NR ReadBW(KB/s) MaxClat(us) WriteBW(KB/s) MaxClat(us) --- --- -- ------------ ----------- ------------- ----------- brr 3 1 837 417973 0 0 bsr 3 1 14357.7 591275 0 0 bufw 3 1 0 0 24869.7 8910662 brr 3 2 1038.33 543434 0 0 bsr 3 2 13351.3 1205858 0 0 bufw 3 2 0 0 18626.3 13280370 brr 3 4 913 1.86861e+06 0 0 bsr 3 4 12652.3 1430974 0 0 bufw 3 4 0 0 15343.3 2.81305e+07 brr 3 8 890 2.92695e+06 0 0 bsr 3 8 9635.33 1.90244e+06 0 0 bufw 3 8 0 0 17200.3 24424392 So looks like it might make sense to include this patch. 
Thanks Vivek Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index ee130f14d1fc..17b768d0d42f 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1803,7 +1803,7 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) * Otherwise, we do only if they are the last ones * in their service tree. */ - return service_tree->count == 1; + return service_tree->count == 1 && cfq_cfqq_sync(cfqq); } static void cfq_arm_slice_timer(struct cfq_data *cfqd) -- cgit v1.2.2 From ae54abed636d18f7939c965f21ad126001dbe34c Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 5 Feb 2010 13:11:45 +0100 Subject: cfq-iosched: split seeky coop queues after one slice Currently we split seeky coop queues after 1s, which is too big. Below patch marks seeky coop queue split_coop flag after one slice. After that, if new requests come in, the queues will be splitted. Patch is suggested by Corrado. Signed-off-by: Shaohua Li Reviewed-by: Corrado Zoccolo Acked-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 49 ++++++++++++++++--------------------------------- 1 file changed, 16 insertions(+), 33 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 17b768d0d42f..023f4e69a337 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -42,16 +42,13 @@ static const int cfq_hist_divisor = 4; */ #define CFQ_MIN_TT (2) -/* - * Allow merged cfqqs to perform this amount of seeky I/O before - * deciding to break the queues up again. - */ -#define CFQQ_COOP_TOUT (HZ) - #define CFQ_SLICE_SCALE (5) #define CFQ_HW_QUEUE_MIN (5) #define CFQ_SERVICE_SHIFT 12 +#define CFQQ_SEEK_THR 8 * 1024 +#define CFQQ_SEEKY(cfqq) ((cfqq)->seek_mean > CFQQ_SEEK_THR) + #define RQ_CIC(rq) \ ((struct cfq_io_context *) (rq)->elevator_private) #define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) @@ -137,7 +134,6 @@ struct cfq_queue { u64 seek_total; sector_t seek_mean; sector_t last_request_pos; - unsigned long seeky_start; pid_t pid; @@ -314,6 +310,7 @@ enum cfqq_state_flags { CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ CFQ_CFQQ_FLAG_sync, /* synchronous queue */ CFQ_CFQQ_FLAG_coop, /* cfqq is shared */ + CFQ_CFQQ_FLAG_split_coop, /* shared cfqq will be splitted */ CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */ CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */ }; @@ -342,6 +339,7 @@ CFQ_CFQQ_FNS(prio_changed); CFQ_CFQQ_FNS(slice_new); CFQ_CFQQ_FNS(sync); CFQ_CFQQ_FNS(coop); +CFQ_CFQQ_FNS(split_coop); CFQ_CFQQ_FNS(deep); CFQ_CFQQ_FNS(wait_busy); #undef CFQ_CFQQ_FNS @@ -1565,6 +1563,15 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfq_clear_cfqq_wait_request(cfqq); cfq_clear_cfqq_wait_busy(cfqq); + /* + * If this cfqq is shared between multiple processes, check to + * make sure that those processes are still issuing I/Os within + * the mean seek distance. If not, it may be time to break the + * queues apart again. 
+ */ + if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq)) + cfq_mark_cfqq_split_coop(cfqq); + /* * store what was left of this slice, if the queue idled/timed out */ @@ -1663,9 +1670,6 @@ static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd, return cfqd->last_position - blk_rq_pos(rq); } -#define CFQQ_SEEK_THR 8 * 1024 -#define CFQQ_SEEKY(cfqq) ((cfqq)->seek_mean > CFQQ_SEEK_THR) - static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct request *rq, bool for_preempt) { @@ -3000,19 +3004,6 @@ cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq, total = cfqq->seek_total + (cfqq->seek_samples/2); do_div(total, cfqq->seek_samples); cfqq->seek_mean = (sector_t)total; - - /* - * If this cfqq is shared between multiple processes, check to - * make sure that those processes are still issuing I/Os within - * the mean seek distance. If not, it may be time to break the - * queues apart again. - */ - if (cfq_cfqq_coop(cfqq)) { - if (CFQQ_SEEKY(cfqq) && !cfqq->seeky_start) - cfqq->seeky_start = jiffies; - else if (!CFQQ_SEEKY(cfqq)) - cfqq->seeky_start = 0; - } } /* @@ -3453,14 +3444,6 @@ cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic, return cic_to_cfqq(cic, 1); } -static int should_split_cfqq(struct cfq_queue *cfqq) -{ - if (cfqq->seeky_start && - time_after(jiffies, cfqq->seeky_start + CFQQ_COOP_TOUT)) - return 1; - return 0; -} - /* * Returns NULL if a new cfqq should be allocated, or the old cfqq if this * was the last process referring to said cfqq. @@ -3469,9 +3452,9 @@ static struct cfq_queue * split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq) { if (cfqq_process_refs(cfqq) == 1) { - cfqq->seeky_start = 0; cfqq->pid = current->pid; cfq_clear_cfqq_coop(cfqq); + cfq_clear_cfqq_split_coop(cfqq); return cfqq; } @@ -3510,7 +3493,7 @@ new_queue: /* * If the queue was seeky for too long, break it apart. */ - if (cfq_cfqq_coop(cfqq) && should_split_cfqq(cfqq)) { + if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) { cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq"); cfqq = split_cfqq(cic, cfqq); if (!cfqq) -- cgit v1.2.2 From 3ad2f3fbb961429d2aa627465ae4829758bc7e07 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Wed, 3 Feb 2010 08:01:28 +0800 Subject: tree-wide: Assorted spelling fixes In particular, several occurances of funny versions of 'success', 'unknown', 'therefore', 'acknowledge', 'argument', 'achieve', 'address', 'beginning', 'desirable', 'separate' and 'necessary' are fixed. Signed-off-by: Daniel Mack Cc: Joe Perches Cc: Junio C Hamano Signed-off-by: Jiri Kosina --- block/bsg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/bsg.c b/block/bsg.c index a9fd2d84b53a..46597a6bd112 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -260,7 +260,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm, return ERR_PTR(ret); /* - * map scatter-gather elements seperately and string them to request + * map scatter-gather elements separately and string them to request */ rq = blk_get_request(q, rw, GFP_KERNEL); if (!rq) -- cgit v1.2.2 From c4081ba5c9f6f7bdffe49e501a8604a2c0797ef9 Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Mon, 22 Feb 2010 13:49:24 +0100 Subject: cfq: reorder cfq_queue removing padding on 64bit This removes 8 bytes of padding from struct cfq_queue on 64 bit builds, shrinking it's size to 256 bytes, so fitting into 1 fewer cachelines and allowing 1 more object/slab in it's kmem_cache. 
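The saving is the usual structure-packing effect. A toy illustration (these are not the kernel structs, just the rule being applied):

	struct bad  { int a; long b; int c; long d; };	/* 32 bytes on LP64: 4-byte holes after a and after c */
	struct good { int a; int c; long b; long d; };	/* 24 bytes: the two ints share one 8-byte slot */

Grouping members of the same size and alignment together, as the reordering below does for slice_dispatch, pid and the seek statistics, removes those holes.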
Signed-off-by: Richard Kennedy Reviewed-by: Jeff Moyer ---- patch against 2.6.33-rc8 tested on x86_64 AMDX2 Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 023f4e69a337..e3dedfd3bcb4 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -115,11 +115,11 @@ struct cfq_queue { /* time when queue got scheduled in to dispatch first request. */ unsigned long dispatch_start; unsigned int allocated_slice; + unsigned int slice_dispatch; /* time when first request from queue completed and slice started. */ unsigned long slice_start; unsigned long slice_end; long slice_resid; - unsigned int slice_dispatch; /* pending metadata requests */ int meta_pending; @@ -130,13 +130,13 @@ struct cfq_queue { unsigned short ioprio, org_ioprio; unsigned short ioprio_class, org_ioprio_class; + pid_t pid; + unsigned int seek_samples; u64 seek_total; sector_t seek_mean; sector_t last_request_pos; - pid_t pid; - struct cfq_rb_root *service_tree; struct cfq_queue *new_cfqq; struct cfq_group *cfqg; -- cgit v1.2.2 From 79da0644a8e0838522828f106e4049639eea6baf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 23 Feb 2010 08:40:43 +0100 Subject: Revert "block: improve queue_should_plug() by looking at IO depths" This reverts commit fb1e75389bd06fd5987e9cda1b4e0305c782f854. "Benjamin S." reports that the patch in question causes a big drop in sequential throughput for him, dropping from 200MB/sec down to only 70MB/sec. Needs to be investigated more fully, for now lets just revert the offending commit. Conflicts: include/linux/blkdev.h Signed-off-by: Jens Axboe --- block/blk-core.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 718897e6d37f..d1a9a0a64f95 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1147,7 +1147,7 @@ void init_request_from_bio(struct request *req, struct bio *bio) */ static inline bool queue_should_plug(struct request_queue *q) { - return !(blk_queue_nonrot(q) && blk_queue_queuing(q)); + return !(blk_queue_nonrot(q) && blk_queue_tagged(q)); } static int __make_request(struct request_queue *q, struct bio *bio) @@ -1859,15 +1859,8 @@ void blk_dequeue_request(struct request *rq) * and to it is freed is accounted as io that is in progress at * the driver side. */ - if (blk_account_rq(rq)) { + if (blk_account_rq(rq)) q->in_flight[rq_is_sync(rq)]++; - /* - * Mark this device as supporting hardware queuing, if - * we have more IOs in flight than 4. - */ - if (!blk_queue_queuing(q) && queue_in_flight(q) > 4) - set_bit(QUEUE_FLAG_CQ, &q->queue_flags); - } } /** -- cgit v1.2.2 From bddd87c7e622ea681c665049027ed84cdcafcb09 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 23 Feb 2010 08:55:42 +0100 Subject: blk-core: use BIO list management functions Now that the bio list management stuff is generic, convert generic_make_request to use bio lists instead of its own private bio list implementation. 
Signed-off-by: Akinobu Mita Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 718897e6d37f..44b6d691728d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1490,9 +1490,9 @@ end_io: /* * We only want one ->make_request_fn to be active at a time, * else stack usage with stacked devices could be a problem. - * So use current->bio_{list,tail} to keep a list of requests + * So use current->bio_list to keep a list of requests * submited by a make_request_fn function. - * current->bio_tail is also used as a flag to say if + * current->bio_list is also used as a flag to say if * generic_make_request is currently active in this task or not. * If it is NULL, then no make_request is active. If it is non-NULL, * then a make_request is active, and new requests should be added @@ -1500,11 +1500,11 @@ end_io: */ void generic_make_request(struct bio *bio) { - if (current->bio_tail) { + struct bio_list bio_list_on_stack; + + if (current->bio_list) { /* make_request is active */ - *(current->bio_tail) = bio; - bio->bi_next = NULL; - current->bio_tail = &bio->bi_next; + bio_list_add(current->bio_list, bio); return; } /* following loop may be a bit non-obvious, and so deserves some @@ -1512,30 +1512,27 @@ void generic_make_request(struct bio *bio) * Before entering the loop, bio->bi_next is NULL (as all callers * ensure that) so we have a list with a single bio. * We pretend that we have just taken it off a longer list, so - * we assign bio_list to the next (which is NULL) and bio_tail - * to &bio_list, thus initialising the bio_list of new bios to be + * we assign bio_list to a pointer to the bio_list_on_stack, + * thus initialising the bio_list of new bios to be * added. __generic_make_request may indeed add some more bios * through a recursive call to generic_make_request. If it * did, we find a non-NULL value in bio_list and re-enter the loop * from the top. In this case we really did just take the bio - * of the top of the list (no pretending) and so fixup bio_list and - * bio_tail or bi_next, and call into __generic_make_request again. + * of the top of the list (no pretending) and so remove it from + * bio_list, and call into __generic_make_request again. * * The loop was structured like this to make only one call to * __generic_make_request (which is important as it is large and * inlined) and to keep the structure simple. */ BUG_ON(bio->bi_next); + bio_list_init(&bio_list_on_stack); + current->bio_list = &bio_list_on_stack; do { - current->bio_list = bio->bi_next; - if (bio->bi_next == NULL) - current->bio_tail = ¤t->bio_list; - else - bio->bi_next = NULL; __generic_make_request(bio); - bio = current->bio_list; + bio = bio_list_pop(current->bio_list); } while (bio); - current->bio_tail = NULL; /* deactivate */ + current->bio_list = NULL; /* deactivate */ } EXPORT_SYMBOL(generic_make_request); -- cgit v1.2.2 From 024f9066165ffe9c8284431c78adcbcddd309831 Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Fri, 26 Feb 2010 08:56:15 +0100 Subject: cfq: Remove useless css reference get There's no need to take css reference here, for the caller has already called rcu_read_lock() to prevent cgroup from being removed. 
Signed-off-by: Gui Jianfeng Reviewed-by: Li Zefan Acked-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 14 -------------- block/blk-cgroup.h | 3 --- block/cfq-iosched.c | 5 ----- 3 files changed, 22 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index e7dbbaf5fb3e..c85d74cae200 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -23,20 +23,6 @@ static LIST_HEAD(blkio_list); struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT }; EXPORT_SYMBOL_GPL(blkio_root_cgroup); -bool blkiocg_css_tryget(struct blkio_cgroup *blkcg) -{ - if (!css_tryget(&blkcg->css)) - return false; - return true; -} -EXPORT_SYMBOL_GPL(blkiocg_css_tryget); - -void blkiocg_css_put(struct blkio_cgroup *blkcg) -{ - css_put(&blkcg->css); -} -EXPORT_SYMBOL_GPL(blkiocg_css_put); - struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 4d316df863b4..84bf745fa775 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -43,9 +43,6 @@ struct blkio_group { unsigned long sectors; }; -extern bool blkiocg_css_tryget(struct blkio_cgroup *blkcg); -extern void blkiocg_css_put(struct blkio_cgroup *blkcg); - typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg, unsigned int weight); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index e3dedfd3bcb4..10eb286f1f49 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -951,10 +951,6 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; unsigned int major, minor; - /* Do we need to take this reference */ - if (!blkiocg_css_tryget(blkcg)) - return NULL;; - cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); if (cfqg || !create) goto done; @@ -985,7 +981,6 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); done: - blkiocg_css_put(blkcg); return cfqg; } -- cgit v1.2.2 From 2800aac1114548a9b47b6e0d398117cc01b89685 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 26 Feb 2010 00:20:35 -0500 Subject: block: Update blk_queue_max_sectors and documentation Clarify blk_queue_max_sectors and update documentation. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-settings.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index 78549c723783..605df9b3de8f 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -212,26 +212,30 @@ EXPORT_SYMBOL(blk_queue_bounce_limit); /** * blk_queue_max_sectors - set max sectors for a request for this queue * @q: the request queue for the device - * @max_sectors: max sectors in the usual 512b unit + * @max_hw_sectors: max hardware sectors in the usual 512b unit * * Description: - * Enables a low level driver to set an upper limit on the size of - * received requests. + * Enables a low level driver to set a hard upper limit, + * max_hw_sectors, on the size of requests. max_hw_sectors is set by + * the device driver based upon the combined capabilities of I/O + * controller and storage device. + * + * max_sectors is a soft limit imposed by the block layer for + * filesystem type requests. 
This value can be overridden on a + * per-device basis in /sys/block//queue/max_sectors_kb. + * The soft limit can not exceed max_hw_sectors. **/ -void blk_queue_max_sectors(struct request_queue *q, unsigned int max_sectors) +void blk_queue_max_sectors(struct request_queue *q, unsigned int max_hw_sectors) { - if ((max_sectors << 9) < PAGE_CACHE_SIZE) { - max_sectors = 1 << (PAGE_CACHE_SHIFT - 9); + if ((max_hw_sectors << 9) < PAGE_CACHE_SIZE) { + max_hw_sectors = 1 << (PAGE_CACHE_SHIFT - 9); printk(KERN_INFO "%s: set to minimum %d\n", - __func__, max_sectors); + __func__, max_hw_sectors); } - if (BLK_DEF_MAX_SECTORS > max_sectors) - q->limits.max_hw_sectors = q->limits.max_sectors = max_sectors; - else { - q->limits.max_sectors = BLK_DEF_MAX_SECTORS; - q->limits.max_hw_sectors = max_sectors; - } + q->limits.max_hw_sectors = max_hw_sectors; + q->limits.max_sectors = min_t(unsigned int, max_hw_sectors, + BLK_DEF_MAX_SECTORS); } EXPORT_SYMBOL(blk_queue_max_sectors); -- cgit v1.2.2 From e751e76a5f7adeee7438e68b0965559ad2864d0d Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 26 Feb 2010 00:20:36 -0500 Subject: block: Remove unused accessor function blk_queue_max_hw_sectors is no longer called by any subsystem and can be removed. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-settings.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index 605df9b3de8f..4db46f2fcbe5 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -239,15 +239,6 @@ void blk_queue_max_sectors(struct request_queue *q, unsigned int max_hw_sectors) } EXPORT_SYMBOL(blk_queue_max_sectors); -void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_sectors) -{ - if (BLK_DEF_MAX_SECTORS > max_sectors) - q->limits.max_hw_sectors = BLK_DEF_MAX_SECTORS; - else - q->limits.max_hw_sectors = max_sectors; -} -EXPORT_SYMBOL(blk_queue_max_hw_sectors); - /** * blk_queue_max_discard_sectors - set max sectors for a single discard * @q: the request queue for the device -- cgit v1.2.2 From eb28d31bc97e6374d81f404da309401ffaed467b Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 26 Feb 2010 00:20:37 -0500 Subject: block: Add BLK_ prefix to definitions Add a BLK_ prefix to block layer constants. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-settings.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index 4db46f2fcbe5..3c53b0beb8dd 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -94,7 +94,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->max_phys_segments = MAX_PHYS_SEGMENTS; lim->max_hw_segments = MAX_HW_SEGMENTS; lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; - lim->max_segment_size = MAX_SEGMENT_SIZE; + lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; lim->max_sectors = BLK_DEF_MAX_SECTORS; lim->max_hw_sectors = INT_MAX; lim->max_discard_sectors = 0; @@ -154,7 +154,7 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) q->unplug_timer.data = (unsigned long)q; blk_set_default_limits(&q->limits); - blk_queue_max_sectors(q, SAFE_MAX_SECTORS); + blk_queue_max_sectors(q, BLK_SAFE_MAX_SECTORS); /* * If the caller didn't supply a lock, fall back to our embedded -- cgit v1.2.2 From 086fa5ff0854c676ec333760f4c0154b3b242616 Mon Sep 17 00:00:00 2001 From: "Martin K. 
Petersen" Date: Fri, 26 Feb 2010 00:20:38 -0500 Subject: block: Rename blk_queue_max_sectors to blk_queue_max_hw_sectors The block layer calling convention is blk_queue_. blk_queue_max_sectors predates this practice, leading to some confusion. Rename the function to appropriately reflect that its intended use is to set max_hw_sectors. Also introduce a temporary wrapper for backwards compability. This can be removed after the merge window is closed. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-settings.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index 3c53b0beb8dd..61afae9dbc6d 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -154,7 +154,7 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) q->unplug_timer.data = (unsigned long)q; blk_set_default_limits(&q->limits); - blk_queue_max_sectors(q, BLK_SAFE_MAX_SECTORS); + blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS); /* * If the caller didn't supply a lock, fall back to our embedded @@ -210,7 +210,7 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask) EXPORT_SYMBOL(blk_queue_bounce_limit); /** - * blk_queue_max_sectors - set max sectors for a request for this queue + * blk_queue_max_hw_sectors - set max sectors for a request for this queue * @q: the request queue for the device * @max_hw_sectors: max hardware sectors in the usual 512b unit * @@ -225,7 +225,7 @@ EXPORT_SYMBOL(blk_queue_bounce_limit); * per-device basis in /sys/block//queue/max_sectors_kb. * The soft limit can not exceed max_hw_sectors. **/ -void blk_queue_max_sectors(struct request_queue *q, unsigned int max_hw_sectors) +void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) { if ((max_hw_sectors << 9) < PAGE_CACHE_SIZE) { max_hw_sectors = 1 << (PAGE_CACHE_SHIFT - 9); @@ -237,7 +237,7 @@ void blk_queue_max_sectors(struct request_queue *q, unsigned int max_hw_sectors) q->limits.max_sectors = min_t(unsigned int, max_hw_sectors, BLK_DEF_MAX_SECTORS); } -EXPORT_SYMBOL(blk_queue_max_sectors); +EXPORT_SYMBOL(blk_queue_max_hw_sectors); /** * blk_queue_max_discard_sectors - set max sectors for a single discard -- cgit v1.2.2 From 8a78362c4eefc1deddbefe2c7f38aabbc2429d6b Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 26 Feb 2010 00:20:39 -0500 Subject: block: Consolidate phys_segment and hw_segment limits Except for SCSI no device drivers distinguish between physical and hardware segment limits. Consolidate the two into a single segment limit. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-core.c | 3 +-- block/blk-merge.c | 8 ++----- block/blk-settings.c | 60 ++++++++++++---------------------------------------- 3 files changed, 16 insertions(+), 55 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 36c0deebc2dc..9fe174dc74d1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1614,8 +1614,7 @@ int blk_rq_check_limits(struct request_queue *q, struct request *rq) * limitation. 
*/ blk_recalc_rq_segments(rq); - if (rq->nr_phys_segments > queue_max_phys_segments(q) || - rq->nr_phys_segments > queue_max_hw_segments(q)) { + if (rq->nr_phys_segments > queue_max_segments(q)) { printk(KERN_ERR "%s: over max segments limit.\n", __func__); return -EIO; } diff --git a/block/blk-merge.c b/block/blk-merge.c index 99cb5cf1f447..5e7dc9973458 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -206,8 +206,7 @@ static inline int ll_new_hw_segment(struct request_queue *q, { int nr_phys_segs = bio_phys_segments(q, bio); - if (req->nr_phys_segments + nr_phys_segs > queue_max_hw_segments(q) || - req->nr_phys_segments + nr_phys_segs > queue_max_phys_segments(q)) { + if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) { req->cmd_flags |= REQ_NOMERGE; if (req == q->last_merge) q->last_merge = NULL; @@ -300,10 +299,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, total_phys_segments--; } - if (total_phys_segments > queue_max_phys_segments(q)) - return 0; - - if (total_phys_segments > queue_max_hw_segments(q)) + if (total_phys_segments > queue_max_segments(q)) return 0; /* Merge is OK... */ diff --git a/block/blk-settings.c b/block/blk-settings.c index 61afae9dbc6d..31e7a9375c13 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -91,8 +91,7 @@ EXPORT_SYMBOL_GPL(blk_queue_lld_busy); */ void blk_set_default_limits(struct queue_limits *lim) { - lim->max_phys_segments = MAX_PHYS_SEGMENTS; - lim->max_hw_segments = MAX_HW_SEGMENTS; + lim->max_segments = BLK_MAX_SEGMENTS; lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; lim->max_sectors = BLK_DEF_MAX_SECTORS; @@ -252,17 +251,15 @@ void blk_queue_max_discard_sectors(struct request_queue *q, EXPORT_SYMBOL(blk_queue_max_discard_sectors); /** - * blk_queue_max_phys_segments - set max phys segments for a request for this queue + * blk_queue_max_segments - set max hw segments for a request for this queue * @q: the request queue for the device * @max_segments: max number of segments * * Description: * Enables a low level driver to set an upper limit on the number of - * physical data segments in a request. This would be the largest sized - * scatter list the driver could handle. + * hw data segments in a request. **/ -void blk_queue_max_phys_segments(struct request_queue *q, - unsigned short max_segments) +void blk_queue_max_segments(struct request_queue *q, unsigned short max_segments) { if (!max_segments) { max_segments = 1; @@ -270,33 +267,9 @@ void blk_queue_max_phys_segments(struct request_queue *q, __func__, max_segments); } - q->limits.max_phys_segments = max_segments; + q->limits.max_segments = max_segments; } -EXPORT_SYMBOL(blk_queue_max_phys_segments); - -/** - * blk_queue_max_hw_segments - set max hw segments for a request for this queue - * @q: the request queue for the device - * @max_segments: max number of segments - * - * Description: - * Enables a low level driver to set an upper limit on the number of - * hw data segments in a request. This would be the largest number of - * address/length pairs the host adapter can actually give at once - * to the device. 
- **/ -void blk_queue_max_hw_segments(struct request_queue *q, - unsigned short max_segments) -{ - if (!max_segments) { - max_segments = 1; - printk(KERN_INFO "%s: set to minimum %d\n", - __func__, max_segments); - } - - q->limits.max_hw_segments = max_segments; -} -EXPORT_SYMBOL(blk_queue_max_hw_segments); +EXPORT_SYMBOL(blk_queue_max_segments); /** * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg @@ -531,11 +504,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, b->seg_boundary_mask); - t->max_phys_segments = min_not_zero(t->max_phys_segments, - b->max_phys_segments); - - t->max_hw_segments = min_not_zero(t->max_hw_segments, - b->max_hw_segments); + t->max_segments = min_not_zero(t->max_segments, b->max_segments); t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size); @@ -739,22 +708,19 @@ EXPORT_SYMBOL(blk_queue_update_dma_pad); * does is adjust the queue so that the buf is always appended * silently to the scatterlist. * - * Note: This routine adjusts max_hw_segments to make room for - * appending the drain buffer. If you call - * blk_queue_max_hw_segments() or blk_queue_max_phys_segments() after - * calling this routine, you must set the limit to one fewer than your - * device can support otherwise there won't be room for the drain - * buffer. + * Note: This routine adjusts max_hw_segments to make room for appending + * the drain buffer. If you call blk_queue_max_segments() after calling + * this routine, you must set the limit to one fewer than your device + * can support otherwise there won't be room for the drain buffer. */ int blk_queue_dma_drain(struct request_queue *q, dma_drain_needed_fn *dma_drain_needed, void *buf, unsigned int size) { - if (queue_max_hw_segments(q) < 2 || queue_max_phys_segments(q) < 2) + if (queue_max_segments(q) < 2) return -EINVAL; /* make room for appending the drain */ - blk_queue_max_hw_segments(q, queue_max_hw_segments(q) - 1); - blk_queue_max_phys_segments(q, queue_max_phys_segments(q) - 1); + blk_queue_max_segments(q, queue_max_segments(q) - 1); q->dma_drain_needed = dma_drain_needed; q->dma_drain_buffer = buf; q->dma_drain_size = size; -- cgit v1.2.2 From 3dde36ddea3e07dd025c4c1ba47edec91606fec0 Mon Sep 17 00:00:00 2001 From: Corrado Zoccolo Date: Sat, 27 Feb 2010 19:45:39 +0100 Subject: cfq-iosched: rework seeky detection Current seeky detection is based on average seek lenght. This is suboptimal, since the average will not distinguish between: * a process doing medium sized seeks * a process doing some sequential requests interleaved with larger seeks and even a medium seek can take lot of time, if the requested sector happens to be behind the disk head in the rotation (50% probability). Therefore, we change the seeky queue detection to work as follows: * each request can be classified as sequential if it is very close to the current head position, i.e. it is likely in the disk cache (disks usually read more data than requested, and put it in cache for subsequent reads). Otherwise, the request is classified as seeky. * an history window of the last 32 requests is kept, storing the classification result. * A queue is marked as seeky if more than 1/8 of the last 32 requests were seeky. This patch fixes a regression reported by Yanmin, on mmap 64k random reads. 
Reported-by: Yanmin Zhang Signed-off-by: Corrado Zoccolo Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 54 ++++++++++++++--------------------------------------- 1 file changed, 14 insertions(+), 40 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 10eb286f1f49..3fd8afc2174e 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -46,8 +46,8 @@ static const int cfq_hist_divisor = 4; #define CFQ_HW_QUEUE_MIN (5) #define CFQ_SERVICE_SHIFT 12 -#define CFQQ_SEEK_THR 8 * 1024 -#define CFQQ_SEEKY(cfqq) ((cfqq)->seek_mean > CFQQ_SEEK_THR) +#define CFQQ_SEEK_THR (sector_t)(8 * 100) +#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) #define RQ_CIC(rq) \ ((struct cfq_io_context *) (rq)->elevator_private) @@ -132,9 +132,7 @@ struct cfq_queue { pid_t pid; - unsigned int seek_samples; - u64 seek_total; - sector_t seek_mean; + u32 seek_history; sector_t last_request_pos; struct cfq_rb_root *service_tree; @@ -1668,16 +1666,7 @@ static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd, static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct request *rq, bool for_preempt) { - sector_t sdist = cfqq->seek_mean; - - if (!sample_valid(cfqq->seek_samples)) - sdist = CFQQ_SEEK_THR; - - /* if seek_mean is big, using it as close criteria is meaningless */ - if (sdist > CFQQ_SEEK_THR && !for_preempt) - sdist = CFQQ_SEEK_THR; - - return cfq_dist_from_last(cfqd, rq) <= sdist; + return cfq_dist_from_last(cfqd, rq) <= CFQQ_SEEK_THR; } static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, @@ -2975,30 +2964,16 @@ static void cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct request *rq) { - sector_t sdist; - u64 total; - - if (!cfqq->last_request_pos) - sdist = 0; - else if (cfqq->last_request_pos < blk_rq_pos(rq)) - sdist = blk_rq_pos(rq) - cfqq->last_request_pos; - else - sdist = cfqq->last_request_pos - blk_rq_pos(rq); - - /* - * Don't allow the seek distance to get too large from the - * odd fragment, pagein, etc - */ - if (cfqq->seek_samples <= 60) /* second&third seek */ - sdist = min(sdist, (cfqq->seek_mean * 4) + 2*1024*1024); - else - sdist = min(sdist, (cfqq->seek_mean * 4) + 2*1024*64); + sector_t sdist = 0; + if (cfqq->last_request_pos) { + if (cfqq->last_request_pos < blk_rq_pos(rq)) + sdist = blk_rq_pos(rq) - cfqq->last_request_pos; + else + sdist = cfqq->last_request_pos - blk_rq_pos(rq); + } - cfqq->seek_samples = (7*cfqq->seek_samples + 256) / 8; - cfqq->seek_total = (7*cfqq->seek_total + (u64)256*sdist) / 8; - total = cfqq->seek_total + (cfqq->seek_samples/2); - do_div(total, cfqq->seek_samples); - cfqq->seek_mean = (sector_t)total; + cfqq->seek_history <<= 1; + cfqq->seek_history |= (sdist > CFQQ_SEEK_THR); } /* @@ -3023,8 +2998,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfq_mark_cfqq_deep(cfqq); if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || - (!cfq_cfqq_deep(cfqq) && sample_valid(cfqq->seek_samples) - && CFQQ_SEEKY(cfqq))) + (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) enable_idle = 0; else if (sample_valid(cic->ttime_samples)) { if (cic->ttime_mean > cfqd->cfq_slice_idle) -- cgit v1.2.2 From 41647e7a91338dba21773a16af7474ef95e0929e Mon Sep 17 00:00:00 2001 From: Corrado Zoccolo Date: Sat, 27 Feb 2010 19:45:40 +0100 Subject: cfq-iosched: rethink seeky detection for SSDs CFQ currently applies the same logic of detecting seeky queues and grouping them together for rotational disks as well as SSDs. 
For SSDs, the time to complete a request doesn't depend on the request location, but only on the size. This patch therefore changes the criterion to group queues by request size in case of SSDs, in order to achieve better fairness. Signed-off-by: Corrado Zoccolo Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 3fd8afc2174e..423aee3fd19b 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -47,6 +47,7 @@ static const int cfq_hist_divisor = 4; #define CFQ_SERVICE_SHIFT 12 #define CFQQ_SEEK_THR (sector_t)(8 * 100) +#define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32) #define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) #define RQ_CIC(rq) \ @@ -2965,6 +2966,7 @@ cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct request *rq) { sector_t sdist = 0; + sector_t n_sec = blk_rq_sectors(rq); if (cfqq->last_request_pos) { if (cfqq->last_request_pos < blk_rq_pos(rq)) sdist = blk_rq_pos(rq) - cfqq->last_request_pos; @@ -2973,7 +2975,10 @@ cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq, } cfqq->seek_history <<= 1; - cfqq->seek_history |= (sdist > CFQQ_SEEK_THR); + if (blk_queue_nonrot(cfqd->queue)) + cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT); + else + cfqq->seek_history |= (sdist > CFQQ_SEEK_THR); } /* -- cgit v1.2.2 From 53c583d2269851de9df1c2e992cb2f7f124a5f55 Mon Sep 17 00:00:00 2001 From: Corrado Zoccolo Date: Sun, 28 Feb 2010 19:45:05 +0100 Subject: cfq-iosched: requests "in flight" vs "in driver" clarification Counters for requests "in flight" and "in driver" are used asymmetrically in cfq_may_dispatch, and have slightly different meaning. We split the rq_in_flight counter (was sync_flight) to count both sync and async requests, in order to use this one, which is more accurate in some corner cases. The rq_in_driver counter is coalesced, since individual sync/async counts are not used any more. 
Signed-off-by: Corrado Zoccolo Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 44 ++++++++++++++++++-------------------------- 1 file changed, 18 insertions(+), 26 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 423aee3fd19b..f27e535ce262 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -222,8 +222,8 @@ struct cfq_data { unsigned int busy_queues; - int rq_in_driver[2]; - int sync_flight; + int rq_in_driver; + int rq_in_flight[2]; /* * queue-depth detection @@ -416,11 +416,6 @@ static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool, static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *, struct io_context *); -static inline int rq_in_driver(struct cfq_data *cfqd) -{ - return cfqd->rq_in_driver[0] + cfqd->rq_in_driver[1]; -} - static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic, bool is_sync) { @@ -1414,9 +1409,9 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq) { struct cfq_data *cfqd = q->elevator->elevator_data; - cfqd->rq_in_driver[rq_is_sync(rq)]++; + cfqd->rq_in_driver++; cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d", - rq_in_driver(cfqd)); + cfqd->rq_in_driver); cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); } @@ -1424,12 +1419,11 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq) static void cfq_deactivate_request(struct request_queue *q, struct request *rq) { struct cfq_data *cfqd = q->elevator->elevator_data; - const int sync = rq_is_sync(rq); - WARN_ON(!cfqd->rq_in_driver[sync]); - cfqd->rq_in_driver[sync]--; + WARN_ON(!cfqd->rq_in_driver); + cfqd->rq_in_driver--; cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d", - rq_in_driver(cfqd)); + cfqd->rq_in_driver); } static void cfq_remove_request(struct request *rq) @@ -1863,8 +1857,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) cfqq->dispatched++; elv_dispatch_sort(q, rq); - if (cfq_cfqq_sync(cfqq)) - cfqd->sync_flight++; + cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; cfqq->nr_sectors += blk_rq_sectors(rq); } @@ -2211,13 +2204,13 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) /* * Drain async requests before we start sync IO */ - if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC]) + if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC]) return false; /* * If this is an async queue and we have sync IO in flight, let it wait */ - if (cfqd->sync_flight && !cfq_cfqq_sync(cfqq)) + if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq)) return false; max_dispatch = cfqd->cfq_quantum; @@ -3189,14 +3182,14 @@ static void cfq_update_hw_tag(struct cfq_data *cfqd) { struct cfq_queue *cfqq = cfqd->active_queue; - if (rq_in_driver(cfqd) > cfqd->hw_tag_est_depth) - cfqd->hw_tag_est_depth = rq_in_driver(cfqd); + if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth) + cfqd->hw_tag_est_depth = cfqd->rq_in_driver; if (cfqd->hw_tag == 1) return; if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN && - rq_in_driver(cfqd) <= CFQ_HW_QUEUE_MIN) + cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN) return; /* @@ -3206,7 +3199,7 @@ static void cfq_update_hw_tag(struct cfq_data *cfqd) */ if (cfqq && cfq_cfqq_idle_window(cfqq) && cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] < - CFQ_HW_QUEUE_MIN && rq_in_driver(cfqd) < CFQ_HW_QUEUE_MIN) + CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN) return; if (cfqd->hw_tag_samples++ < 50) @@ -3259,13 +3252,12 @@ static void cfq_completed_request(struct 
request_queue *q, struct request *rq) cfq_update_hw_tag(cfqd); - WARN_ON(!cfqd->rq_in_driver[sync]); + WARN_ON(!cfqd->rq_in_driver); WARN_ON(!cfqq->dispatched); - cfqd->rq_in_driver[sync]--; + cfqd->rq_in_driver--; cfqq->dispatched--; - if (cfq_cfqq_sync(cfqq)) - cfqd->sync_flight--; + cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; if (sync) { RQ_CIC(rq)->last_end_request = now; @@ -3319,7 +3311,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) } } - if (!rq_in_driver(cfqd)) + if (!cfqd->rq_in_driver) cfq_schedule_dispatch(cfqd); } -- cgit v1.2.2 From abc3c744d0d7f4ad710a948ae73852ffea5fbc3b Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 1 Mar 2010 09:20:54 +0100 Subject: cfq-iosched: quantum check tweak Currently a queue can only dispatch up to 4 requests if there are other queues. This isn't optimal, device can handle more requests, for example, AHCI can handle 31 requests. I can understand the limit is for fairness, but we could do a tweak: if the queue still has a lot of slice left, sounds we could ignore the limit. Test shows this boost my workload (two thread randread of a SSD) from 78m/s to 100m/s. Thanks for suggestions from Corrado and Vivek for the patch. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index f27e535ce262..0db07d7771b5 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -19,7 +19,7 @@ * tunables */ /* max queue in one round of service */ -static const int cfq_quantum = 4; +static const int cfq_quantum = 8; static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; /* maximum backwards seek, in KiB */ static const int cfq_back_max = 16 * 1024; @@ -2197,6 +2197,19 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd) return dispatched; } +static inline bool cfq_slice_used_soon(struct cfq_data *cfqd, + struct cfq_queue *cfqq) +{ + /* the queue hasn't finished any request, can't estimate */ + if (cfq_cfqq_slice_new(cfqq)) + return 1; + if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched, + cfqq->slice_end)) + return 1; + + return 0; +} + static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) { unsigned int max_dispatch; @@ -2213,7 +2226,7 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq)) return false; - max_dispatch = cfqd->cfq_quantum; + max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1); if (cfq_class_idle(cfqq)) max_dispatch = 1; @@ -2230,13 +2243,22 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) /* * We have other queues, don't allow more IO from this one */ - if (cfqd->busy_queues > 1) + if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq)) return false; /* * Sole queue user, no limit */ - max_dispatch = -1; + if (cfqd->busy_queues == 1) + max_dispatch = -1; + else + /* + * Normally we start throttling cfqq when cfq_quantum/2 + * requests have been dispatched. But we can drive + * deeper queue depths at the beginning of slice + * subjected to upper limit of cfq_quantum. 
+ * */ + max_dispatch = cfqd->cfq_quantum; } /* -- cgit v1.2.2 From 73e9ffdd0cc8159f876d5e29ecf2d9c1bfca544f Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Mon, 1 Mar 2010 10:50:20 +0100 Subject: cfq: remove 8 bytes of padding from cfq_rb_root on 64 bit builds Reorder cfq_rb_root to remove 8 bytes of padding on 64 bit builds. Consequently removing 56 bytes from cfq_group and 64 bytes from cfq_data. Signed-off-by: Richard Kennedy Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 0db07d7771b5..dee9d9378fee 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -78,11 +78,12 @@ struct cfq_rb_root { struct rb_root rb; struct rb_node *left; unsigned count; + unsigned total_weight; u64 min_vdisktime; struct rb_node *active; - unsigned total_weight; }; -#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, 0, 0, } +#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \ + .count = 0, .min_vdisktime = 0, } /* * Per process-grouping structure -- cgit v1.2.2 From 4671a1322052425afa38fcb7980d2fd2bb0fc99b Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Mon, 1 Mar 2010 10:57:22 +0100 Subject: block: don't access jiffies when initialising io_context As the comment says the initial value of last_waited is never used, so there is no need to initialise it with the current jiffies. Jiffies is hot enough without accessing it for no reason. Signed-off-by: Richard Kennedy Signed-off-by: Jens Axboe --- block/blk-ioc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 98e6bf61b0ac..3f65c8aadb2f 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -91,7 +91,7 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node) spin_lock_init(&ret->lock); ret->ioprio_changed = 0; ret->ioprio = 0; - ret->last_waited = jiffies; /* doesn't matter... */ + ret->last_waited = 0; /* doesn't matter... */ ret->nr_batch_requests = 0; /* because this is 0 */ INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); INIT_HLIST_HEAD(&ret->cic_list); -- cgit v1.2.2 From 52cf25d0ab7f78eeecc59ac652ed5090f69b619e Mon Sep 17 00:00:00 2001 From: Emese Revfy Date: Tue, 19 Jan 2010 02:58:23 +0100 Subject: Driver core: Constify struct sysfs_ops in struct kobj_type Constify struct sysfs_ops. This is part of the ops structure constification effort started by Arjan van de Ven et al. Benefits of this constification: * prevents modification of data that is shared (referenced) by many other structure instances at runtime * detects/prevents accidental (but not intentional) modification attempts on archs that enforce read-only kernel data at runtime * potentially better optimized code as the compiler can assume that the const data cannot be changed * the compiler/linker move const data into .rodata and therefore exclude them from false sharing Signed-off-by: Emese Revfy Acked-by: David Teigland Acked-by: Matt Domsch Acked-by: Maciej Sosnowski Acked-by: Hans J. 
Koch Acked-by: Pekka Enberg Acked-by: Jens Axboe Acked-by: Stephen Hemminger Signed-off-by: Greg Kroah-Hartman --- block/blk-integrity.c | 2 +- block/blk-sysfs.c | 2 +- block/elevator.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 15c630813b1c..96e83c2bdb94 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -278,7 +278,7 @@ static struct attribute *integrity_attrs[] = { NULL, }; -static struct sysfs_ops integrity_ops = { +static const struct sysfs_ops integrity_ops = { .show = &integrity_attr_show, .store = &integrity_attr_store, }; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index e85442415db3..2ae2cb3f362f 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -450,7 +450,7 @@ static void blk_release_queue(struct kobject *kobj) kmem_cache_free(blk_requestq_cachep, q); } -static struct sysfs_ops queue_sysfs_ops = { +static const struct sysfs_ops queue_sysfs_ops = { .show = queue_attr_show, .store = queue_attr_store, }; diff --git a/block/elevator.c b/block/elevator.c index ee3a883840f2..df75676f6671 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -892,7 +892,7 @@ elv_attr_store(struct kobject *kobj, struct attribute *attr, return error; } -static struct sysfs_ops elv_sysfs_ops = { +static const struct sysfs_ops elv_sysfs_ops = { .show = elv_attr_show, .store = elv_attr_store, }; -- cgit v1.2.2 From 67523c48aa74d5637848edeccf285af1c60bf14a Mon Sep 17 00:00:00 2001 From: Ben Blum Date: Wed, 10 Mar 2010 15:22:11 -0800 Subject: cgroups: blkio subsystem as module Modify the Block I/O cgroup subsystem to be able to be built as a module. As the CFQ disk scheduler optionally depends on blk-cgroup, config options in block/Kconfig, block/Kconfig.iosched, and block/blk-cgroup.h are enhanced to support the new module dependency. Signed-off-by: Ben Blum Cc: Li Zefan Cc: Paul Menage Cc: "David S. Miller" Cc: KAMEZAWA Hiroyuki Cc: Lai Jiangshan Cc: Vivek Goyal Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/Kconfig | 2 +- block/Kconfig.iosched | 2 +- block/blk-cgroup.c | 53 +++++++++++++++++++++++++++++++++++++++------------ block/blk-cgroup.h | 10 ++++++++-- 4 files changed, 51 insertions(+), 16 deletions(-) (limited to 'block') diff --git a/block/Kconfig b/block/Kconfig index e20fbde0875c..62a5921321cd 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -78,7 +78,7 @@ config BLK_DEV_INTEGRITY Protection. If in doubt, say N. config BLK_CGROUP - bool + tristate depends on CGROUPS default n ---help--- diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index b71abfb0d726..fc71cf071fb2 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -23,6 +23,7 @@ config IOSCHED_DEADLINE config IOSCHED_CFQ tristate "CFQ I/O scheduler" + select BLK_CGROUP if CFQ_GROUP_IOSCHED default y ---help--- The CFQ I/O scheduler tries to distribute bandwidth equally @@ -35,7 +36,6 @@ config IOSCHED_CFQ config CFQ_GROUP_IOSCHED bool "CFQ Group Scheduling support" depends on IOSCHED_CFQ && CGROUPS - select BLK_CGROUP default n ---help--- Enable group IO scheduling in CFQ. 
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index c85d74cae200..4b686ad08eaa 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -23,6 +23,31 @@ static LIST_HEAD(blkio_list); struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT }; EXPORT_SYMBOL_GPL(blkio_root_cgroup); +static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *, + struct cgroup *); +static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *, + struct task_struct *, bool); +static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *, + struct cgroup *, struct task_struct *, bool); +static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *); +static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *); + +struct cgroup_subsys blkio_subsys = { + .name = "blkio", + .create = blkiocg_create, + .can_attach = blkiocg_can_attach, + .attach = blkiocg_attach, + .destroy = blkiocg_destroy, + .populate = blkiocg_populate, +#ifdef CONFIG_BLK_CGROUP + /* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */ + .subsys_id = blkio_subsys_id, +#endif + .use_id = 1, + .module = THIS_MODULE, +}; +EXPORT_SYMBOL_GPL(blkio_subsys); + struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), @@ -253,7 +278,8 @@ remove_entry: done: free_css_id(&blkio_subsys, &blkcg->css); rcu_read_unlock(); - kfree(blkcg); + if (blkcg != &blkio_root_cgroup) + kfree(blkcg); } static struct cgroup_subsys_state * @@ -319,17 +345,6 @@ static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup, task_unlock(tsk); } -struct cgroup_subsys blkio_subsys = { - .name = "blkio", - .create = blkiocg_create, - .can_attach = blkiocg_can_attach, - .attach = blkiocg_attach, - .destroy = blkiocg_destroy, - .populate = blkiocg_populate, - .subsys_id = blkio_subsys_id, - .use_id = 1, -}; - void blkio_policy_register(struct blkio_policy_type *blkiop) { spin_lock(&blkio_list_lock); @@ -345,3 +360,17 @@ void blkio_policy_unregister(struct blkio_policy_type *blkiop) spin_unlock(&blkio_list_lock); } EXPORT_SYMBOL_GPL(blkio_policy_unregister); + +static int __init init_cgroup_blkio(void) +{ + return cgroup_load_subsys(&blkio_subsys); +} + +static void __exit exit_cgroup_blkio(void) +{ + cgroup_unload_subsys(&blkio_subsys); +} + +module_init(init_cgroup_blkio); +module_exit(exit_cgroup_blkio); +MODULE_LICENSE("GPL"); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 84bf745fa775..8ccc20464dae 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -15,7 +15,13 @@ #include -#ifdef CONFIG_BLK_CGROUP +#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) + +#ifndef CONFIG_BLK_CGROUP +/* When blk-cgroup is a module, its subsys_id isn't a compile-time constant */ +extern struct cgroup_subsys blkio_subsys; +#define blkio_subsys_id blkio_subsys.subsys_id +#endif struct blkio_cgroup { struct cgroup_subsys_state css; @@ -91,7 +97,7 @@ static inline void blkiocg_update_blkio_group_dequeue_stats( struct blkio_group *blkg, unsigned long dequeue) {} #endif -#ifdef CONFIG_BLK_CGROUP +#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) extern struct blkio_cgroup blkio_root_cgroup; extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, -- cgit v1.2.2 From 2cda2728aa1c8c006418a24f867b25e5eb7a32e2 Mon Sep 17 00:00:00 2001 From: "Martin K. 
Petersen" Date: Mon, 15 Mar 2010 12:46:51 +0100 Subject: block: Fix overrun in lcm() and move it to lib lcm() was defined to take integer-sized arguments. The supplied arguments are multiplied, however, causing us to overflow given sufficiently large input. That in turn led to incorrect optimal I/O size reporting in some cases (RAID over RAID). Switch lcm() over to unsigned long similar to gcd() and move the function from blk-settings.c to lib. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-settings.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index 31e7a9375c13..4c4700dca56a 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -8,6 +8,7 @@ #include #include /* for max_pfn/max_low_pfn */ #include +#include #include #include "blk.h" @@ -461,16 +462,6 @@ void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) } EXPORT_SYMBOL(blk_queue_stack_limits); -static unsigned int lcm(unsigned int a, unsigned int b) -{ - if (a && b) - return (a * b) / gcd(a, b); - else if (b) - return b; - - return a; -} - /** * blk_stack_limits - adjust queue_limits for stacked devices * @t: the stacking driver limits (top device) -- cgit v1.2.2 From c77a5710b7e23847bfdb81fcaa10b585f65c960a Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Wed, 10 Mar 2010 00:48:33 -0500 Subject: block: Export max number of segments and max segment size in sysfs These two values are useful when debugging issues surrounding maximum I/O size. Put them in sysfs with the rest of the queue limits. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index e85442415db3..fad86550255a 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -106,6 +106,19 @@ static ssize_t queue_max_sectors_show(struct request_queue *q, char *page) return queue_var_show(max_sectors_kb, (page)); } +static ssize_t queue_max_segments_show(struct request_queue *q, char *page) +{ + return queue_var_show(queue_max_segments(q), (page)); +} + +static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page) +{ + if (test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags)) + return queue_var_show(queue_max_segment_size(q), (page)); + + return queue_var_show(PAGE_CACHE_SIZE, (page)); +} + static ssize_t queue_logical_block_size_show(struct request_queue *q, char *page) { return queue_var_show(queue_logical_block_size(q), page); @@ -280,6 +293,16 @@ static struct queue_sysfs_entry queue_max_hw_sectors_entry = { .show = queue_max_hw_sectors_show, }; +static struct queue_sysfs_entry queue_max_segments_entry = { + .attr = {.name = "max_segments", .mode = S_IRUGO }, + .show = queue_max_segments_show, +}; + +static struct queue_sysfs_entry queue_max_segment_size_entry = { + .attr = {.name = "max_segment_size", .mode = S_IRUGO }, + .show = queue_max_segment_size_show, +}; + static struct queue_sysfs_entry queue_iosched_entry = { .attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR }, .show = elv_iosched_show, @@ -355,6 +378,8 @@ static struct attribute *default_attrs[] = { &queue_ra_entry.attr, &queue_max_hw_sectors_entry.attr, &queue_max_sectors_entry.attr, + &queue_max_segments_entry.attr, + &queue_max_segment_size_entry.attr, &queue_iosched_entry.attr, &queue_hw_sector_size_entry.attr, &queue_logical_block_size_entry.attr, -- cgit v1.2.2 
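An aside on the lcm() overflow fix just above ("block: Fix overrun in lcm() and move it to lib"): the failure mode is plain 32-bit wrap-around and is easy to reproduce in userspace. The sketch below is only an illustration, not the kernel's lib/lcm.c; the gcd() helper, the function names and the sample optimal-I/O sizes are all made up here, and dividing before multiplying in the unsigned long variant is just one way to keep the intermediate product in range.

/* lcm_overflow_demo.c - illustrative only; assumes 32-bit unsigned int. */
#include <stdio.h>

/* Euclid's algorithm, same contract as the kernel's gcd() */
static unsigned long gcd(unsigned long a, unsigned long b)
{
	while (b) {
		unsigned long t = a % b;
		a = b;
		b = t;
	}
	return a;
}

/* int-sized lcm(): the product a * b wraps for large optimal I/O sizes */
static unsigned int lcm_int(unsigned int a, unsigned int b)
{
	if (a && b)
		return (a * b) / gcd(a, b);
	return a ? a : b;
}

/* unsigned long lcm(), dividing first so the intermediate value stays small */
static unsigned long lcm_ulong(unsigned long a, unsigned long b)
{
	if (a && b)
		return (a / gcd(a, b)) * b;
	return a ? a : b;
}

int main(void)
{
	/* e.g. stacked RAID devices reporting 1 MiB and 1 MiB + 512 B io_opt */
	unsigned int a = 1U << 20, b = (1U << 20) + 512;

	printf("unsigned int lcm : %u (wrapped)\n", lcm_int(a, b));
	printf("unsigned long lcm: %lu\n", lcm_ulong(a, b));
	return 0;
}

On a 64-bit build the second line prints the correct least common multiple while the first wraps, which matches the incorrect optimal I/O size reporting the commit describes for RAID-over-RAID stacks.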
From 910ac735bad53ce54741a72a5b19ab69794ae069 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 16 Mar 2010 08:57:15 +0100 Subject: block: make CONFIG_BLK_CGROUP visible Make the config visible, so we can choose from CONFIG_BLK_CGROUP=y and CONFIG_BLK_CGROUP=m when CONFIG_IOSCHED_CFQ=m. Signed-off-by: Li Zefan Signed-off-by: Jens Axboe --- block/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/Kconfig b/block/Kconfig index e20fbde0875c..f9e89f4d94bb 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -78,8 +78,9 @@ config BLK_DEV_INTEGRITY Protection. If in doubt, say N. config BLK_CGROUP - bool + tristate "Block cgroup support" depends on CGROUPS + depends on CFQ_GROUP_IOSCHED default n ---help--- Generic block IO controller cgroup interface. This is the common -- cgit v1.2.2 From e9ce335df51ff782035a15c261a3c0c9892a1767 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 19 Mar 2010 08:03:04 +0100 Subject: cfq-iosched: fix a kbuild regression Alex Shi reported a kbuild regression which is about 10% performance lost. He bisected to this commit: 3dde36ddea3e07dd025c4c1ba47edec91606fec0. The reason is cfqq_close() can't find close cooperator. Restoring cfq_rq_close()'s threshold to original value makes the regression go away. Since for_preempt parameter isn't used anymore, this patch deletes it. Reported-by: Alex Shi Signed-off-by: Shaohua Li Acked-by: Corrado Zoccolo Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index dee9d9378fee..8d5a2f2f7fb9 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -47,6 +47,7 @@ static const int cfq_hist_divisor = 4; #define CFQ_SERVICE_SHIFT 12 #define CFQQ_SEEK_THR (sector_t)(8 * 100) +#define CFQQ_CLOSE_THR (sector_t)(8 * 1024) #define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32) #define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) @@ -1660,9 +1661,9 @@ static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd, } static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq, - struct request *rq, bool for_preempt) + struct request *rq) { - return cfq_dist_from_last(cfqd, rq) <= CFQQ_SEEK_THR; + return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR; } static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, @@ -1689,7 +1690,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, * will contain the closest sector. 
*/ __cfqq = rb_entry(parent, struct cfq_queue, p_node); - if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq, false)) + if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) return __cfqq; if (blk_rq_pos(__cfqq->next_rq) < sector) @@ -1700,7 +1701,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, return NULL; __cfqq = rb_entry(node, struct cfq_queue, p_node); - if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq, false)) + if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) return __cfqq; return NULL; @@ -3103,7 +3104,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, * if this request is as-good as one we would expect from the * current cfqq, let it preempt */ - if (cfq_rq_close(cfqd, cfqq, rq, true)) + if (cfq_rq_close(cfqd, cfqq, rq)) return true; return false; -- cgit v1.2.2 From b1ffe737f5b743115ee46ffb59e338e580c54902 Mon Sep 17 00:00:00 2001 From: Divyesh Shah Date: Thu, 25 Mar 2010 15:45:03 +0100 Subject: cfq-iosched: Add additional blktrace log messages in CFQ for easier debugging These have helped us debug some issues we've noticed in earlier IO controller versions and should be useful now as well. The extra logging covers: - idling behavior. Since there are so many conditions based on which we decide to idle or not, this patch adds a log message for some conditions that we've found useful. - workload slices and current prio and workload type Changelog from v1: o moved log message from cfq_set_active_queue() to __cfq_set_active_queue() o changed queue_count to st->count Signed-off-by: Divyesh Shah Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 8d5a2f2f7fb9..2f91c5351949 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1518,7 +1518,8 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { if (cfqq) { - cfq_log_cfqq(cfqd, cfqq, "set_active"); + cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d", + cfqd->serving_prio, cfqd->serving_type); cfqq->slice_start = 0; cfqq->dispatch_start = jiffies; cfqq->allocated_slice = 0; @@ -1788,7 +1789,11 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) * Otherwise, we do only if they are the last ones * in their service tree. */ - return service_tree->count == 1 && cfq_cfqq_sync(cfqq); + if (service_tree->count == 1 && cfq_cfqq_sync(cfqq)) + return 1; + cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", + service_tree->count); + return 0; } static void cfq_arm_slice_timer(struct cfq_data *cfqd) @@ -1833,8 +1838,11 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) * time slice. */ if (sample_valid(cic->ttime_samples) && - (cfqq->slice_end - jiffies < cic->ttime_mean)) + (cfqq->slice_end - jiffies < cic->ttime_mean)) { + cfq_log_cfqq(cfqd, cfqq, "Not idling. 
think_time:%d", + cic->ttime_mean); return; + } cfq_mark_cfqq_wait_request(cfqq); @@ -2042,6 +2050,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) slice = max(slice, 2 * cfqd->cfq_slice_idle); slice = max_t(unsigned, slice, CFQ_MIN_TT); + cfq_log(cfqd, "workload slice:%d", slice); cfqd->workload_expires = jiffies + slice; cfqd->noidle_tree_requires_idle = false; } @@ -3308,6 +3317,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) if (cfq_should_wait_busy(cfqd, cfqq)) { cfqq->slice_end = jiffies + cfqd->cfq_slice_idle; cfq_mark_cfqq_wait_busy(cfqq); + cfq_log_cfqq(cfqd, cfqq, "will busy wait"); } /* -- cgit v1.2.2 From 39c01b219fd30c74869b6fc8749f7900f04e9ef6 Mon Sep 17 00:00:00 2001 From: Divyesh Shah Date: Thu, 25 Mar 2010 15:45:57 +0100 Subject: cfq-iosched: Do not merge queues of BE and IDLE classes Even if they are found to be co-operating. The prio_trees do not have any IDLE cfqqs on them. cfq_close_cooperator() is called from cfq_select_queue() and cfq_completed_request(). The latter ensures that the close cooperator code does not get invoked if the current cfqq is of class IDLE but the former doesn't seem to have any such checks. So an IDLE cfqq may get merged with a BE cfqq from the same group which should be avoided. Signed-off-by: Divyesh Shah Acked-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 2f91c5351949..2c7a0f4f3cd7 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1723,6 +1723,8 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, { struct cfq_queue *cfqq; + if (cfq_class_idle(cur_cfqq)) + return NULL; if (!cfq_cfqq_sync(cur_cfqq)) return NULL; if (CFQQ_SEEKY(cur_cfqq)) -- cgit v1.2.2 From 5a0e3ad6af8660be21ca98a971cd00f331318c05 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Mar 2010 17:04:11 +0900 Subject: include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. 
The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. 
Signed-off-by: Tejun Heo Guess-its-ok-by: Christoph Lameter Cc: Ingo Molnar Cc: Lee Schermerhorn --- block/blk-barrier.c | 1 + block/blk-cgroup.c | 1 + block/blk-integrity.c | 1 + block/blk-ioc.c | 1 + block/blk-settings.c | 1 + block/blk-sysfs.c | 1 + block/blk-tag.c | 1 + block/bsg.c | 1 + block/cfq-iosched.c | 1 + block/compat_ioctl.c | 1 + block/ioctl.c | 1 + block/noop-iosched.c | 1 + 12 files changed, 12 insertions(+) (limited to 'block') diff --git a/block/blk-barrier.c b/block/blk-barrier.c index 8618d8996fea..6d88544b677f 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "blk.h" diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 4b686ad08eaa..5fe03def34b2 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "blk-cgroup.h" static DEFINE_SPINLOCK(blkio_list_lock); diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 96e83c2bdb94..edce1ef7933d 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "blk.h" diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 3f65c8aadb2f..d22c4c55c406 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -7,6 +7,7 @@ #include #include #include /* for max_pfn/max_low_pfn */ +#include #include "blk.h" diff --git a/block/blk-settings.c b/block/blk-settings.c index 31e7a9375c13..d9a9db5f0a2b 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -9,6 +9,7 @@ #include /* for max_pfn/max_low_pfn */ #include #include +#include #include "blk.h" diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 2ae2cb3f362f..c2b821fa324a 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -2,6 +2,7 @@ * Functions related to sysfs handling */ #include +#include #include #include #include diff --git a/block/blk-tag.c b/block/blk-tag.c index 6b0f52c20964..ece65fc4c79b 100644 --- a/block/blk-tag.c +++ b/block/blk-tag.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "blk.h" diff --git a/block/bsg.c b/block/bsg.c index 46597a6bd112..82d58829ba59 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index dee9d9378fee..fc98a48554fd 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -7,6 +7,7 @@ * Copyright (C) 2003 Jens Axboe */ #include +#include #include #include #include diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 4eb8e9ea4af5..f26051f44681 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include diff --git a/block/ioctl.c b/block/ioctl.c index be48ea51faee..8905d2a2a717 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -1,5 +1,6 @@ #include #include +#include #include #include #include diff --git a/block/noop-iosched.c b/block/noop-iosched.c index 3a0d369d08c7..232c4b38cd37 100644 --- a/block/noop-iosched.c +++ b/block/noop-iosched.c @@ -5,6 +5,7 @@ #include #include #include +#include #include struct noop_data { -- cgit v1.2.2 From a506aedc51093544ff0f9610af6066d18cb6abbe Mon Sep 17 00:00:00 2001 From: "wzt.wzt@gmail.com" Date: Fri, 2 Apr 2010 08:41:14 +0200 Subject: Block: Fix block/elevator.c elevator_get() off-by-one error elevator_get() not check the name length, if the name length > sizeof(elv), elv will miss the '\0'. 
And elv buffer will be replace "-iosched" as something like aaaaaaaaa, then call request_module() can load an not trust module. Signed-off-by: Zhitong Wang Signed-off-by: Jens Axboe --- block/elevator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index df75676f6671..76e3702d5381 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -154,7 +154,7 @@ static struct elevator_type *elevator_get(const char *name) spin_unlock(&elv_list_lock); - sprintf(elv, "%s-iosched", name); + snprintf(elv, sizeof(elv), "%s-iosched", name); request_module("%s", elv); spin_lock(&elv_list_lock); -- cgit v1.2.2 From a74b2adae06265b8cfa335d7d40d4a5abd11e977 Mon Sep 17 00:00:00 2001 From: Ricky Benitez Date: Mon, 5 Apr 2010 18:22:17 +0200 Subject: block: expose the statistics in blkio.time and blkio.sectors for the root cgroup Currently, the io statistics for the root cgroup are maintained, but they are not shown because the device information is not available at the point that the root blkio cgroup is created. This patch updates the device information when the statistics are updated so that the statistics become visible. Signed-off-by: Ricky Benitez Acked-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 2c7a0f4f3cd7..7104ac816fb6 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -948,6 +948,11 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) unsigned int major, minor; cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); + if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + cfqg->blkg.dev = MKDEV(major, minor); + goto done; + } if (cfqg || !create) goto done; -- cgit v1.2.2 From 3440c49f5c5ecb4f29b0544aa87da71888404f8f Mon Sep 17 00:00:00 2001 From: Divyesh Shah Date: Fri, 9 Apr 2010 09:29:57 +0200 Subject: cfq-iosched: Fix the incorrect timeslice accounting with forced_dispatch When CFQ dispatches requests forcefully due to a barrier or changing iosched, it runs through all cfqq's dispatching requests and then expires each queue. However, it does not activate a cfqq before flushing its IOs resulting in using stale values for computing slice_used. This patch fixes it by calling activate queue before flushing reuqests from each queue. This is useful mostly for barrier requests because when the iosched is changing it really doesnt matter if we have incorrect accounting since we're going to break down all structures anyway. We also now expire the current timeslice before moving on with the dispatch to accurately account slice used for that cfqq. 
Signed-off-by: Divyesh Shah Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 7104ac816fb6..b773000f8a06 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2205,10 +2205,13 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd) struct cfq_queue *cfqq; int dispatched = 0; - while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) + /* Expire the timeslice of the current active queue first */ + cfq_slice_expired(cfqd, 0); + while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) { + __cfq_set_active_queue(cfqd, cfqq); dispatched += __cfq_forced_dispatch_cfqq(cfqq); + } - cfq_slice_expired(cfqd, 0); BUG_ON(cfqd->busy_queues); cfq_log(cfqd, "forced_dispatch=%d", dispatched); -- cgit v1.2.2 From a534dbe96e9929c7245924d8252d89048c23d569 Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Wed, 14 Apr 2010 20:54:03 +0200 Subject: block: ensure jiffies wrap is handled correctly in blk_rq_timed_out_timer blk_rq_timed_out_timer() relied on blk_add_timer() never returning a timer value of zero, but commit 7838c15b8dd18e78a523513749e5b54bda07b0cb removed the code that bumped this value when it was zero. Therefore when jiffies is near wrap we could get unlucky & not set the timeout value correctly. This patch uses a flag to indicate that the timeout value was set and so handles jiffies wrap correctly, and it keeps all the logic in one function so should be easier to maintain in the future. Signed-off-by: Richard Kennedy Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/blk-timeout.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 1ba7e0aca878..4f0c06c7a338 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -109,6 +109,7 @@ void blk_rq_timed_out_timer(unsigned long data) struct request_queue *q = (struct request_queue *) data; unsigned long flags, next = 0; struct request *rq, *tmp; + int next_set = 0; spin_lock_irqsave(q->queue_lock, flags); @@ -122,16 +123,13 @@ void blk_rq_timed_out_timer(unsigned long data) if (blk_mark_rq_complete(rq)) continue; blk_rq_timed_out(rq); - } else if (!next || time_after(next, rq->deadline)) + } else if (!next_set || time_after(next, rq->deadline)) { next = rq->deadline; + next_set = 1; + } } - /* - * next can never be 0 here with the list non-empty, since we always - * bump ->deadline to 1 so we can detect if the timer was ever added - * or not. See comment in blk_add_timer() - */ - if (next) + if (next_set) mod_timer(&q->timeout, round_jiffies_up(next)); spin_unlock_irqrestore(q->queue_lock, flags); -- cgit v1.2.2 From dcf097b247affd8b88ad410a92298590c5600f44 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 22 Apr 2010 11:54:52 -0400 Subject: blk-cgroup: Fix RCU correctness warning in cfq_init_queue() It is necessary to be in an RCU read-side critical section when invoking css_id(), so this patch adds one to blkiocg_add_blkio_group(). This is actually a false positive, because this is called at initialization time and hence always refers to the root cgroup, which cannot go away. [ 103.790505] =================================================== [ 103.790509] [ INFO: suspicious rcu_dereference_check() usage. ] [ 103.790511] --------------------------------------------------- [ 103.790514] kernel/cgroup.c:4432 invoked rcu_dereference_check() without protection! 
[ 103.790517] [ 103.790517] other info that might help us debug this: [ 103.790519] [ 103.790521] [ 103.790521] rcu_scheduler_active = 1, debug_locks = 1 [ 103.790524] 4 locks held by bash/4422: [ 103.790526] #0: (&buffer->mutex){+.+.+.}, at: [] sysfs_write_file+0x3c/0x144 [ 103.790537] #1: (s_active#102){.+.+.+}, at: [] sysfs_write_file+0xe7/0x144 [ 103.790544] #2: (&q->sysfs_lock){+.+.+.}, at: [] queue_attr_store+0x49/0x8f [ 103.790552] #3: (&(&blkcg->lock)->rlock){......}, at: [] blkiocg_add_blkio_group+0x2b/0xad [ 103.790560] [ 103.790561] stack backtrace: [ 103.790564] Pid: 4422, comm: bash Not tainted 2.6.34-rc4-blkio-second-crash #81 [ 103.790567] Call Trace: [ 103.790572] [] lockdep_rcu_dereference+0x9d/0xa5 [ 103.790577] [] css_id+0x44/0x57 [ 103.790581] [] blkiocg_add_blkio_group+0x53/0xad [ 103.790586] [] cfq_init_queue+0x139/0x32c [ 103.790591] [] elv_iosched_store+0xbf/0x1bf [ 103.790595] [] queue_attr_store+0x70/0x8f [ 103.790599] [] ? sysfs_write_file+0xe7/0x144 [ 103.790603] [] sysfs_write_file+0x108/0x144 [ 103.790609] [] vfs_write+0xae/0x10b [ 103.790612] [] ? trace_hardirqs_on_caller+0x10c/0x130 [ 103.790616] [] sys_write+0x4a/0x6e [ 103.790622] [] system_call_fastpath+0x16/0x1b [ 103.790625] Located-by: Miles Lane Signed-off-by: Vivek Goyal Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 838834be115b..5f127cfb2e92 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3694,8 +3694,10 @@ static void *cfq_init_queue(struct request_queue *q) * to make sure that cfq_put_cfqg() does not try to kfree root group */ atomic_set(&cfqg->ref, 1); + rcu_read_lock(); blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd, 0); + rcu_read_unlock(); #endif /* * Not strictly needed (since RB_ROOT just clears the node and we -- cgit v1.2.2 From 0341509fdfc9519f7de6aabc5dd23217cef72b73 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 7 May 2010 08:57:00 +0200 Subject: blk-cgroup: Fix an RCU warning in blkiocg_create() with CONFIG_PROVE_RCU=y, a warning can be triggered: # mount -t cgroup -o blkio xxx /mnt # mkdir /mnt/subgroup ... kernel/cgroup.c:4442 invoked rcu_dereference_check() without protection! ... To fix this, we avoid caling css_depth() here, which is a bit simpler than the original code. Signed-off-by: Li Zefan Acked-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 5fe03def34b2..2cc682b860ea 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -286,16 +286,16 @@ done: static struct cgroup_subsys_state * blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup) { - struct blkio_cgroup *blkcg, *parent_blkcg; + struct blkio_cgroup *blkcg; + struct cgroup *parent = cgroup->parent; - if (!cgroup->parent) { + if (!parent) { blkcg = &blkio_root_cgroup; goto done; } /* Currently we do not support hierarchy deeper than two level (0,1) */ - parent_blkcg = cgroup_to_blkio_cgroup(cgroup->parent); - if (css_depth(&parent_blkcg->css) > 0) + if (parent != cgroup->top_cgroup) return ERR_PTR(-EINVAL); blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); -- cgit v1.2.2
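Stepping back to the CFQ seeky-detection rework earlier in this log (the seek_history commits), the mechanism is simply a 32-bit shift register plus a population count. The sketch below is a standalone userspace illustration, not the actual cfq-iosched.c code; the request streams are made up, and only the thresholds are borrowed from the patches (8 * 100 sectors for a "close" request, a 32-sample window, seeky if more than 1/8 of the window was seeky).

/* seek_history_demo.c - illustrative only */
#include <stdio.h>
#include <stdint.h>

#define SEEK_THR (8 * 100)	/* sectors, as in CFQQ_SEEK_THR */

/* one bit per request: 1 = seeky, 0 = close to the previous request */
struct queue_state {
	uint32_t seek_history;
	uint64_t last_pos;
	int have_last;
};

static int popcount32(uint32_t v)
{
	int n = 0;
	while (v) {
		n += v & 1;
		v >>= 1;
	}
	return n;
}

static void account_request(struct queue_state *q, uint64_t pos)
{
	uint64_t sdist = 0;

	if (q->have_last)
		sdist = pos > q->last_pos ? pos - q->last_pos : q->last_pos - pos;

	q->seek_history <<= 1;
	q->seek_history |= (sdist > SEEK_THR);
	q->last_pos = pos;
	q->have_last = 1;
}

/* seeky if more than 1/8 of the last 32 requests were seeky */
static int queue_is_seeky(const struct queue_state *q)
{
	return popcount32(q->seek_history) > 32 / 8;
}

static void feed(struct queue_state *q, int nreq, int seek_every)
{
	uint64_t pos = 0;
	int i;

	for (i = 1; i <= nreq; i++) {
		pos += (i % seek_every == 0) ? 1000000 : 8;
		account_request(q, pos);
	}
}

int main(void)
{
	struct queue_state mostly_seq = { 0 }, mixed = { 0 };

	feed(&mostly_seq, 64, 16);	/* one long seek per 16 requests */
	feed(&mixed, 64, 4);		/* one long seek per 4 requests */

	printf("mostly sequential queue seeky? %s\n",
	       queue_is_seeky(&mostly_seq) ? "yes" : "no");
	printf("mixed queue seeky?             %s\n",
	       queue_is_seeky(&mixed) ? "yes" : "no");
	return 0;
}

The first stream ends up with roughly two seeky bits in the 32-request window and stays classified as sequential, while the second crosses the 1/8 cutoff and is classified as seeky, which is the behaviour the history-based detection replaces the old seek_mean average with.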