From e692cb668fdd5a712c6ed2a2d6f2a36ee83997b4 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Wed, 1 Dec 2010 19:41:49 +0100
Subject: block: Deprecate QUEUE_FLAG_CLUSTER and use queue_limits instead

When stacking devices, a request_queue is not always available. This
forced us to have a no_cluster flag in the queue_limits that could be
used as a carrier until the request_queue had been set up for a
metadevice.

There were several problems with that approach. First of all it was up
to the stacking device to remember to set queue flag after stacking had
completed. Also, the queue flag and the queue limits had to be kept in
sync at all times. We got that wrong, which could lead to us issuing
commands that went beyond the max scatterlist limit set by the driver.

The proper fix is to avoid having two flags for tracking the same thing.
We deprecate QUEUE_FLAG_CLUSTER and use the queue limit directly in the
block layer merging functions. The queue_limit 'no_cluster' is turned
into 'cluster' to avoid double negatives and to ease stacking.
Clustering defaults to being enabled as before. The queue flag logic is
removed from the stacking function, and explicitly setting the cluster
flag is no longer necessary in DM and MD.

Reported-by: Ed Lin <ed.lin@promise.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Acked-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 include/linux/blkdev.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'include/linux/blkdev.h')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index aae86fd10c4f..95aeeeb49e8b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -250,7 +250,7 @@ struct queue_limits {
 
 	unsigned char		misaligned;
 	unsigned char		discard_misaligned;
-	unsigned char		no_cluster;
+	unsigned char		cluster;
 	signed char		discard_zeroes_data;
 };
 
@@ -380,7 +380,6 @@ struct request_queue
 #endif
 };
 
-#define QUEUE_FLAG_CLUSTER	0	/* cluster several segments into 1 */
 #define QUEUE_FLAG_QUEUED	1	/* uses generic tag queueing */
 #define QUEUE_FLAG_STOPPED	2	/* queue is stopped */
 #define	QUEUE_FLAG_SYNCFULL	3	/* read queue has been filled */
@@ -403,7 +402,6 @@ struct request_queue
 #define QUEUE_FLAG_SECDISCARD  19	/* supports SECDISCARD */
 
 #define QUEUE_FLAG_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
-				 (1 << QUEUE_FLAG_CLUSTER) |		\
 				 (1 << QUEUE_FLAG_STACKABLE)	|	\
 				 (1 << QUEUE_FLAG_SAME_COMP)	|	\
 				 (1 << QUEUE_FLAG_ADD_RANDOM))
@@ -510,6 +508,11 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
 
 #define rq_data_dir(rq)		((rq)->cmd_flags & 1)
 
+static inline unsigned int blk_queue_cluster(struct request_queue *q)
+{
+	return q->limits.cluster;
+}
+
 /*
  * We regard a request as sync, if either a read or a sync write
  */
-- 
cgit v1.2.2


From 72d4cd9f38b5ed96b75df4c622be25e1c2648dd3 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Fri, 17 Dec 2010 08:34:20 +0100
Subject: block: max hardware sectors limit wrapper

Implement blk_limits_max_hw_sectors() and make
blk_queue_max_hw_sectors() a wrapper around it.

DM needs this to avoid setting queue_limits' max_hw_sectors and
max_sectors directly.  dm_set_device_limits() now leverages
blk_limits_max_hw_sectors() logic to establish the appropriate
max_hw_sectors minimum (PAGE_SIZE).  Fixes issue where DM was
incorrectly setting max_sectors rather than max_hw_sectors (which
caused dm_merge_bvec()'s max_hw_sectors check to be ineffective).

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@kernel.org
Acked-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 include/linux/blkdev.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux/blkdev.h')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 95aeeeb49e8b..36ab42c9bb99 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -808,6 +808,7 @@ extern struct request_queue *blk_init_allocated_queue(struct request_queue *,
 extern void blk_cleanup_queue(struct request_queue *);
 extern void blk_queue_make_request(struct request_queue *, make_request_fn *);
 extern void blk_queue_bounce_limit(struct request_queue *, u64);
+extern void blk_limits_max_hw_sectors(struct queue_limits *, unsigned int);
 extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int);
 extern void blk_queue_max_segments(struct request_queue *, unsigned short);
 extern void blk_queue_max_segment_size(struct request_queue *, unsigned int);
-- 
cgit v1.2.2


From 09e099d4bafea3b15be003d548bdf94b4b6e0e17 Mon Sep 17 00:00:00 2001
From: Jerome Marchand <jmarchan@redhat.com>
Date: Wed, 5 Jan 2011 16:57:38 +0100
Subject: block: fix accounting bug on cross partition merges

/proc/diskstats would display a strange output as follows.

$ cat /proc/diskstats |grep sda
   8       0 sda 90524 7579 102154 20464 0 0 0 0 0 14096 20089
   8       1 sda1 19085 1352 21841 4209 0 0 0 0 4294967064 15689 4293424691
                                                ~~~~~~~~~~
   8       2 sda2 71252 3624 74891 15950 0 0 0 0 232 23995 1562390
   8       3 sda3 54 487 2188 92 0 0 0 0 0 88 92
   8       4 sda4 4 0 8 0 0 0 0 0 0 0 0
   8       5 sda5 81 2027 2130 138 0 0 0 0 0 87 137

Its reason is the wrong way of accounting hd_struct->in_flight. When a bio is
merged into a request belongs to different partition by ELEVATOR_FRONT_MERGE.

The detailed root cause is as follows.

Assuming that there are two partition, sda1 and sda2.

1. A request for sda2 is in request_queue. Hence sda1's hd_struct->in_flight
   is 0 and sda2's one is 1.

        | hd_struct->in_flight
   ---------------------------
   sda1 |          0
   sda2 |          1
   ---------------------------

2. A bio belongs to sda1 is issued and is merged into the request mentioned on
   step1 by ELEVATOR_BACK_MERGE. The first sector of the request is changed
   from sda2 region to sda1 region. However the two partition's
   hd_struct->in_flight are not changed.

        | hd_struct->in_flight
   ---------------------------
   sda1 |          0
   sda2 |          1
   ---------------------------

3. The request is finished and blk_account_io_done() is called. In this case,
   sda2's hd_struct->in_flight, not a sda1's one, is decremented.

        | hd_struct->in_flight
   ---------------------------
   sda1 |         -1
   sda2 |          1
   ---------------------------

The patch fixes the problem by caching the partition lookup
inside the request structure, hence making sure that the increment
and decrement will always happen on the same partition struct. This
also speeds up IO with accounting enabled, since it cuts down on
the number of lookups we have to do.

Also add a refcount to struct hd_struct to keep the partition in
memory as long as users exist. We use kref_test_and_get() to ensure
we don't add a reference to a partition which is going away.

Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 include/linux/blkdev.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux/blkdev.h')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index aae86fd10c4f..482a7fd48831 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -115,6 +115,7 @@ struct request {
 	void *elevator_private3;
 
 	struct gendisk *rq_disk;
+	struct hd_struct *part;
 	unsigned long start_time;
 #ifdef CONFIG_BLK_CGROUP
 	unsigned long long start_time_ns;
-- 
cgit v1.2.2