path: root/block
author    Jens Axboe <jaxboe@fusionio.com>  2010-11-16 04:09:42 -0500
committer Jens Axboe <jaxboe@fusionio.com>  2010-11-16 04:09:42 -0500
commit    a02056349cdea2252cd2b21643ebf025e83a29f2 (patch)
tree      b7c889d6cbe8e7188d07d99a5c9da858c53a5b6c /block
parent    34db1d595ef6f183fbc1e42cda45a3dfa0035258 (diff)
parent    e53beacd23d9cb47590da6a7a7f6d417b941a994 (diff)
Merge branch 'v2.6.37-rc2' into for-2.6.38/core
Diffstat (limited to 'block')
-rw-r--r--  block/Kconfig          |   12
-rw-r--r--  block/Makefile         |    3
-rw-r--r--  block/blk-barrier.c    |  350
-rw-r--r--  block/blk-cgroup.c     |  804
-rw-r--r--  block/blk-cgroup.h     |   87
-rw-r--r--  block/blk-core.c       |  104
-rw-r--r--  block/blk-exec.c       |    9
-rw-r--r--  block/blk-flush.c      |  262
-rw-r--r--  block/blk-integrity.c  |   94
-rw-r--r--  block/blk-ioc.c        |   14
-rw-r--r--  block/blk-lib.c        |   39
-rw-r--r--  block/blk-map.c        |    7
-rw-r--r--  block/blk-merge.c      |   23
-rw-r--r--  block/blk-settings.c   |   32
-rw-r--r--  block/blk-sysfs.c      |   13
-rw-r--r--  block/blk-throttle.c   | 1123
-rw-r--r--  block/blk.h            |   16
-rw-r--r--  block/bsg.c            |    4
-rw-r--r--  block/cfq-iosched.c    |   39
-rw-r--r--  block/cfq.h            |    2
-rw-r--r--  block/compat_ioctl.c   |    4
-rw-r--r--  block/elevator.c       |   83
-rw-r--r--  block/genhd.c          |   23
-rw-r--r--  block/ioctl.c          |   13
-rw-r--r--  block/scsi_ioctl.c     |   34
25 files changed, 2424 insertions(+), 770 deletions(-)
diff --git a/block/Kconfig b/block/Kconfig
index 9be0b56eaee1..6c9213ef15a1 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -77,6 +77,18 @@ config BLK_DEV_INTEGRITY
 	  T10/SCSI Data Integrity Field or the T13/ATA External Path
 	  Protection. If in doubt, say N.
 
+config BLK_DEV_THROTTLING
+	bool "Block layer bio throttling support"
+	depends on BLK_CGROUP=y && EXPERIMENTAL
+	default n
+	---help---
+	Block layer bio throttling support. It can be used to limit
+	the IO rate to a device. IO rate policies are per cgroup and
+	one needs to mount and use blkio cgroup controller for creating
+	cgroups and specifying per device IO rate policies.
+
+	See Documentation/cgroups/blkio-controller.txt for more information.
+
 endif # BLOCK
 
 config BLOCK_COMPAT
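Aside, not part of the commit: with CONFIG_BLK_DEV_THROTTLING enabled, rules are written into the per-cgroup blkio.throttle.* files added later in this diff as "major:minor value" pairs (see Documentation/cgroups/blkio-controller.txt). A minimal sketch of installing a 1 MB/s read limit, assuming the blkio controller is mounted at /cgroup/blkio and that 8:16 is the device of interest; the mount point and device numbers are illustrative assumptions only:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/*
	 * The file name comes from the "throttle.read_bps_device" cftype
	 * added in blk-cgroup.c below; the cgroup core prefixes it with the
	 * subsystem name. The mount point is assumed for this sketch.
	 */
	const char *path = "/cgroup/blkio/blkio.throttle.read_bps_device";
	const char rule[] = "8:16 1048576\n";	/* limit reads on 8:16 to 1 MB/s */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, rule, sizeof(rule) - 1) != (ssize_t)(sizeof(rule) - 1))
		perror("write");
	close(fd);
	return 0;
}

Writing the same device with a value of 0 removes the rule, per blkio_delete_rule_command() in the blk-cgroup.c changes below.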
diff --git a/block/Makefile b/block/Makefile
index 0bb499a739cd..0fec4b3fab51 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -3,12 +3,13 @@
 #
 
 obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
-			blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \
+			blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
 			blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
 			blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o
 
 obj-$(CONFIG_BLK_DEV_BSG)	+= bsg.o
 obj-$(CONFIG_BLK_CGROUP)	+= blk-cgroup.o
+obj-$(CONFIG_BLK_DEV_THROTTLING)	+= blk-throttle.o
 obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
 obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
diff --git a/block/blk-barrier.c b/block/blk-barrier.c
deleted file mode 100644
index f0faefca032f..000000000000
--- a/block/blk-barrier.c
+++ /dev/null
@@ -1,350 +0,0 @@
1/*
2 * Functions related to barrier IO handling
3 */
4#include <linux/kernel.h>
5#include <linux/module.h>
6#include <linux/bio.h>
7#include <linux/blkdev.h>
8#include <linux/gfp.h>
9
10#include "blk.h"
11
12/**
13 * blk_queue_ordered - does this queue support ordered writes
14 * @q: the request queue
15 * @ordered: one of QUEUE_ORDERED_*
16 *
17 * Description:
18 * For journalled file systems, doing ordered writes on a commit
19 * block instead of explicitly doing wait_on_buffer (which is bad
20 * for performance) can be a big win. Block drivers supporting this
21 * feature should call this function and indicate so.
22 *
23 **/
24int blk_queue_ordered(struct request_queue *q, unsigned ordered)
25{
26 if (ordered != QUEUE_ORDERED_NONE &&
27 ordered != QUEUE_ORDERED_DRAIN &&
28 ordered != QUEUE_ORDERED_DRAIN_FLUSH &&
29 ordered != QUEUE_ORDERED_DRAIN_FUA &&
30 ordered != QUEUE_ORDERED_TAG &&
31 ordered != QUEUE_ORDERED_TAG_FLUSH &&
32 ordered != QUEUE_ORDERED_TAG_FUA) {
33 printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered);
34 return -EINVAL;
35 }
36
37 q->ordered = ordered;
38 q->next_ordered = ordered;
39
40 return 0;
41}
42EXPORT_SYMBOL(blk_queue_ordered);
43
44/*
45 * Cache flushing for ordered writes handling
46 */
47unsigned blk_ordered_cur_seq(struct request_queue *q)
48{
49 if (!q->ordseq)
50 return 0;
51 return 1 << ffz(q->ordseq);
52}
53
54unsigned blk_ordered_req_seq(struct request *rq)
55{
56 struct request_queue *q = rq->q;
57
58 BUG_ON(q->ordseq == 0);
59
60 if (rq == &q->pre_flush_rq)
61 return QUEUE_ORDSEQ_PREFLUSH;
62 if (rq == &q->bar_rq)
63 return QUEUE_ORDSEQ_BAR;
64 if (rq == &q->post_flush_rq)
65 return QUEUE_ORDSEQ_POSTFLUSH;
66
67 /*
68 * !fs requests don't need to follow barrier ordering. Always
69 * put them at the front. This fixes the following deadlock.
70 *
71 * http://thread.gmane.org/gmane.linux.kernel/537473
72 */
73 if (rq->cmd_type != REQ_TYPE_FS)
74 return QUEUE_ORDSEQ_DRAIN;
75
76 if ((rq->cmd_flags & REQ_ORDERED_COLOR) ==
77 (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR))
78 return QUEUE_ORDSEQ_DRAIN;
79 else
80 return QUEUE_ORDSEQ_DONE;
81}
82
83bool blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error)
84{
85 struct request *rq;
86
87 if (error && !q->orderr)
88 q->orderr = error;
89
90 BUG_ON(q->ordseq & seq);
91 q->ordseq |= seq;
92
93 if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE)
94 return false;
95
96 /*
97 * Okay, sequence complete.
98 */
99 q->ordseq = 0;
100 rq = q->orig_bar_rq;
101 __blk_end_request_all(rq, q->orderr);
102 return true;
103}
104
105static void pre_flush_end_io(struct request *rq, int error)
106{
107 elv_completed_request(rq->q, rq);
108 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error);
109}
110
111static void bar_end_io(struct request *rq, int error)
112{
113 elv_completed_request(rq->q, rq);
114 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error);
115}
116
117static void post_flush_end_io(struct request *rq, int error)
118{
119 elv_completed_request(rq->q, rq);
120 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error);
121}
122
123static void queue_flush(struct request_queue *q, unsigned which)
124{
125 struct request *rq;
126 rq_end_io_fn *end_io;
127
128 if (which == QUEUE_ORDERED_DO_PREFLUSH) {
129 rq = &q->pre_flush_rq;
130 end_io = pre_flush_end_io;
131 } else {
132 rq = &q->post_flush_rq;
133 end_io = post_flush_end_io;
134 }
135
136 blk_rq_init(q, rq);
137 rq->cmd_type = REQ_TYPE_FS;
138 rq->cmd_flags = REQ_HARDBARRIER | REQ_FLUSH;
139 rq->rq_disk = q->orig_bar_rq->rq_disk;
140 rq->end_io = end_io;
141
142 elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
143}
144
145static inline bool start_ordered(struct request_queue *q, struct request **rqp)
146{
147 struct request *rq = *rqp;
148 unsigned skip = 0;
149
150 q->orderr = 0;
151 q->ordered = q->next_ordered;
152 q->ordseq |= QUEUE_ORDSEQ_STARTED;
153
154 /*
155 * For an empty barrier, there's no actual BAR request, which
156 * in turn makes POSTFLUSH unnecessary. Mask them off.
157 */
158 if (!blk_rq_sectors(rq)) {
159 q->ordered &= ~(QUEUE_ORDERED_DO_BAR |
160 QUEUE_ORDERED_DO_POSTFLUSH);
161 /*
162 * Empty barrier on a write-through device w/ ordered
163 * tag has no command to issue and without any command
164 * to issue, ordering by tag can't be used. Drain
165 * instead.
166 */
167 if ((q->ordered & QUEUE_ORDERED_BY_TAG) &&
168 !(q->ordered & QUEUE_ORDERED_DO_PREFLUSH)) {
169 q->ordered &= ~QUEUE_ORDERED_BY_TAG;
170 q->ordered |= QUEUE_ORDERED_BY_DRAIN;
171 }
172 }
173
174 /* stash away the original request */
175 blk_dequeue_request(rq);
176 q->orig_bar_rq = rq;
177 rq = NULL;
178
179 /*
180 * Queue ordered sequence. As we stack them at the head, we
181 * need to queue in reverse order. Note that we rely on that
182 * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs
183 * request gets inbetween ordered sequence.
184 */
185 if (q->ordered & QUEUE_ORDERED_DO_POSTFLUSH) {
186 queue_flush(q, QUEUE_ORDERED_DO_POSTFLUSH);
187 rq = &q->post_flush_rq;
188 } else
189 skip |= QUEUE_ORDSEQ_POSTFLUSH;
190
191 if (q->ordered & QUEUE_ORDERED_DO_BAR) {
192 rq = &q->bar_rq;
193
194 /* initialize proxy request and queue it */
195 blk_rq_init(q, rq);
196 if (bio_data_dir(q->orig_bar_rq->bio) == WRITE)
197 rq->cmd_flags |= REQ_WRITE;
198 if (q->ordered & QUEUE_ORDERED_DO_FUA)
199 rq->cmd_flags |= REQ_FUA;
200 init_request_from_bio(rq, q->orig_bar_rq->bio);
201 rq->end_io = bar_end_io;
202
203 elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
204 } else
205 skip |= QUEUE_ORDSEQ_BAR;
206
207 if (q->ordered & QUEUE_ORDERED_DO_PREFLUSH) {
208 queue_flush(q, QUEUE_ORDERED_DO_PREFLUSH);
209 rq = &q->pre_flush_rq;
210 } else
211 skip |= QUEUE_ORDSEQ_PREFLUSH;
212
213 if ((q->ordered & QUEUE_ORDERED_BY_DRAIN) && queue_in_flight(q))
214 rq = NULL;
215 else
216 skip |= QUEUE_ORDSEQ_DRAIN;
217
218 *rqp = rq;
219
220 /*
221 * Complete skipped sequences. If whole sequence is complete,
222 * return false to tell elevator that this request is gone.
223 */
224 return !blk_ordered_complete_seq(q, skip, 0);
225}
226
227bool blk_do_ordered(struct request_queue *q, struct request **rqp)
228{
229 struct request *rq = *rqp;
230 const int is_barrier = rq->cmd_type == REQ_TYPE_FS &&
231 (rq->cmd_flags & REQ_HARDBARRIER);
232
233 if (!q->ordseq) {
234 if (!is_barrier)
235 return true;
236
237 if (q->next_ordered != QUEUE_ORDERED_NONE)
238 return start_ordered(q, rqp);
239 else {
240 /*
241 * Queue ordering not supported. Terminate
242 * with prejudice.
243 */
244 blk_dequeue_request(rq);
245 __blk_end_request_all(rq, -EOPNOTSUPP);
246 *rqp = NULL;
247 return false;
248 }
249 }
250
251 /*
252 * Ordered sequence in progress
253 */
254
255 /* Special requests are not subject to ordering rules. */
256 if (rq->cmd_type != REQ_TYPE_FS &&
257 rq != &q->pre_flush_rq && rq != &q->post_flush_rq)
258 return true;
259
260 if (q->ordered & QUEUE_ORDERED_BY_TAG) {
261 /* Ordered by tag. Blocking the next barrier is enough. */
262 if (is_barrier && rq != &q->bar_rq)
263 *rqp = NULL;
264 } else {
265 /* Ordered by draining. Wait for turn. */
266 WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q));
267 if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q))
268 *rqp = NULL;
269 }
270
271 return true;
272}
273
274static void bio_end_empty_barrier(struct bio *bio, int err)
275{
276 if (err) {
277 if (err == -EOPNOTSUPP)
278 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
279 clear_bit(BIO_UPTODATE, &bio->bi_flags);
280 }
281 if (bio->bi_private)
282 complete(bio->bi_private);
283 bio_put(bio);
284}
285
286/**
287 * blkdev_issue_flush - queue a flush
288 * @bdev: blockdev to issue flush for
289 * @gfp_mask: memory allocation flags (for bio_alloc)
290 * @error_sector: error sector
291 * @flags: BLKDEV_IFL_* flags to control behaviour
292 *
293 * Description:
294 * Issue a flush for the block device in question. Caller can supply
295 * room for storing the error offset in case of a flush error, if they
296 * wish to. If WAIT flag is not passed then caller may check only what
297 * request was pushed in some internal queue for later handling.
298 */
299int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
300 sector_t *error_sector, unsigned long flags)
301{
302 DECLARE_COMPLETION_ONSTACK(wait);
303 struct request_queue *q;
304 struct bio *bio;
305 int ret = 0;
306
307 if (bdev->bd_disk == NULL)
308 return -ENXIO;
309
310 q = bdev_get_queue(bdev);
311 if (!q)
312 return -ENXIO;
313
314 /*
315 * some block devices may not have their queue correctly set up here
316 * (e.g. loop device without a backing file) and so issuing a flush
317 * here will panic. Ensure there is a request function before issuing
318 * the barrier.
319 */
320 if (!q->make_request_fn)
321 return -ENXIO;
322
323 bio = bio_alloc(gfp_mask, 0);
324 bio->bi_end_io = bio_end_empty_barrier;
325 bio->bi_bdev = bdev;
326 if (test_bit(BLKDEV_WAIT, &flags))
327 bio->bi_private = &wait;
328
329 bio_get(bio);
330 submit_bio(WRITE_BARRIER, bio);
331 if (test_bit(BLKDEV_WAIT, &flags)) {
332 wait_for_completion(&wait);
333 /*
334 * The driver must store the error location in ->bi_sector, if
335 * it supports it. For non-stacked drivers, this should be
336 * copied from blk_rq_pos(rq).
337 */
338 if (error_sector)
339 *error_sector = bio->bi_sector;
340 }
341
342 if (bio_flagged(bio, BIO_EOPNOTSUPP))
343 ret = -EOPNOTSUPP;
344 else if (!bio_flagged(bio, BIO_UPTODATE))
345 ret = -EIO;
346
347 bio_put(bio);
348 return ret;
349}
350EXPORT_SYMBOL(blkdev_issue_flush);
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 2fef1ef931a0..b1febd0f6d2a 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -37,6 +37,12 @@ static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
 static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
 static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
 
+/* for encoding cft->private value on file */
+#define BLKIOFILE_PRIVATE(x, val)	(((x) << 16) | (val))
+/* What policy owns the file, proportional or throttle */
+#define BLKIOFILE_POLICY(val)		(((val) >> 16) & 0xffff)
+#define BLKIOFILE_ATTR(val)		((val) & 0xffff)
+
 struct cgroup_subsys blkio_subsys = {
 	.name = "blkio",
 	.create = blkiocg_create,
@@ -59,6 +65,27 @@ static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
 	list_add(&pn->node, &blkcg->policy_list);
 }
 
+static inline bool cftype_blkg_same_policy(struct cftype *cft,
+			struct blkio_group *blkg)
+{
+	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
+
+	if (blkg->plid == plid)
+		return 1;
+
+	return 0;
+}
+
+/* Determines if policy node matches cgroup file being accessed */
+static inline bool pn_matches_cftype(struct cftype *cft,
+			struct blkio_policy_node *pn)
+{
+	enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
+	int fileid = BLKIOFILE_ATTR(cft->private);
+
+	return (plid == pn->plid && fileid == pn->fileid);
+}
+
 /* Must be called with blkcg->lock held */
 static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
 {
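A small stand-alone illustration (not from the kernel tree; the enum values below are invented for the demo) of how the cft->private encoding above round-trips a policy id and a per-policy file id:

#include <assert.h>
#include <stdio.h>

/* Same packing macros as the diff above. */
#define BLKIOFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define BLKIOFILE_POLICY(val)		(((val) >> 16) & 0xffff)
#define BLKIOFILE_ATTR(val)		((val) & 0xffff)

/* Stand-in values purely for this demo. */
enum { DEMO_POLICY_THROTL = 1 };
enum { DEMO_THROTL_read_bps_device = 0 };

int main(void)
{
	unsigned int private = BLKIOFILE_PRIVATE(DEMO_POLICY_THROTL,
						 DEMO_THROTL_read_bps_device);

	/*
	 * The upper 16 bits carry the owning policy, the lower 16 bits the
	 * per-policy file name, so both halves recover cleanly.
	 */
	assert(BLKIOFILE_POLICY(private) == DEMO_POLICY_THROTL);
	assert(BLKIOFILE_ATTR(private) == DEMO_THROTL_read_bps_device);
	printf("private=0x%08x policy=%u attr=%u\n", private,
	       BLKIOFILE_POLICY(private), BLKIOFILE_ATTR(private));
	return 0;
}

cftype_blkg_same_policy() and pn_matches_cftype() above rely on exactly this split, so one read/write handler can serve files owned by different policies.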
@@ -67,12 +94,13 @@ static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
 
 /* Must be called with blkcg->lock held */
 static struct blkio_policy_node *
-blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev)
+blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev,
+		enum blkio_policy_id plid, int fileid)
 {
 	struct blkio_policy_node *pn;
 
 	list_for_each_entry(pn, &blkcg->policy_list, node) {
-		if (pn->dev == dev)
+		if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid)
 			return pn;
 	}
 
@@ -86,6 +114,67 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
86} 114}
87EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); 115EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
88 116
117static inline void
118blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight)
119{
120 struct blkio_policy_type *blkiop;
121
122 list_for_each_entry(blkiop, &blkio_list, list) {
123 /* If this policy does not own the blkg, do not send updates */
124 if (blkiop->plid != blkg->plid)
125 continue;
126 if (blkiop->ops.blkio_update_group_weight_fn)
127 blkiop->ops.blkio_update_group_weight_fn(blkg->key,
128 blkg, weight);
129 }
130}
131
132static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps,
133 int fileid)
134{
135 struct blkio_policy_type *blkiop;
136
137 list_for_each_entry(blkiop, &blkio_list, list) {
138
139 /* If this policy does not own the blkg, do not send updates */
140 if (blkiop->plid != blkg->plid)
141 continue;
142
143 if (fileid == BLKIO_THROTL_read_bps_device
144 && blkiop->ops.blkio_update_group_read_bps_fn)
145 blkiop->ops.blkio_update_group_read_bps_fn(blkg->key,
146 blkg, bps);
147
148 if (fileid == BLKIO_THROTL_write_bps_device
149 && blkiop->ops.blkio_update_group_write_bps_fn)
150 blkiop->ops.blkio_update_group_write_bps_fn(blkg->key,
151 blkg, bps);
152 }
153}
154
155static inline void blkio_update_group_iops(struct blkio_group *blkg,
156 unsigned int iops, int fileid)
157{
158 struct blkio_policy_type *blkiop;
159
160 list_for_each_entry(blkiop, &blkio_list, list) {
161
162 /* If this policy does not own the blkg, do not send updates */
163 if (blkiop->plid != blkg->plid)
164 continue;
165
166 if (fileid == BLKIO_THROTL_read_iops_device
167 && blkiop->ops.blkio_update_group_read_iops_fn)
168 blkiop->ops.blkio_update_group_read_iops_fn(blkg->key,
169 blkg, iops);
170
171 if (fileid == BLKIO_THROTL_write_iops_device
172 && blkiop->ops.blkio_update_group_write_iops_fn)
173 blkiop->ops.blkio_update_group_write_iops_fn(blkg->key,
174 blkg,iops);
175 }
176}
177
89/* 178/*
90 * Add to the appropriate stat variable depending on the request type. 179 * Add to the appropriate stat variable depending on the request type.
91 * This should be called with the blkg->stats_lock held. 180 * This should be called with the blkg->stats_lock held.
@@ -341,7 +430,8 @@ void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
 EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
 
 void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
-		struct blkio_group *blkg, void *key, dev_t dev)
+		struct blkio_group *blkg, void *key, dev_t dev,
+		enum blkio_policy_id plid)
 {
 	unsigned long flags;
 
@@ -350,6 +440,7 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
 	rcu_assign_pointer(blkg->key, key);
 	blkg->blkcg_id = css_id(&blkcg->css);
 	hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
+	blkg->plid = plid;
 	spin_unlock_irqrestore(&blkcg->lock, flags);
 	/* Need to take css reference ? */
 	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
@@ -408,51 +499,6 @@ struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
408} 499}
409EXPORT_SYMBOL_GPL(blkiocg_lookup_group); 500EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
410 501
411#define SHOW_FUNCTION(__VAR) \
412static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup, \
413 struct cftype *cftype) \
414{ \
415 struct blkio_cgroup *blkcg; \
416 \
417 blkcg = cgroup_to_blkio_cgroup(cgroup); \
418 return (u64)blkcg->__VAR; \
419}
420
421SHOW_FUNCTION(weight);
422#undef SHOW_FUNCTION
423
424static int
425blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
426{
427 struct blkio_cgroup *blkcg;
428 struct blkio_group *blkg;
429 struct hlist_node *n;
430 struct blkio_policy_type *blkiop;
431 struct blkio_policy_node *pn;
432
433 if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
434 return -EINVAL;
435
436 blkcg = cgroup_to_blkio_cgroup(cgroup);
437 spin_lock(&blkio_list_lock);
438 spin_lock_irq(&blkcg->lock);
439 blkcg->weight = (unsigned int)val;
440
441 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
442 pn = blkio_policy_search_node(blkcg, blkg->dev);
443
444 if (pn)
445 continue;
446
447 list_for_each_entry(blkiop, &blkio_list, list)
448 blkiop->ops.blkio_update_group_weight_fn(blkg,
449 blkcg->weight);
450 }
451 spin_unlock_irq(&blkcg->lock);
452 spin_unlock(&blkio_list_lock);
453 return 0;
454}
455
456static int 502static int
457blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) 503blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
458{ 504{
@@ -593,52 +639,6 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg,
593 return disk_total; 639 return disk_total;
594} 640}
595 641
596#define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total) \
597static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \
598 struct cftype *cftype, struct cgroup_map_cb *cb) \
599{ \
600 struct blkio_cgroup *blkcg; \
601 struct blkio_group *blkg; \
602 struct hlist_node *n; \
603 uint64_t cgroup_total = 0; \
604 \
605 if (!cgroup_lock_live_group(cgroup)) \
606 return -ENODEV; \
607 \
608 blkcg = cgroup_to_blkio_cgroup(cgroup); \
609 rcu_read_lock(); \
610 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\
611 if (blkg->dev) { \
612 spin_lock_irq(&blkg->stats_lock); \
613 cgroup_total += blkio_get_stat(blkg, cb, \
614 blkg->dev, type); \
615 spin_unlock_irq(&blkg->stats_lock); \
616 } \
617 } \
618 if (show_total) \
619 cb->fill(cb, "Total", cgroup_total); \
620 rcu_read_unlock(); \
621 cgroup_unlock(); \
622 return 0; \
623}
624
625SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0);
626SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0);
627SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1);
628SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1);
629SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1);
630SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1);
631SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1);
632SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1);
633#ifdef CONFIG_DEBUG_BLK_CGROUP
634SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0);
635SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0);
636SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0);
637SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0);
638SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0);
639#endif
640#undef SHOW_FUNCTION_PER_GROUP
641
642static int blkio_check_dev_num(dev_t dev) 642static int blkio_check_dev_num(dev_t dev)
643{ 643{
644 int part = 0; 644 int part = 0;
@@ -652,13 +652,14 @@ static int blkio_check_dev_num(dev_t dev)
 }
 
 static int blkio_policy_parse_and_set(char *buf,
-	struct blkio_policy_node *newpn)
+	struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid)
 {
 	char *s[4], *p, *major_s = NULL, *minor_s = NULL;
 	int ret;
 	unsigned long major, minor, temp;
 	int i = 0;
 	dev_t dev;
+	u64 bps, iops;
 
 	memset(s, 0, sizeof(s));
 
@@ -705,12 +706,47 @@ static int blkio_policy_parse_and_set(char *buf,
705 if (s[1] == NULL) 706 if (s[1] == NULL)
706 return -EINVAL; 707 return -EINVAL;
707 708
708 ret = strict_strtoul(s[1], 10, &temp); 709 switch (plid) {
709 if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) || 710 case BLKIO_POLICY_PROP:
710 temp > BLKIO_WEIGHT_MAX) 711 ret = strict_strtoul(s[1], 10, &temp);
711 return -EINVAL; 712 if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) ||
713 temp > BLKIO_WEIGHT_MAX)
714 return -EINVAL;
712 715
713 newpn->weight = temp; 716 newpn->plid = plid;
717 newpn->fileid = fileid;
718 newpn->val.weight = temp;
719 break;
720 case BLKIO_POLICY_THROTL:
721 switch(fileid) {
722 case BLKIO_THROTL_read_bps_device:
723 case BLKIO_THROTL_write_bps_device:
724 ret = strict_strtoull(s[1], 10, &bps);
725 if (ret)
726 return -EINVAL;
727
728 newpn->plid = plid;
729 newpn->fileid = fileid;
730 newpn->val.bps = bps;
731 break;
732 case BLKIO_THROTL_read_iops_device:
733 case BLKIO_THROTL_write_iops_device:
734 ret = strict_strtoull(s[1], 10, &iops);
735 if (ret)
736 return -EINVAL;
737
738 if (iops > THROTL_IOPS_MAX)
739 return -EINVAL;
740
741 newpn->plid = plid;
742 newpn->fileid = fileid;
743 newpn->val.iops = (unsigned int)iops;
744 break;
745 }
746 break;
747 default:
748 BUG();
749 }
714 750
715 return 0; 751 return 0;
716} 752}
@@ -720,26 +756,180 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
720{ 756{
721 struct blkio_policy_node *pn; 757 struct blkio_policy_node *pn;
722 758
723 pn = blkio_policy_search_node(blkcg, dev); 759 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP,
760 BLKIO_PROP_weight_device);
724 if (pn) 761 if (pn)
725 return pn->weight; 762 return pn->val.weight;
726 else 763 else
727 return blkcg->weight; 764 return blkcg->weight;
728} 765}
729EXPORT_SYMBOL_GPL(blkcg_get_weight); 766EXPORT_SYMBOL_GPL(blkcg_get_weight);
730 767
768uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
769{
770 struct blkio_policy_node *pn;
771
772 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
773 BLKIO_THROTL_read_bps_device);
774 if (pn)
775 return pn->val.bps;
776 else
777 return -1;
778}
779
780uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev)
781{
782 struct blkio_policy_node *pn;
783 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
784 BLKIO_THROTL_write_bps_device);
785 if (pn)
786 return pn->val.bps;
787 else
788 return -1;
789}
790
791unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev)
792{
793 struct blkio_policy_node *pn;
794
795 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
796 BLKIO_THROTL_read_iops_device);
797 if (pn)
798 return pn->val.iops;
799 else
800 return -1;
801}
802
803unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev)
804{
805 struct blkio_policy_node *pn;
806 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
807 BLKIO_THROTL_write_iops_device);
808 if (pn)
809 return pn->val.iops;
810 else
811 return -1;
812}
813
814/* Checks whether user asked for deleting a policy rule */
815static bool blkio_delete_rule_command(struct blkio_policy_node *pn)
816{
817 switch(pn->plid) {
818 case BLKIO_POLICY_PROP:
819 if (pn->val.weight == 0)
820 return 1;
821 break;
822 case BLKIO_POLICY_THROTL:
823 switch(pn->fileid) {
824 case BLKIO_THROTL_read_bps_device:
825 case BLKIO_THROTL_write_bps_device:
826 if (pn->val.bps == 0)
827 return 1;
828 break;
829 case BLKIO_THROTL_read_iops_device:
830 case BLKIO_THROTL_write_iops_device:
831 if (pn->val.iops == 0)
832 return 1;
833 }
834 break;
835 default:
836 BUG();
837 }
838
839 return 0;
840}
841
842static void blkio_update_policy_rule(struct blkio_policy_node *oldpn,
843 struct blkio_policy_node *newpn)
844{
845 switch(oldpn->plid) {
846 case BLKIO_POLICY_PROP:
847 oldpn->val.weight = newpn->val.weight;
848 break;
849 case BLKIO_POLICY_THROTL:
850 switch(newpn->fileid) {
851 case BLKIO_THROTL_read_bps_device:
852 case BLKIO_THROTL_write_bps_device:
853 oldpn->val.bps = newpn->val.bps;
854 break;
855 case BLKIO_THROTL_read_iops_device:
856 case BLKIO_THROTL_write_iops_device:
857 oldpn->val.iops = newpn->val.iops;
858 }
859 break;
860 default:
861 BUG();
862 }
863}
864
865/*
866 * Some rules/values in blkg have changed. Propogate those to respective
867 * policies.
868 */
869static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg,
870 struct blkio_group *blkg, struct blkio_policy_node *pn)
871{
872 unsigned int weight, iops;
873 u64 bps;
874
875 switch(pn->plid) {
876 case BLKIO_POLICY_PROP:
877 weight = pn->val.weight ? pn->val.weight :
878 blkcg->weight;
879 blkio_update_group_weight(blkg, weight);
880 break;
881 case BLKIO_POLICY_THROTL:
882 switch(pn->fileid) {
883 case BLKIO_THROTL_read_bps_device:
884 case BLKIO_THROTL_write_bps_device:
885 bps = pn->val.bps ? pn->val.bps : (-1);
886 blkio_update_group_bps(blkg, bps, pn->fileid);
887 break;
888 case BLKIO_THROTL_read_iops_device:
889 case BLKIO_THROTL_write_iops_device:
890 iops = pn->val.iops ? pn->val.iops : (-1);
891 blkio_update_group_iops(blkg, iops, pn->fileid);
892 break;
893 }
894 break;
895 default:
896 BUG();
897 }
898}
899
900/*
901 * A policy node rule has been updated. Propogate this update to all the
902 * block groups which might be affected by this update.
903 */
904static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg,
905 struct blkio_policy_node *pn)
906{
907 struct blkio_group *blkg;
908 struct hlist_node *n;
909
910 spin_lock(&blkio_list_lock);
911 spin_lock_irq(&blkcg->lock);
912
913 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
914 if (pn->dev != blkg->dev || pn->plid != blkg->plid)
915 continue;
916 blkio_update_blkg_policy(blkcg, blkg, pn);
917 }
918
919 spin_unlock_irq(&blkcg->lock);
920 spin_unlock(&blkio_list_lock);
921}
731 922
732static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, 923static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
733 const char *buffer) 924 const char *buffer)
734{ 925{
735 int ret = 0; 926 int ret = 0;
736 char *buf; 927 char *buf;
737 struct blkio_policy_node *newpn, *pn; 928 struct blkio_policy_node *newpn, *pn;
738 struct blkio_cgroup *blkcg; 929 struct blkio_cgroup *blkcg;
739 struct blkio_group *blkg;
740 int keep_newpn = 0; 930 int keep_newpn = 0;
741 struct hlist_node *n; 931 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
742 struct blkio_policy_type *blkiop; 932 int fileid = BLKIOFILE_ATTR(cft->private);
743 933
744 buf = kstrdup(buffer, GFP_KERNEL); 934 buf = kstrdup(buffer, GFP_KERNEL);
745 if (!buf) 935 if (!buf)
@@ -751,7 +941,7 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
751 goto free_buf; 941 goto free_buf;
752 } 942 }
753 943
754 ret = blkio_policy_parse_and_set(buf, newpn); 944 ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid);
755 if (ret) 945 if (ret)
756 goto free_newpn; 946 goto free_newpn;
757 947
@@ -759,9 +949,9 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
759 949
760 spin_lock_irq(&blkcg->lock); 950 spin_lock_irq(&blkcg->lock);
761 951
762 pn = blkio_policy_search_node(blkcg, newpn->dev); 952 pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid);
763 if (!pn) { 953 if (!pn) {
764 if (newpn->weight != 0) { 954 if (!blkio_delete_rule_command(newpn)) {
765 blkio_policy_insert_node(blkcg, newpn); 955 blkio_policy_insert_node(blkcg, newpn);
766 keep_newpn = 1; 956 keep_newpn = 1;
767 } 957 }
@@ -769,33 +959,17 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
769 goto update_io_group; 959 goto update_io_group;
770 } 960 }
771 961
772 if (newpn->weight == 0) { 962 if (blkio_delete_rule_command(newpn)) {
773 /* weight == 0 means deleteing a specific weight */
774 blkio_policy_delete_node(pn); 963 blkio_policy_delete_node(pn);
775 spin_unlock_irq(&blkcg->lock); 964 spin_unlock_irq(&blkcg->lock);
776 goto update_io_group; 965 goto update_io_group;
777 } 966 }
778 spin_unlock_irq(&blkcg->lock); 967 spin_unlock_irq(&blkcg->lock);
779 968
780 pn->weight = newpn->weight; 969 blkio_update_policy_rule(pn, newpn);
781 970
782update_io_group: 971update_io_group:
783 /* update weight for each cfqg */ 972 blkio_update_policy_node_blkg(blkcg, newpn);
784 spin_lock(&blkio_list_lock);
785 spin_lock_irq(&blkcg->lock);
786
787 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
788 if (newpn->dev == blkg->dev) {
789 list_for_each_entry(blkiop, &blkio_list, list)
790 blkiop->ops.blkio_update_group_weight_fn(blkg,
791 newpn->weight ?
792 newpn->weight :
793 blkcg->weight);
794 }
795 }
796
797 spin_unlock_irq(&blkcg->lock);
798 spin_unlock(&blkio_list_lock);
799 973
800free_newpn: 974free_newpn:
801 if (!keep_newpn) 975 if (!keep_newpn)
@@ -805,23 +979,256 @@ free_buf:
805 return ret; 979 return ret;
806} 980}
807 981
808static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft, 982static void
809 struct seq_file *m) 983blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn)
810{ 984{
811 struct blkio_cgroup *blkcg; 985 switch(pn->plid) {
812 struct blkio_policy_node *pn; 986 case BLKIO_POLICY_PROP:
987 if (pn->fileid == BLKIO_PROP_weight_device)
988 seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
989 MINOR(pn->dev), pn->val.weight);
990 break;
991 case BLKIO_POLICY_THROTL:
992 switch(pn->fileid) {
993 case BLKIO_THROTL_read_bps_device:
994 case BLKIO_THROTL_write_bps_device:
995 seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev),
996 MINOR(pn->dev), pn->val.bps);
997 break;
998 case BLKIO_THROTL_read_iops_device:
999 case BLKIO_THROTL_write_iops_device:
1000 seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
1001 MINOR(pn->dev), pn->val.iops);
1002 break;
1003 }
1004 break;
1005 default:
1006 BUG();
1007 }
1008}
813 1009
814 seq_printf(m, "dev\tweight\n"); 1010/* cgroup files which read their data from policy nodes end up here */
1011static void blkio_read_policy_node_files(struct cftype *cft,
1012 struct blkio_cgroup *blkcg, struct seq_file *m)
1013{
1014 struct blkio_policy_node *pn;
815 1015
816 blkcg = cgroup_to_blkio_cgroup(cgrp);
817 if (!list_empty(&blkcg->policy_list)) { 1016 if (!list_empty(&blkcg->policy_list)) {
818 spin_lock_irq(&blkcg->lock); 1017 spin_lock_irq(&blkcg->lock);
819 list_for_each_entry(pn, &blkcg->policy_list, node) { 1018 list_for_each_entry(pn, &blkcg->policy_list, node) {
820 seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), 1019 if (!pn_matches_cftype(cft, pn))
821 MINOR(pn->dev), pn->weight); 1020 continue;
1021 blkio_print_policy_node(m, pn);
822 } 1022 }
823 spin_unlock_irq(&blkcg->lock); 1023 spin_unlock_irq(&blkcg->lock);
824 } 1024 }
1025}
1026
1027static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
1028 struct seq_file *m)
1029{
1030 struct blkio_cgroup *blkcg;
1031 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1032 int name = BLKIOFILE_ATTR(cft->private);
1033
1034 blkcg = cgroup_to_blkio_cgroup(cgrp);
1035
1036 switch(plid) {
1037 case BLKIO_POLICY_PROP:
1038 switch(name) {
1039 case BLKIO_PROP_weight_device:
1040 blkio_read_policy_node_files(cft, blkcg, m);
1041 return 0;
1042 default:
1043 BUG();
1044 }
1045 break;
1046 case BLKIO_POLICY_THROTL:
1047 switch(name){
1048 case BLKIO_THROTL_read_bps_device:
1049 case BLKIO_THROTL_write_bps_device:
1050 case BLKIO_THROTL_read_iops_device:
1051 case BLKIO_THROTL_write_iops_device:
1052 blkio_read_policy_node_files(cft, blkcg, m);
1053 return 0;
1054 default:
1055 BUG();
1056 }
1057 break;
1058 default:
1059 BUG();
1060 }
1061
1062 return 0;
1063}
1064
1065static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
1066 struct cftype *cft, struct cgroup_map_cb *cb, enum stat_type type,
1067 bool show_total)
1068{
1069 struct blkio_group *blkg;
1070 struct hlist_node *n;
1071 uint64_t cgroup_total = 0;
1072
1073 rcu_read_lock();
1074 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
1075 if (blkg->dev) {
1076 if (!cftype_blkg_same_policy(cft, blkg))
1077 continue;
1078 spin_lock_irq(&blkg->stats_lock);
1079 cgroup_total += blkio_get_stat(blkg, cb, blkg->dev,
1080 type);
1081 spin_unlock_irq(&blkg->stats_lock);
1082 }
1083 }
1084 if (show_total)
1085 cb->fill(cb, "Total", cgroup_total);
1086 rcu_read_unlock();
1087 return 0;
1088}
1089
1090/* All map kind of cgroup file get serviced by this function */
1091static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
1092 struct cgroup_map_cb *cb)
1093{
1094 struct blkio_cgroup *blkcg;
1095 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1096 int name = BLKIOFILE_ATTR(cft->private);
1097
1098 blkcg = cgroup_to_blkio_cgroup(cgrp);
1099
1100 switch(plid) {
1101 case BLKIO_POLICY_PROP:
1102 switch(name) {
1103 case BLKIO_PROP_time:
1104 return blkio_read_blkg_stats(blkcg, cft, cb,
1105 BLKIO_STAT_TIME, 0);
1106 case BLKIO_PROP_sectors:
1107 return blkio_read_blkg_stats(blkcg, cft, cb,
1108 BLKIO_STAT_SECTORS, 0);
1109 case BLKIO_PROP_io_service_bytes:
1110 return blkio_read_blkg_stats(blkcg, cft, cb,
1111 BLKIO_STAT_SERVICE_BYTES, 1);
1112 case BLKIO_PROP_io_serviced:
1113 return blkio_read_blkg_stats(blkcg, cft, cb,
1114 BLKIO_STAT_SERVICED, 1);
1115 case BLKIO_PROP_io_service_time:
1116 return blkio_read_blkg_stats(blkcg, cft, cb,
1117 BLKIO_STAT_SERVICE_TIME, 1);
1118 case BLKIO_PROP_io_wait_time:
1119 return blkio_read_blkg_stats(blkcg, cft, cb,
1120 BLKIO_STAT_WAIT_TIME, 1);
1121 case BLKIO_PROP_io_merged:
1122 return blkio_read_blkg_stats(blkcg, cft, cb,
1123 BLKIO_STAT_MERGED, 1);
1124 case BLKIO_PROP_io_queued:
1125 return blkio_read_blkg_stats(blkcg, cft, cb,
1126 BLKIO_STAT_QUEUED, 1);
1127#ifdef CONFIG_DEBUG_BLK_CGROUP
1128 case BLKIO_PROP_dequeue:
1129 return blkio_read_blkg_stats(blkcg, cft, cb,
1130 BLKIO_STAT_DEQUEUE, 0);
1131 case BLKIO_PROP_avg_queue_size:
1132 return blkio_read_blkg_stats(blkcg, cft, cb,
1133 BLKIO_STAT_AVG_QUEUE_SIZE, 0);
1134 case BLKIO_PROP_group_wait_time:
1135 return blkio_read_blkg_stats(blkcg, cft, cb,
1136 BLKIO_STAT_GROUP_WAIT_TIME, 0);
1137 case BLKIO_PROP_idle_time:
1138 return blkio_read_blkg_stats(blkcg, cft, cb,
1139 BLKIO_STAT_IDLE_TIME, 0);
1140 case BLKIO_PROP_empty_time:
1141 return blkio_read_blkg_stats(blkcg, cft, cb,
1142 BLKIO_STAT_EMPTY_TIME, 0);
1143#endif
1144 default:
1145 BUG();
1146 }
1147 break;
1148 case BLKIO_POLICY_THROTL:
1149 switch(name){
1150 case BLKIO_THROTL_io_service_bytes:
1151 return blkio_read_blkg_stats(blkcg, cft, cb,
1152 BLKIO_STAT_SERVICE_BYTES, 1);
1153 case BLKIO_THROTL_io_serviced:
1154 return blkio_read_blkg_stats(blkcg, cft, cb,
1155 BLKIO_STAT_SERVICED, 1);
1156 default:
1157 BUG();
1158 }
1159 break;
1160 default:
1161 BUG();
1162 }
1163
1164 return 0;
1165}
1166
1167static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val)
1168{
1169 struct blkio_group *blkg;
1170 struct hlist_node *n;
1171 struct blkio_policy_node *pn;
1172
1173 if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
1174 return -EINVAL;
1175
1176 spin_lock(&blkio_list_lock);
1177 spin_lock_irq(&blkcg->lock);
1178 blkcg->weight = (unsigned int)val;
1179
1180 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
1181 pn = blkio_policy_search_node(blkcg, blkg->dev,
1182 BLKIO_POLICY_PROP, BLKIO_PROP_weight_device);
1183 if (pn)
1184 continue;
1185
1186 blkio_update_group_weight(blkg, blkcg->weight);
1187 }
1188 spin_unlock_irq(&blkcg->lock);
1189 spin_unlock(&blkio_list_lock);
1190 return 0;
1191}
1192
1193static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
1194 struct blkio_cgroup *blkcg;
1195 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1196 int name = BLKIOFILE_ATTR(cft->private);
1197
1198 blkcg = cgroup_to_blkio_cgroup(cgrp);
1199
1200 switch(plid) {
1201 case BLKIO_POLICY_PROP:
1202 switch(name) {
1203 case BLKIO_PROP_weight:
1204 return (u64)blkcg->weight;
1205 }
1206 break;
1207 default:
1208 BUG();
1209 }
1210 return 0;
1211}
1212
1213static int
1214blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1215{
1216 struct blkio_cgroup *blkcg;
1217 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1218 int name = BLKIOFILE_ATTR(cft->private);
1219
1220 blkcg = cgroup_to_blkio_cgroup(cgrp);
1221
1222 switch(plid) {
1223 case BLKIO_POLICY_PROP:
1224 switch(name) {
1225 case BLKIO_PROP_weight:
1226 return blkio_weight_write(blkcg, val);
1227 }
1228 break;
1229 default:
1230 BUG();
1231 }
825 1232
826 return 0; 1233 return 0;
827} 1234}
@@ -829,71 +1236,151 @@ static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft,
829struct cftype blkio_files[] = { 1236struct cftype blkio_files[] = {
830 { 1237 {
831 .name = "weight_device", 1238 .name = "weight_device",
832 .read_seq_string = blkiocg_weight_device_read, 1239 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
833 .write_string = blkiocg_weight_device_write, 1240 BLKIO_PROP_weight_device),
1241 .read_seq_string = blkiocg_file_read,
1242 .write_string = blkiocg_file_write,
834 .max_write_len = 256, 1243 .max_write_len = 256,
835 }, 1244 },
836 { 1245 {
837 .name = "weight", 1246 .name = "weight",
838 .read_u64 = blkiocg_weight_read, 1247 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
839 .write_u64 = blkiocg_weight_write, 1248 BLKIO_PROP_weight),
1249 .read_u64 = blkiocg_file_read_u64,
1250 .write_u64 = blkiocg_file_write_u64,
840 }, 1251 },
841 { 1252 {
842 .name = "time", 1253 .name = "time",
843 .read_map = blkiocg_time_read, 1254 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1255 BLKIO_PROP_time),
1256 .read_map = blkiocg_file_read_map,
844 }, 1257 },
845 { 1258 {
846 .name = "sectors", 1259 .name = "sectors",
847 .read_map = blkiocg_sectors_read, 1260 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1261 BLKIO_PROP_sectors),
1262 .read_map = blkiocg_file_read_map,
848 }, 1263 },
849 { 1264 {
850 .name = "io_service_bytes", 1265 .name = "io_service_bytes",
851 .read_map = blkiocg_io_service_bytes_read, 1266 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1267 BLKIO_PROP_io_service_bytes),
1268 .read_map = blkiocg_file_read_map,
852 }, 1269 },
853 { 1270 {
854 .name = "io_serviced", 1271 .name = "io_serviced",
855 .read_map = blkiocg_io_serviced_read, 1272 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1273 BLKIO_PROP_io_serviced),
1274 .read_map = blkiocg_file_read_map,
856 }, 1275 },
857 { 1276 {
858 .name = "io_service_time", 1277 .name = "io_service_time",
859 .read_map = blkiocg_io_service_time_read, 1278 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1279 BLKIO_PROP_io_service_time),
1280 .read_map = blkiocg_file_read_map,
860 }, 1281 },
861 { 1282 {
862 .name = "io_wait_time", 1283 .name = "io_wait_time",
863 .read_map = blkiocg_io_wait_time_read, 1284 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1285 BLKIO_PROP_io_wait_time),
1286 .read_map = blkiocg_file_read_map,
864 }, 1287 },
865 { 1288 {
866 .name = "io_merged", 1289 .name = "io_merged",
867 .read_map = blkiocg_io_merged_read, 1290 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1291 BLKIO_PROP_io_merged),
1292 .read_map = blkiocg_file_read_map,
868 }, 1293 },
869 { 1294 {
870 .name = "io_queued", 1295 .name = "io_queued",
871 .read_map = blkiocg_io_queued_read, 1296 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1297 BLKIO_PROP_io_queued),
1298 .read_map = blkiocg_file_read_map,
872 }, 1299 },
873 { 1300 {
874 .name = "reset_stats", 1301 .name = "reset_stats",
875 .write_u64 = blkiocg_reset_stats, 1302 .write_u64 = blkiocg_reset_stats,
876 }, 1303 },
1304#ifdef CONFIG_BLK_DEV_THROTTLING
1305 {
1306 .name = "throttle.read_bps_device",
1307 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1308 BLKIO_THROTL_read_bps_device),
1309 .read_seq_string = blkiocg_file_read,
1310 .write_string = blkiocg_file_write,
1311 .max_write_len = 256,
1312 },
1313
1314 {
1315 .name = "throttle.write_bps_device",
1316 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1317 BLKIO_THROTL_write_bps_device),
1318 .read_seq_string = blkiocg_file_read,
1319 .write_string = blkiocg_file_write,
1320 .max_write_len = 256,
1321 },
1322
1323 {
1324 .name = "throttle.read_iops_device",
1325 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1326 BLKIO_THROTL_read_iops_device),
1327 .read_seq_string = blkiocg_file_read,
1328 .write_string = blkiocg_file_write,
1329 .max_write_len = 256,
1330 },
1331
1332 {
1333 .name = "throttle.write_iops_device",
1334 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1335 BLKIO_THROTL_write_iops_device),
1336 .read_seq_string = blkiocg_file_read,
1337 .write_string = blkiocg_file_write,
1338 .max_write_len = 256,
1339 },
1340 {
1341 .name = "throttle.io_service_bytes",
1342 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1343 BLKIO_THROTL_io_service_bytes),
1344 .read_map = blkiocg_file_read_map,
1345 },
1346 {
1347 .name = "throttle.io_serviced",
1348 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1349 BLKIO_THROTL_io_serviced),
1350 .read_map = blkiocg_file_read_map,
1351 },
1352#endif /* CONFIG_BLK_DEV_THROTTLING */
1353
877#ifdef CONFIG_DEBUG_BLK_CGROUP 1354#ifdef CONFIG_DEBUG_BLK_CGROUP
878 { 1355 {
879 .name = "avg_queue_size", 1356 .name = "avg_queue_size",
880 .read_map = blkiocg_avg_queue_size_read, 1357 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1358 BLKIO_PROP_avg_queue_size),
1359 .read_map = blkiocg_file_read_map,
881 }, 1360 },
882 { 1361 {
883 .name = "group_wait_time", 1362 .name = "group_wait_time",
884 .read_map = blkiocg_group_wait_time_read, 1363 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1364 BLKIO_PROP_group_wait_time),
1365 .read_map = blkiocg_file_read_map,
885 }, 1366 },
886 { 1367 {
887 .name = "idle_time", 1368 .name = "idle_time",
888 .read_map = blkiocg_idle_time_read, 1369 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1370 BLKIO_PROP_idle_time),
1371 .read_map = blkiocg_file_read_map,
889 }, 1372 },
890 { 1373 {
891 .name = "empty_time", 1374 .name = "empty_time",
892 .read_map = blkiocg_empty_time_read, 1375 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1376 BLKIO_PROP_empty_time),
1377 .read_map = blkiocg_file_read_map,
893 }, 1378 },
894 { 1379 {
895 .name = "dequeue", 1380 .name = "dequeue",
896 .read_map = blkiocg_dequeue_read, 1381 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1382 BLKIO_PROP_dequeue),
1383 .read_map = blkiocg_file_read_map,
897 }, 1384 },
898#endif 1385#endif
899}; 1386};
@@ -932,13 +1419,14 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
 		/*
 		 * This blkio_group is being unlinked as associated cgroup is
 		 * going away. Let all the IO controlling policies know about
-		 * this event. Currently this is static call to one io
-		 * controlling policy. Once we have more policies in place, we
-		 * need some dynamic registration of callback function.
+		 * this event.
 		 */
 		spin_lock(&blkio_list_lock);
-		list_for_each_entry(blkiop, &blkio_list, list)
+		list_for_each_entry(blkiop, &blkio_list, list) {
+			if (blkiop->plid != blkg->plid)
+				continue;
 			blkiop->ops.blkio_unlink_group_fn(key, blkg);
+		}
 		spin_unlock(&blkio_list_lock);
 	} while (1);
 
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 2b866ec1dcea..ea4861bdd549 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -15,6 +15,14 @@
 
 #include <linux/cgroup.h>
 
+enum blkio_policy_id {
+	BLKIO_POLICY_PROP = 0,		/* Proportional Bandwidth division */
+	BLKIO_POLICY_THROTL,		/* Throttling */
+};
+
+/* Max limits for throttle policy */
+#define THROTL_IOPS_MAX		UINT_MAX
+
 #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
 
 #ifndef CONFIG_BLK_CGROUP
@@ -65,6 +73,35 @@ enum blkg_state_flags {
65 BLKG_empty, 73 BLKG_empty,
66}; 74};
67 75
76/* cgroup files owned by proportional weight policy */
77enum blkcg_file_name_prop {
78 BLKIO_PROP_weight = 1,
79 BLKIO_PROP_weight_device,
80 BLKIO_PROP_io_service_bytes,
81 BLKIO_PROP_io_serviced,
82 BLKIO_PROP_time,
83 BLKIO_PROP_sectors,
84 BLKIO_PROP_io_service_time,
85 BLKIO_PROP_io_wait_time,
86 BLKIO_PROP_io_merged,
87 BLKIO_PROP_io_queued,
88 BLKIO_PROP_avg_queue_size,
89 BLKIO_PROP_group_wait_time,
90 BLKIO_PROP_idle_time,
91 BLKIO_PROP_empty_time,
92 BLKIO_PROP_dequeue,
93};
94
95/* cgroup files owned by throttle policy */
96enum blkcg_file_name_throtl {
97 BLKIO_THROTL_read_bps_device,
98 BLKIO_THROTL_write_bps_device,
99 BLKIO_THROTL_read_iops_device,
100 BLKIO_THROTL_write_iops_device,
101 BLKIO_THROTL_io_service_bytes,
102 BLKIO_THROTL_io_serviced,
103};
104
68struct blkio_cgroup { 105struct blkio_cgroup {
69 struct cgroup_subsys_state css; 106 struct cgroup_subsys_state css;
70 unsigned int weight; 107 unsigned int weight;
@@ -112,6 +149,8 @@ struct blkio_group {
112 char path[128]; 149 char path[128];
113 /* The device MKDEV(major, minor), this group has been created for */ 150 /* The device MKDEV(major, minor), this group has been created for */
114 dev_t dev; 151 dev_t dev;
152 /* policy which owns this blk group */
153 enum blkio_policy_id plid;
115 154
116 /* Need to serialize the stats in the case of reset/update */ 155 /* Need to serialize the stats in the case of reset/update */
117 spinlock_t stats_lock; 156 spinlock_t stats_lock;
@@ -121,24 +160,60 @@ struct blkio_group {
121struct blkio_policy_node { 160struct blkio_policy_node {
122 struct list_head node; 161 struct list_head node;
123 dev_t dev; 162 dev_t dev;
124 unsigned int weight; 163 /* This node belongs to max bw policy or porportional weight policy */
164 enum blkio_policy_id plid;
165 /* cgroup file to which this rule belongs to */
166 int fileid;
167
168 union {
169 unsigned int weight;
170 /*
171 * Rate read/write in terms of byptes per second
172 * Whether this rate represents read or write is determined
173 * by file type "fileid".
174 */
175 u64 bps;
176 unsigned int iops;
177 } val;
125}; 178};
126 179
127extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, 180extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
128 dev_t dev); 181 dev_t dev);
182extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg,
183 dev_t dev);
184extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg,
185 dev_t dev);
186extern unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg,
187 dev_t dev);
188extern unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg,
189 dev_t dev);
129 190
130typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); 191typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg);
131typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg, 192
132 unsigned int weight); 193typedef void (blkio_update_group_weight_fn) (void *key,
194 struct blkio_group *blkg, unsigned int weight);
195typedef void (blkio_update_group_read_bps_fn) (void * key,
196 struct blkio_group *blkg, u64 read_bps);
197typedef void (blkio_update_group_write_bps_fn) (void *key,
198 struct blkio_group *blkg, u64 write_bps);
199typedef void (blkio_update_group_read_iops_fn) (void *key,
200 struct blkio_group *blkg, unsigned int read_iops);
201typedef void (blkio_update_group_write_iops_fn) (void *key,
202 struct blkio_group *blkg, unsigned int write_iops);
133 203
134struct blkio_policy_ops { 204struct blkio_policy_ops {
135 blkio_unlink_group_fn *blkio_unlink_group_fn; 205 blkio_unlink_group_fn *blkio_unlink_group_fn;
136 blkio_update_group_weight_fn *blkio_update_group_weight_fn; 206 blkio_update_group_weight_fn *blkio_update_group_weight_fn;
207 blkio_update_group_read_bps_fn *blkio_update_group_read_bps_fn;
208 blkio_update_group_write_bps_fn *blkio_update_group_write_bps_fn;
209 blkio_update_group_read_iops_fn *blkio_update_group_read_iops_fn;
210 blkio_update_group_write_iops_fn *blkio_update_group_write_iops_fn;
137}; 211};
138 212
139struct blkio_policy_type { 213struct blkio_policy_type {
140 struct list_head list; 214 struct list_head list;
141 struct blkio_policy_ops ops; 215 struct blkio_policy_ops ops;
216 enum blkio_policy_id plid;
142}; 217};
143 218
144/* Blkio controller policy registration */ 219/* Blkio controller policy registration */
@@ -212,7 +287,8 @@ static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {}
212extern struct blkio_cgroup blkio_root_cgroup; 287extern struct blkio_cgroup blkio_root_cgroup;
213extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); 288extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
214extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 289extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
215 struct blkio_group *blkg, void *key, dev_t dev); 290 struct blkio_group *blkg, void *key, dev_t dev,
291 enum blkio_policy_id plid);
216extern int blkiocg_del_blkio_group(struct blkio_group *blkg); 292extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
217extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, 293extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
218 void *key); 294 void *key);
@@ -234,7 +310,8 @@ static inline struct blkio_cgroup *
234cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } 310cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
235 311
236static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 312static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
237 struct blkio_group *blkg, void *key, dev_t dev) {} 313 struct blkio_group *blkg, void *key, dev_t dev,
314 enum blkio_policy_id plid) {}
238 315
239static inline int 316static inline int
240blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } 317blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
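To make the expanded blkio_policy_ops table above concrete, here is a hedged sketch, not taken from this commit, of how a throttling-style policy might describe itself to the blkio controller. The callback bodies are stubs, and the registration call is assumed to be a helper declared under the "Blkio controller policy registration" comment above; treat the name as an assumption.

/* Sketch only, kernel context assumed: a policy advertising the per-policy callbacks added above. */
#include "blk-cgroup.h"

static void demo_unlink_group(void *key, struct blkio_group *blkg) { }
static void demo_update_read_bps(void *key, struct blkio_group *blkg, u64 bps) { }
static void demo_update_write_bps(void *key, struct blkio_group *blkg, u64 bps) { }
static void demo_update_read_iops(void *key, struct blkio_group *blkg, unsigned int iops) { }
static void demo_update_write_iops(void *key, struct blkio_group *blkg, unsigned int iops) { }

static struct blkio_policy_type demo_policy = {
	.ops = {
		.blkio_unlink_group_fn			= demo_unlink_group,
		.blkio_update_group_read_bps_fn		= demo_update_read_bps,
		.blkio_update_group_write_bps_fn	= demo_update_write_bps,
		.blkio_update_group_read_iops_fn	= demo_update_read_iops,
		.blkio_update_group_write_iops_fn	= demo_update_write_iops,
	},
	/* tells blk-cgroup which cgroup files and blkio_groups belong to us */
	.plid = BLKIO_POLICY_THROTL,
};

/* In the policy's init path (assumed helper name): blkio_policy_register(&demo_policy); */

Because the update and unlink callbacks are now dispatched only when blkiop->plid matches blkg->plid (see blkio_update_group_bps() and the unlink loop in blkiocg_destroy() in blk-cgroup.c above), a policy only ever sees the groups it created.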
diff --git a/block/blk-core.c b/block/blk-core.c
index 32a1c123dfb3..4ce953f1b390 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -136,7 +136,7 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
 {
 	struct request_queue *q = rq->q;
 
-	if (&q->bar_rq != rq) {
+	if (&q->flush_rq != rq) {
 		if (error)
 			clear_bit(BIO_UPTODATE, &bio->bi_flags);
 		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
@@ -160,13 +160,12 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
 		if (bio->bi_size == 0)
 			bio_endio(bio, error);
 	} else {
-
 		/*
-		 * Okay, this is the barrier request in progress, just
-		 * record the error;
+		 * Okay, this is the sequenced flush request in
+		 * progress, just record the error;
 		 */
-		if (error && !q->orderr)
-			q->orderr = error;
+		if (error && !q->flush_err)
+			q->flush_err = error;
 	}
 }
 
@@ -382,6 +381,7 @@ void blk_sync_queue(struct request_queue *q)
 	del_timer_sync(&q->unplug_timer);
 	del_timer_sync(&q->timeout);
 	cancel_work_sync(&q->unplug_work);
+	throtl_shutdown_timer_wq(q);
 }
 EXPORT_SYMBOL(blk_sync_queue);
 
@@ -515,11 +515,17 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 		return NULL;
 	}
 
+	if (blk_throtl_init(q)) {
+		kmem_cache_free(blk_requestq_cachep, q);
+		return NULL;
+	}
+
 	setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
 		    laptop_mode_timer_fn, (unsigned long) q);
 	init_timer(&q->unplug_timer);
 	setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
 	INIT_LIST_HEAD(&q->timeout_list);
+	INIT_LIST_HEAD(&q->pending_flushes);
 	INIT_WORK(&q->unplug_work, blk_unplug_work);
 
 	kobject_init(&q->kobj, &blk_queue_ktype);
@@ -1037,22 +1043,6 @@ void blk_insert_request(struct request_queue *q, struct request *rq,
 }
 EXPORT_SYMBOL(blk_insert_request);
 
-/*
- * add-request adds a request to the linked list.
- * queue lock is held and interrupts disabled, as we muck with the
- * request queue list.
- */
-static inline void add_request(struct request_queue *q, struct request *req)
-{
-	drive_stat_acct(req, 1);
-
-	/*
-	 * elevator indicated where it wants this request to be
-	 * inserted at elevator_merge time
-	 */
-	__elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
-}
-
 static void part_round_stats_single(int cpu, struct hd_struct *part,
 				    unsigned long now)
 {
@@ -1201,13 +1191,9 @@ static int __make_request(struct request_queue *q, struct bio *bio)
1201 const bool sync = !!(bio->bi_rw & REQ_SYNC); 1191 const bool sync = !!(bio->bi_rw & REQ_SYNC);
1202 const bool unplug = !!(bio->bi_rw & REQ_UNPLUG); 1192 const bool unplug = !!(bio->bi_rw & REQ_UNPLUG);
1203 const unsigned long ff = bio->bi_rw & REQ_FAILFAST_MASK; 1193 const unsigned long ff = bio->bi_rw & REQ_FAILFAST_MASK;
1194 int where = ELEVATOR_INSERT_SORT;
1204 int rw_flags; 1195 int rw_flags;
1205 1196
1206 if ((bio->bi_rw & REQ_HARDBARRIER) &&
1207 (q->next_ordered == QUEUE_ORDERED_NONE)) {
1208 bio_endio(bio, -EOPNOTSUPP);
1209 return 0;
1210 }
1211 /* 1197 /*
1212 * low level driver can indicate that it wants pages above a 1198 * low level driver can indicate that it wants pages above a
1213 * certain limit bounced to low memory (ie for highmem, or even 1199 * certain limit bounced to low memory (ie for highmem, or even
@@ -1217,7 +1203,12 @@ static int __make_request(struct request_queue *q, struct bio *bio)
1217 1203
1218 spin_lock_irq(q->queue_lock); 1204 spin_lock_irq(q->queue_lock);
1219 1205
1220 if (unlikely((bio->bi_rw & REQ_HARDBARRIER)) || elv_queue_empty(q)) 1206 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
1207 where = ELEVATOR_INSERT_FRONT;
1208 goto get_rq;
1209 }
1210
1211 if (elv_queue_empty(q))
1221 goto get_rq; 1212 goto get_rq;
1222 1213
1223 el_ret = elv_merge(q, &req, bio); 1214 el_ret = elv_merge(q, &req, bio);
@@ -1314,7 +1305,10 @@ get_rq:
1314 req->cpu = blk_cpu_to_group(smp_processor_id()); 1305 req->cpu = blk_cpu_to_group(smp_processor_id());
1315 if (queue_should_plug(q) && elv_queue_empty(q)) 1306 if (queue_should_plug(q) && elv_queue_empty(q))
1316 blk_plug_device(q); 1307 blk_plug_device(q);
1317 add_request(q, req); 1308
1309 /* insert the request into the elevator */
1310 drive_stat_acct(req, 1);
1311 __elv_add_request(q, req, where, 0);
1318out: 1312out:
1319 if (unplug || !queue_should_plug(q)) 1313 if (unplug || !queue_should_plug(q))
1320 __generic_unplug_device(q); 1314 __generic_unplug_device(q);
@@ -1350,7 +1344,7 @@ static void handle_bad_sector(struct bio *bio)
1350 bdevname(bio->bi_bdev, b), 1344 bdevname(bio->bi_bdev, b),
1351 bio->bi_rw, 1345 bio->bi_rw,
1352 (unsigned long long)bio->bi_sector + bio_sectors(bio), 1346 (unsigned long long)bio->bi_sector + bio_sectors(bio),
1353 (long long)(bio->bi_bdev->bd_inode->i_size >> 9)); 1347 (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9));
1354 1348
1355 set_bit(BIO_EOF, &bio->bi_flags); 1349 set_bit(BIO_EOF, &bio->bi_flags);
1356} 1350}
@@ -1403,7 +1397,7 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
1403 return 0; 1397 return 0;
1404 1398
1405 /* Test device or partition size, when known. */ 1399 /* Test device or partition size, when known. */
1406 maxsector = bio->bi_bdev->bd_inode->i_size >> 9; 1400 maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
1407 if (maxsector) { 1401 if (maxsector) {
1408 sector_t sector = bio->bi_sector; 1402 sector_t sector = bio->bi_sector;
1409 1403
@@ -1514,6 +1508,19 @@ static inline void __generic_make_request(struct bio *bio)
1514 if (bio_check_eod(bio, nr_sectors)) 1508 if (bio_check_eod(bio, nr_sectors))
1515 goto end_io; 1509 goto end_io;
1516 1510
1511 /*
1512 * Filter flush bio's early so that make_request based
1513 * drivers without flush support don't have to worry
1514 * about them.
1515 */
1516 if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
1517 bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
1518 if (!nr_sectors) {
1519 err = 0;
1520 goto end_io;
1521 }
1522 }
1523
1517 if ((bio->bi_rw & REQ_DISCARD) && 1524 if ((bio->bi_rw & REQ_DISCARD) &&
1518 (!blk_queue_discard(q) || 1525 (!blk_queue_discard(q) ||
1519 ((bio->bi_rw & REQ_SECURE) && 1526 ((bio->bi_rw & REQ_SECURE) &&
@@ -1522,6 +1529,15 @@ static inline void __generic_make_request(struct bio *bio)
1522 goto end_io; 1529 goto end_io;
1523 } 1530 }
1524 1531
1532 blk_throtl_bio(q, &bio);
1533
1534 /*
1535 * If bio = NULL, bio has been throttled and will be submitted
1536 * later.
1537 */
1538 if (!bio)
1539 break;
1540
1525 trace_block_bio_queue(q, bio); 1541 trace_block_bio_queue(q, bio);
1526 1542
1527 ret = q->make_request_fn(q, bio); 1543 ret = q->make_request_fn(q, bio);
@@ -1612,11 +1628,12 @@ void submit_bio(int rw, struct bio *bio)
1612 1628
1613 if (unlikely(block_dump)) { 1629 if (unlikely(block_dump)) {
1614 char b[BDEVNAME_SIZE]; 1630 char b[BDEVNAME_SIZE];
1615 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n", 1631 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
1616 current->comm, task_pid_nr(current), 1632 current->comm, task_pid_nr(current),
1617 (rw & WRITE) ? "WRITE" : "READ", 1633 (rw & WRITE) ? "WRITE" : "READ",
1618 (unsigned long long)bio->bi_sector, 1634 (unsigned long long)bio->bi_sector,
1619 bdevname(bio->bi_bdev, b)); 1635 bdevname(bio->bi_bdev, b),
1636 count);
1620 } 1637 }
1621 } 1638 }
1622 1639
@@ -1637,7 +1654,7 @@ EXPORT_SYMBOL(submit_bio);
1637 * the insertion using this generic function. 1654 * the insertion using this generic function.
1638 * 1655 *
1639 * This function should also be useful for request stacking drivers 1656 * This function should also be useful for request stacking drivers
1640 * in some cases below, so export this fuction. 1657 * in some cases below, so export this function.
1641 * Request stacking drivers like request-based dm may change the queue 1658 * Request stacking drivers like request-based dm may change the queue
1642 * limits while requests are in the queue (e.g. dm's table swapping). 1659 * limits while requests are in the queue (e.g. dm's table swapping).
1643 * Such request stacking drivers should check those requests against 1660 * Such request stacking drivers should check those requests against
@@ -1768,11 +1785,11 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
1768static void blk_account_io_done(struct request *req) 1785static void blk_account_io_done(struct request *req)
1769{ 1786{
1770 /* 1787 /*
1771 * Account IO completion. bar_rq isn't accounted as a normal 1788 * Account IO completion. flush_rq isn't accounted as a
1772 * IO on queueing nor completion. Accounting the containing 1789 * normal IO on queueing nor completion. Accounting the
1773 * request is enough. 1790 * containing request is enough.
1774 */ 1791 */
1775 if (blk_do_io_stat(req) && req != &req->q->bar_rq) { 1792 if (blk_do_io_stat(req) && req != &req->q->flush_rq) {
1776 unsigned long duration = jiffies - req->start_time; 1793 unsigned long duration = jiffies - req->start_time;
1777 const int rw = rq_data_dir(req); 1794 const int rw = rq_data_dir(req);
1778 struct hd_struct *part; 1795 struct hd_struct *part;
@@ -2497,9 +2514,7 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
2497static void __blk_rq_prep_clone(struct request *dst, struct request *src) 2514static void __blk_rq_prep_clone(struct request *dst, struct request *src)
2498{ 2515{
2499 dst->cpu = src->cpu; 2516 dst->cpu = src->cpu;
2500 dst->cmd_flags = (rq_data_dir(src) | REQ_NOMERGE); 2517 dst->cmd_flags = (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE;
2501 if (src->cmd_flags & REQ_DISCARD)
2502 dst->cmd_flags |= REQ_DISCARD;
2503 dst->cmd_type = src->cmd_type; 2518 dst->cmd_type = src->cmd_type;
2504 dst->__sector = blk_rq_pos(src); 2519 dst->__sector = blk_rq_pos(src);
2505 dst->__data_len = blk_rq_bytes(src); 2520 dst->__data_len = blk_rq_bytes(src);
@@ -2579,6 +2594,13 @@ int kblockd_schedule_work(struct request_queue *q, struct work_struct *work)
2579} 2594}
2580EXPORT_SYMBOL(kblockd_schedule_work); 2595EXPORT_SYMBOL(kblockd_schedule_work);
2581 2596
2597int kblockd_schedule_delayed_work(struct request_queue *q,
2598 struct delayed_work *dwork, unsigned long delay)
2599{
2600 return queue_delayed_work(kblockd_workqueue, dwork, delay);
2601}
2602EXPORT_SYMBOL(kblockd_schedule_delayed_work);
2603
2582int __init blk_dev_init(void) 2604int __init blk_dev_init(void)
2583{ 2605{
2584 BUILD_BUG_ON(__REQ_NR_BITS > 8 * 2606 BUILD_BUG_ON(__REQ_NR_BITS > 8 *
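The blk-core.c hunks above replace the old REQ_HARDBARRIER rejection with early filtering of REQ_FLUSH/REQ_FUA bios on queues that report no flush capability, and front-insert the ones that survive. Below is a minimal user-space sketch of that filtering decision only; the REQ_* values and the fake_queue/fake_bio types are illustrative stand-ins, not the kernel's definitions.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-ins for the kernel's REQ_* bits and queue state. */
#define REQ_FLUSH (1u << 0)
#define REQ_FUA   (1u << 1)

struct fake_queue { unsigned int flush_flags; };  /* what the device supports */
struct fake_bio   { unsigned int rw; unsigned int nr_sectors; };

/*
 * Mirrors the filter added to __generic_make_request(): if the queue
 * supports no flushing, strip FLUSH/FUA; an empty flush then completes
 * immediately (return false = do not queue it).
 */
static bool filter_flush_bio(struct fake_queue *q, struct fake_bio *bio)
{
	if ((bio->rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
		bio->rw &= ~(REQ_FLUSH | REQ_FUA);
		if (!bio->nr_sectors)
			return false;	/* nothing left to do */
	}
	return true;			/* queue the (possibly modified) bio */
}

int main(void)
{
	struct fake_queue q = { .flush_flags = 0 };
	struct fake_bio empty_flush = { .rw = REQ_FLUSH, .nr_sectors = 0 };
	struct fake_bio data_fua = { .rw = REQ_FUA, .nr_sectors = 8 };

	printf("empty flush queued: %d\n", filter_flush_bio(&q, &empty_flush)); /* 0 */
	printf("FUA write queued:   %d (rw=%u)\n",
	       filter_flush_bio(&q, &data_fua), data_fua.rw);                   /* 1, rw=0 */
	return 0;
}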
diff --git a/block/blk-exec.c b/block/blk-exec.c
index e1672f14840e..cf1456a02acd 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -80,6 +80,7 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
80 DECLARE_COMPLETION_ONSTACK(wait); 80 DECLARE_COMPLETION_ONSTACK(wait);
81 char sense[SCSI_SENSE_BUFFERSIZE]; 81 char sense[SCSI_SENSE_BUFFERSIZE];
82 int err = 0; 82 int err = 0;
83 unsigned long hang_check;
83 84
84 /* 85 /*
85 * we need an extra reference to the request, so we can look at 86 * we need an extra reference to the request, so we can look at
@@ -95,7 +96,13 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
95 96
96 rq->end_io_data = &wait; 97 rq->end_io_data = &wait;
97 blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); 98 blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq);
98 wait_for_completion(&wait); 99
100 /* Prevent hang_check timer from firing at us during very long I/O */
101 hang_check = sysctl_hung_task_timeout_secs;
102 if (hang_check)
103 while (!wait_for_completion_timeout(&wait, hang_check * (HZ/2)));
104 else
105 wait_for_completion(&wait);
99 106
100 if (rq->errors) 107 if (rq->errors)
101 err = -EIO; 108 err = -EIO;
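The blk_execute_rq() change waits in chunks of half the hung-task timeout, so the watchdog never observes one long uninterrupted sleep while the request is still in flight. A rough user-space model of the same pattern, assuming a hypothetical 2-second watchdog period and a POSIX semaphore in place of the kernel completion:

#include <semaphore.h>
#include <pthread.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

/* Hypothetical watchdog period standing in for sysctl_hung_task_timeout_secs. */
static const unsigned int hang_check_secs = 2;

static sem_t done;

static void *worker(void *arg)
{
	(void)arg;
	sleep(5);		/* pretend this is a very long I/O */
	sem_post(&done);
	return NULL;
}

int main(void)
{
	pthread_t t;

	sem_init(&done, 0, 0);
	pthread_create(&t, NULL, worker, NULL);

	/*
	 * Same idea as in blk_execute_rq(): wake up at least once per
	 * hang_check/2 so a "task stuck" watchdog stays quiet, but keep
	 * waiting until the work really finishes.
	 */
	for (;;) {
		struct timespec ts;

		clock_gettime(CLOCK_REALTIME, &ts);
		ts.tv_sec += hang_check_secs / 2 ? hang_check_secs / 2 : 1;
		if (sem_timedwait(&done, &ts) == 0)
			break;
		puts("still waiting, watchdog stays quiet");
	}
	pthread_join(t, NULL);
	puts("request completed");
	return 0;
}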
diff --git a/block/blk-flush.c b/block/blk-flush.c
new file mode 100644
index 000000000000..54b123d6563e
--- /dev/null
+++ b/block/blk-flush.c
@@ -0,0 +1,262 @@
1/*
2 * Functions to sequence FLUSH and FUA writes.
3 */
4#include <linux/kernel.h>
5#include <linux/module.h>
6#include <linux/bio.h>
7#include <linux/blkdev.h>
8#include <linux/gfp.h>
9
10#include "blk.h"
11
12/* FLUSH/FUA sequences */
13enum {
14 QUEUE_FSEQ_STARTED = (1 << 0), /* flushing in progress */
15 QUEUE_FSEQ_PREFLUSH = (1 << 1), /* pre-flushing in progress */
16 QUEUE_FSEQ_DATA = (1 << 2), /* data write in progress */
17 QUEUE_FSEQ_POSTFLUSH = (1 << 3), /* post-flushing in progress */
18 QUEUE_FSEQ_DONE = (1 << 4),
19};
20
21static struct request *queue_next_fseq(struct request_queue *q);
22
23unsigned blk_flush_cur_seq(struct request_queue *q)
24{
25 if (!q->flush_seq)
26 return 0;
27 return 1 << ffz(q->flush_seq);
28}
29
30static struct request *blk_flush_complete_seq(struct request_queue *q,
31 unsigned seq, int error)
32{
33 struct request *next_rq = NULL;
34
35 if (error && !q->flush_err)
36 q->flush_err = error;
37
38 BUG_ON(q->flush_seq & seq);
39 q->flush_seq |= seq;
40
41 if (blk_flush_cur_seq(q) != QUEUE_FSEQ_DONE) {
42 /* not complete yet, queue the next flush sequence */
43 next_rq = queue_next_fseq(q);
44 } else {
45 /* complete this flush request */
46 __blk_end_request_all(q->orig_flush_rq, q->flush_err);
47 q->orig_flush_rq = NULL;
48 q->flush_seq = 0;
49
50 /* dispatch the next flush if there's one */
51 if (!list_empty(&q->pending_flushes)) {
52 next_rq = list_entry_rq(q->pending_flushes.next);
53 list_move(&next_rq->queuelist, &q->queue_head);
54 }
55 }
56 return next_rq;
57}
58
59static void blk_flush_complete_seq_end_io(struct request_queue *q,
60 unsigned seq, int error)
61{
62 bool was_empty = elv_queue_empty(q);
63 struct request *next_rq;
64
65 next_rq = blk_flush_complete_seq(q, seq, error);
66
67 /*
68 * Moving a request silently to empty queue_head may stall the
69 * queue. Kick the queue in those cases.
70 */
71 if (was_empty && next_rq)
72 __blk_run_queue(q);
73}
74
75static void pre_flush_end_io(struct request *rq, int error)
76{
77 elv_completed_request(rq->q, rq);
78 blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_PREFLUSH, error);
79}
80
81static void flush_data_end_io(struct request *rq, int error)
82{
83 elv_completed_request(rq->q, rq);
84 blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_DATA, error);
85}
86
87static void post_flush_end_io(struct request *rq, int error)
88{
89 elv_completed_request(rq->q, rq);
90 blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_POSTFLUSH, error);
91}
92
93static void init_flush_request(struct request *rq, struct gendisk *disk)
94{
95 rq->cmd_type = REQ_TYPE_FS;
96 rq->cmd_flags = WRITE_FLUSH;
97 rq->rq_disk = disk;
98}
99
100static struct request *queue_next_fseq(struct request_queue *q)
101{
102 struct request *orig_rq = q->orig_flush_rq;
103 struct request *rq = &q->flush_rq;
104
105 blk_rq_init(q, rq);
106
107 switch (blk_flush_cur_seq(q)) {
108 case QUEUE_FSEQ_PREFLUSH:
109 init_flush_request(rq, orig_rq->rq_disk);
110 rq->end_io = pre_flush_end_io;
111 break;
112 case QUEUE_FSEQ_DATA:
113 init_request_from_bio(rq, orig_rq->bio);
114 /*
115 * orig_rq->rq_disk may be different from
116 * bio->bi_bdev->bd_disk if orig_rq got here through
117 * remapping drivers. Make sure rq->rq_disk points
118 * to the same one as orig_rq.
119 */
120 rq->rq_disk = orig_rq->rq_disk;
121 rq->cmd_flags &= ~(REQ_FLUSH | REQ_FUA);
122 rq->cmd_flags |= orig_rq->cmd_flags & (REQ_FLUSH | REQ_FUA);
123 rq->end_io = flush_data_end_io;
124 break;
125 case QUEUE_FSEQ_POSTFLUSH:
126 init_flush_request(rq, orig_rq->rq_disk);
127 rq->end_io = post_flush_end_io;
128 break;
129 default:
130 BUG();
131 }
132
133 elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
134 return rq;
135}
136
137struct request *blk_do_flush(struct request_queue *q, struct request *rq)
138{
139 unsigned int fflags = q->flush_flags; /* may change, cache it */
140 bool has_flush = fflags & REQ_FLUSH, has_fua = fflags & REQ_FUA;
141 bool do_preflush = has_flush && (rq->cmd_flags & REQ_FLUSH);
142 bool do_postflush = has_flush && !has_fua && (rq->cmd_flags & REQ_FUA);
143 unsigned skip = 0;
144
145 /*
146 * Special case. If there's data but flush is not necessary,
147 * the request can be issued directly.
148 *
149 * Flush w/o data should be able to be issued directly too but
150 * currently some drivers assume that rq->bio contains
151 * non-zero data if it isn't NULL and empty FLUSH requests
152 * getting here usually have bio's without data.
153 */
154 if (blk_rq_sectors(rq) && !do_preflush && !do_postflush) {
155 rq->cmd_flags &= ~REQ_FLUSH;
156 if (!has_fua)
157 rq->cmd_flags &= ~REQ_FUA;
158 return rq;
159 }
160
161 /*
162 * Sequenced flushes can't be processed in parallel. If
163 * another one is already in progress, queue for later
164 * processing.
165 */
166 if (q->flush_seq) {
167 list_move_tail(&rq->queuelist, &q->pending_flushes);
168 return NULL;
169 }
170
171 /*
172 * Start a new flush sequence
173 */
174 q->flush_err = 0;
175 q->flush_seq |= QUEUE_FSEQ_STARTED;
176
177 /* adjust FLUSH/FUA of the original request and stash it away */
178 rq->cmd_flags &= ~REQ_FLUSH;
179 if (!has_fua)
180 rq->cmd_flags &= ~REQ_FUA;
181 blk_dequeue_request(rq);
182 q->orig_flush_rq = rq;
183
184 /* skip unneeded sequences and return the first one */
185 if (!do_preflush)
186 skip |= QUEUE_FSEQ_PREFLUSH;
187 if (!blk_rq_sectors(rq))
188 skip |= QUEUE_FSEQ_DATA;
189 if (!do_postflush)
190 skip |= QUEUE_FSEQ_POSTFLUSH;
191 return blk_flush_complete_seq(q, skip, 0);
192}
193
194static void bio_end_flush(struct bio *bio, int err)
195{
196 if (err)
197 clear_bit(BIO_UPTODATE, &bio->bi_flags);
198 if (bio->bi_private)
199 complete(bio->bi_private);
200 bio_put(bio);
201}
202
203/**
204 * blkdev_issue_flush - queue a flush
205 * @bdev: blockdev to issue flush for
206 * @gfp_mask: memory allocation flags (for bio_alloc)
207 * @error_sector: error sector
208 *
209 * Description:
210 * Issue a flush for the block device in question. Caller can supply
211 * room for storing the error offset in case of a flush error, if they
212 * wish to. The flush is issued and waited upon synchronously; any
213 * error is reported through the return value.
214 */
215int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
216 sector_t *error_sector)
217{
218 DECLARE_COMPLETION_ONSTACK(wait);
219 struct request_queue *q;
220 struct bio *bio;
221 int ret = 0;
222
223 if (bdev->bd_disk == NULL)
224 return -ENXIO;
225
226 q = bdev_get_queue(bdev);
227 if (!q)
228 return -ENXIO;
229
230 /*
231 * some block devices may not have their queue correctly set up here
232 * (e.g. loop device without a backing file) and so issuing a flush
233 * here will panic. Ensure there is a request function before issuing
234 * the flush.
235 */
236 if (!q->make_request_fn)
237 return -ENXIO;
238
239 bio = bio_alloc(gfp_mask, 0);
240 bio->bi_end_io = bio_end_flush;
241 bio->bi_bdev = bdev;
242 bio->bi_private = &wait;
243
244 bio_get(bio);
245 submit_bio(WRITE_FLUSH, bio);
246 wait_for_completion(&wait);
247
248 /*
249 * The driver must store the error location in ->bi_sector, if
250 * it supports it. For non-stacked drivers, this should be
251 * copied from blk_rq_pos(rq).
252 */
253 if (error_sector)
254 *error_sector = bio->bi_sector;
255
256 if (!bio_flagged(bio, BIO_UPTODATE))
257 ret = -EIO;
258
259 bio_put(bio);
260 return ret;
261}
262EXPORT_SYMBOL(blkdev_issue_flush);
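blk_do_flush() above decomposes a FLUSH/FUA request into at most three steps (pre-flush, data write, post-flush) and skips whichever steps the device's advertised flush_flags make unnecessary. A small stand-alone C sketch of that decision follows; the flag values are made up for illustration, not the kernel's REQ_*/QUEUE_FSEQ_* bits.

#include <stdio.h>

/* Illustrative flag values. */
#define REQ_FLUSH       (1u << 0)
#define REQ_FUA         (1u << 1)

#define FSEQ_PREFLUSH   (1u << 1)
#define FSEQ_DATA       (1u << 2)
#define FSEQ_POSTFLUSH  (1u << 3)

/*
 * Same decision blk_do_flush() makes: given what the device supports
 * (queue_flags) and what the request asks for (rq_flags, has_data),
 * which of the PREFLUSH -> DATA -> POSTFLUSH steps are actually needed?
 */
static unsigned int needed_steps(unsigned int queue_flags,
				 unsigned int rq_flags, int has_data)
{
	int has_flush = queue_flags & REQ_FLUSH;
	int has_fua = queue_flags & REQ_FUA;
	int do_preflush = has_flush && (rq_flags & REQ_FLUSH);
	int do_postflush = has_flush && !has_fua && (rq_flags & REQ_FUA);
	unsigned int steps = 0;

	if (do_preflush)
		steps |= FSEQ_PREFLUSH;
	if (has_data)
		steps |= FSEQ_DATA;
	if (do_postflush)
		steps |= FSEQ_POSTFLUSH;
	return steps;
}

int main(void)
{
	/* FUA write, device has FLUSH but no FUA: data write plus post-flush. */
	printf("%#x\n", needed_steps(REQ_FLUSH, REQ_FUA, 1));
	/* FUA write, device has FLUSH and FUA: data only, FUA passes through. */
	printf("%#x\n", needed_steps(REQ_FLUSH | REQ_FUA, REQ_FUA, 1));
	return 0;
}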
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index edce1ef7933d..54bcba6c02a7 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -32,24 +32,37 @@ static struct kmem_cache *integrity_cachep;
32 32
33/** 33/**
34 * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements 34 * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements
35 * @rq: request with integrity metadata attached 35 * @q: request queue
36 * @bio: bio with integrity metadata attached
36 * 37 *
37 * Description: Returns the number of elements required in a 38 * Description: Returns the number of elements required in a
38 * scatterlist corresponding to the integrity metadata in a request. 39 * scatterlist corresponding to the integrity metadata in a bio.
39 */ 40 */
40int blk_rq_count_integrity_sg(struct request *rq) 41int blk_rq_count_integrity_sg(struct request_queue *q, struct bio *bio)
41{ 42{
42 struct bio_vec *iv, *ivprv; 43 struct bio_vec *iv, *ivprv = NULL;
43 struct req_iterator iter; 44 unsigned int segments = 0;
44 unsigned int segments; 45 unsigned int seg_size = 0;
46 unsigned int i = 0;
45 47
46 ivprv = NULL; 48 bio_for_each_integrity_vec(iv, bio, i) {
47 segments = 0;
48 49
49 rq_for_each_integrity_segment(iv, rq, iter) { 50 if (ivprv) {
51 if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv))
52 goto new_segment;
53
54 if (!BIOVEC_SEG_BOUNDARY(q, ivprv, iv))
55 goto new_segment;
50 56
51 if (!ivprv || !BIOVEC_PHYS_MERGEABLE(ivprv, iv)) 57 if (seg_size + iv->bv_len > queue_max_segment_size(q))
58 goto new_segment;
59
60 seg_size += iv->bv_len;
61 } else {
62new_segment:
52 segments++; 63 segments++;
64 seg_size = iv->bv_len;
65 }
53 66
54 ivprv = iv; 67 ivprv = iv;
55 } 68 }
@@ -60,30 +73,34 @@ EXPORT_SYMBOL(blk_rq_count_integrity_sg);
60 73
61/** 74/**
62 * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist 75 * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist
63 * @rq: request with integrity metadata attached 76 * @q: request queue
77 * @bio: bio with integrity metadata attached
64 * @sglist: target scatterlist 78 * @sglist: target scatterlist
65 * 79 *
66 * Description: Map the integrity vectors in request into a 80 * Description: Map the integrity vectors in request into a
67 * scatterlist. The scatterlist must be big enough to hold all 81 * scatterlist. The scatterlist must be big enough to hold all
68 * elements. I.e. sized using blk_rq_count_integrity_sg(). 82 * elements. I.e. sized using blk_rq_count_integrity_sg().
69 */ 83 */
70int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist) 84int blk_rq_map_integrity_sg(struct request_queue *q, struct bio *bio,
85 struct scatterlist *sglist)
71{ 86{
72 struct bio_vec *iv, *ivprv; 87 struct bio_vec *iv, *ivprv = NULL;
73 struct req_iterator iter; 88 struct scatterlist *sg = NULL;
74 struct scatterlist *sg; 89 unsigned int segments = 0;
75 unsigned int segments; 90 unsigned int i = 0;
76
77 ivprv = NULL;
78 sg = NULL;
79 segments = 0;
80 91
81 rq_for_each_integrity_segment(iv, rq, iter) { 92 bio_for_each_integrity_vec(iv, bio, i) {
82 93
83 if (ivprv) { 94 if (ivprv) {
84 if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv)) 95 if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv))
85 goto new_segment; 96 goto new_segment;
86 97
98 if (!BIOVEC_SEG_BOUNDARY(q, ivprv, iv))
99 goto new_segment;
100
101 if (sg->length + iv->bv_len > queue_max_segment_size(q))
102 goto new_segment;
103
87 sg->length += iv->bv_len; 104 sg->length += iv->bv_len;
88 } else { 105 } else {
89new_segment: 106new_segment:
@@ -162,6 +179,40 @@ int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2)
162} 179}
163EXPORT_SYMBOL(blk_integrity_compare); 180EXPORT_SYMBOL(blk_integrity_compare);
164 181
182int blk_integrity_merge_rq(struct request_queue *q, struct request *req,
183 struct request *next)
184{
185 if (blk_integrity_rq(req) != blk_integrity_rq(next))
186 return -1;
187
188 if (req->nr_integrity_segments + next->nr_integrity_segments >
189 q->limits.max_integrity_segments)
190 return -1;
191
192 return 0;
193}
194EXPORT_SYMBOL(blk_integrity_merge_rq);
195
196int blk_integrity_merge_bio(struct request_queue *q, struct request *req,
197 struct bio *bio)
198{
199 int nr_integrity_segs;
200 struct bio *next = bio->bi_next;
201
202 bio->bi_next = NULL;
203 nr_integrity_segs = blk_rq_count_integrity_sg(q, bio);
204 bio->bi_next = next;
205
206 if (req->nr_integrity_segments + nr_integrity_segs >
207 q->limits.max_integrity_segments)
208 return -1;
209
210 req->nr_integrity_segments += nr_integrity_segs;
211
212 return 0;
213}
214EXPORT_SYMBOL(blk_integrity_merge_bio);
215
165struct integrity_sysfs_entry { 216struct integrity_sysfs_entry {
166 struct attribute attr; 217 struct attribute attr;
167 ssize_t (*show)(struct blk_integrity *, char *); 218 ssize_t (*show)(struct blk_integrity *, char *);
@@ -381,7 +432,6 @@ void blk_integrity_unregister(struct gendisk *disk)
381 kobject_uevent(&bi->kobj, KOBJ_REMOVE); 432 kobject_uevent(&bi->kobj, KOBJ_REMOVE);
382 kobject_del(&bi->kobj); 433 kobject_del(&bi->kobj);
383 kobject_put(&bi->kobj); 434 kobject_put(&bi->kobj);
384 kmem_cache_free(integrity_cachep, bi);
385 disk->integrity = NULL; 435 disk->integrity = NULL;
386} 436}
387EXPORT_SYMBOL(blk_integrity_unregister); 437EXPORT_SYMBOL(blk_integrity_unregister);
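The reworked blk_rq_count_integrity_sg() walks the integrity bio_vecs and starts a new scatterlist segment whenever two vectors cannot be merged or the merged segment would exceed the queue's maximum segment size. A simplified, compilable model of that counting loop is below; it checks only physical contiguity and a fixed size cap, not the kernel's BIOVEC_SEG_BOUNDARY test.

#include <stdio.h>

/* Simplified bio_vec: a physically contiguous buffer fragment. */
struct vec { unsigned long addr; unsigned int len; };

static unsigned int count_segments(const struct vec *v, unsigned int n,
				   unsigned int max_seg_size)
{
	unsigned int segments = 0, seg_size = 0, i;
	const struct vec *prev = NULL;

	for (i = 0; i < n; i++) {
		if (prev &&
		    prev->addr + prev->len == v[i].addr &&
		    seg_size + v[i].len <= max_seg_size) {
			seg_size += v[i].len;	/* merge into current segment */
		} else {
			segments++;		/* start a new segment */
			seg_size = v[i].len;
		}
		prev = &v[i];
	}
	return segments;
}

int main(void)
{
	struct vec v[] = {
		{ 0x1000, 512 }, { 0x1200, 512 },	/* contiguous: one segment */
		{ 0x8000, 512 },			/* gap: new segment */
	};

	printf("%u segments\n", count_segments(v, 3, 4096));	/* prints 2 */
	return 0;
}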
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index d22c4c55c406..3c7a339fe381 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -153,20 +153,6 @@ struct io_context *get_io_context(gfp_t gfp_flags, int node)
153} 153}
154EXPORT_SYMBOL(get_io_context); 154EXPORT_SYMBOL(get_io_context);
155 155
156void copy_io_context(struct io_context **pdst, struct io_context **psrc)
157{
158 struct io_context *src = *psrc;
159 struct io_context *dst = *pdst;
160
161 if (src) {
162 BUG_ON(atomic_long_read(&src->refcount) == 0);
163 atomic_long_inc(&src->refcount);
164 put_io_context(dst);
165 *pdst = src;
166 }
167}
168EXPORT_SYMBOL(copy_io_context);
169
170static int __init blk_ioc_init(void) 156static int __init blk_ioc_init(void)
171{ 157{
172 iocontext_cachep = kmem_cache_create("blkdev_ioc", 158 iocontext_cachep = kmem_cache_create("blkdev_ioc",
diff --git a/block/blk-lib.c b/block/blk-lib.c
index c392029a104e..1a320d2406b0 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -39,8 +39,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
39{ 39{
40 DECLARE_COMPLETION_ONSTACK(wait); 40 DECLARE_COMPLETION_ONSTACK(wait);
41 struct request_queue *q = bdev_get_queue(bdev); 41 struct request_queue *q = bdev_get_queue(bdev);
42 int type = flags & BLKDEV_IFL_BARRIER ? 42 int type = REQ_WRITE | REQ_DISCARD;
43 DISCARD_BARRIER : DISCARD_NOBARRIER;
44 unsigned int max_discard_sectors; 43 unsigned int max_discard_sectors;
45 struct bio *bio; 44 struct bio *bio;
46 int ret = 0; 45 int ret = 0;
@@ -62,10 +61,10 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
62 max_discard_sectors &= ~(disc_sects - 1); 61 max_discard_sectors &= ~(disc_sects - 1);
63 } 62 }
64 63
65 if (flags & BLKDEV_IFL_SECURE) { 64 if (flags & BLKDEV_DISCARD_SECURE) {
66 if (!blk_queue_secdiscard(q)) 65 if (!blk_queue_secdiscard(q))
67 return -EOPNOTSUPP; 66 return -EOPNOTSUPP;
68 type |= DISCARD_SECURE; 67 type |= REQ_SECURE;
69 } 68 }
70 69
71 while (nr_sects && !ret) { 70 while (nr_sects && !ret) {
@@ -78,8 +77,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
78 bio->bi_sector = sector; 77 bio->bi_sector = sector;
79 bio->bi_end_io = blkdev_discard_end_io; 78 bio->bi_end_io = blkdev_discard_end_io;
80 bio->bi_bdev = bdev; 79 bio->bi_bdev = bdev;
81 if (flags & BLKDEV_IFL_WAIT) 80 bio->bi_private = &wait;
82 bio->bi_private = &wait;
83 81
84 if (nr_sects > max_discard_sectors) { 82 if (nr_sects > max_discard_sectors) {
85 bio->bi_size = max_discard_sectors << 9; 83 bio->bi_size = max_discard_sectors << 9;
@@ -93,8 +91,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
93 bio_get(bio); 91 bio_get(bio);
94 submit_bio(type, bio); 92 submit_bio(type, bio);
95 93
96 if (flags & BLKDEV_IFL_WAIT) 94 wait_for_completion(&wait);
97 wait_for_completion(&wait);
98 95
99 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 96 if (bio_flagged(bio, BIO_EOPNOTSUPP))
100 ret = -EOPNOTSUPP; 97 ret = -EOPNOTSUPP;
@@ -140,7 +137,6 @@ static void bio_batch_end_io(struct bio *bio, int err)
140 * @sector: start sector 137 * @sector: start sector
141 * @nr_sects: number of sectors to write 138 * @nr_sects: number of sectors to write
142 * @gfp_mask: memory allocation flags (for bio_alloc) 139 * @gfp_mask: memory allocation flags (for bio_alloc)
143 * @flags: BLKDEV_IFL_* flags to control behaviour
144 * 140 *
145 * Description: 141 * Description:
146 * Generate and issue number of bios with zerofiled pages. 142 * Generate and issue number of bios with zerofiled pages.
@@ -149,7 +145,7 @@ static void bio_batch_end_io(struct bio *bio, int err)
149 */ 145 */
150 146
151int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, 147int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
152 sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) 148 sector_t nr_sects, gfp_t gfp_mask)
153{ 149{
154 int ret; 150 int ret;
155 struct bio *bio; 151 struct bio *bio;
@@ -162,12 +158,6 @@ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
162 bb.wait = &wait; 158 bb.wait = &wait;
163 bb.end_io = NULL; 159 bb.end_io = NULL;
164 160
165 if (flags & BLKDEV_IFL_BARRIER) {
166 /* issue async barrier before the data */
167 ret = blkdev_issue_flush(bdev, gfp_mask, NULL, 0);
168 if (ret)
169 return ret;
170 }
171submit: 161submit:
172 ret = 0; 162 ret = 0;
173 while (nr_sects != 0) { 163 while (nr_sects != 0) {
@@ -181,8 +171,7 @@ submit:
181 bio->bi_sector = sector; 171 bio->bi_sector = sector;
182 bio->bi_bdev = bdev; 172 bio->bi_bdev = bdev;
183 bio->bi_end_io = bio_batch_end_io; 173 bio->bi_end_io = bio_batch_end_io;
184 if (flags & BLKDEV_IFL_WAIT) 174 bio->bi_private = &bb;
185 bio->bi_private = &bb;
186 175
187 while (nr_sects != 0) { 176 while (nr_sects != 0) {
188 sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects); 177 sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects);
@@ -199,18 +188,10 @@ submit:
199 issued++; 188 issued++;
200 submit_bio(WRITE, bio); 189 submit_bio(WRITE, bio);
201 } 190 }
202 /*
203 * When all data bios are in flight. Send final barrier if requeted.
204 */
205 if (nr_sects == 0 && flags & BLKDEV_IFL_BARRIER)
206 ret = blkdev_issue_flush(bdev, gfp_mask, NULL,
207 flags & BLKDEV_IFL_WAIT);
208
209 191
210 if (flags & BLKDEV_IFL_WAIT) 192 /* Wait for bios in-flight */
211 /* Wait for bios in-flight */ 193 while (issued != atomic_read(&bb.done))
212 while ( issued != atomic_read(&bb.done)) 194 wait_for_completion(&wait);
213 wait_for_completion(&wait);
214 195
215 if (!test_bit(BIO_UPTODATE, &bb.flags)) 196 if (!test_bit(BIO_UPTODATE, &bb.flags))
216 /* One of bios in the batch was completed with error.*/ 197 /* One of bios in the batch was completed with error.*/
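blkdev_issue_discard() now always waits for completion and splits the range into bios of at most max_discard_sectors, with that cap rounded down to a multiple of the discard granularity so no bio straddles a granule. A brief sketch of just the chunking arithmetic, with illustrative numbers (the granularity is assumed to be a power of two, as the masking in the kernel code also assumes):

#include <stdio.h>

typedef unsigned long long sector_t;

static void issue_discard(sector_t sector, sector_t nr_sects,
			  unsigned int max_discard_sectors,
			  unsigned int granularity_sectors)
{
	if (granularity_sectors)
		max_discard_sectors &= ~(granularity_sectors - 1);

	while (nr_sects) {
		sector_t chunk = nr_sects > max_discard_sectors ?
				 max_discard_sectors : nr_sects;

		printf("discard %llu sectors at %llu\n", chunk, sector);
		sector += chunk;
		nr_sects -= chunk;
	}
}

int main(void)
{
	/* cap of 2148 sectors rounds down to 2144 with an 8-sector granule */
	issue_discard(0, 10000, 2048 + 100, 8);
	return 0;
}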
diff --git a/block/blk-map.c b/block/blk-map.c
index ade0a08c9099..5d5dbe47c228 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -54,7 +54,7 @@ static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
54 * direct dma. else, set up kernel bounce buffers 54 * direct dma. else, set up kernel bounce buffers
55 */ 55 */
56 uaddr = (unsigned long) ubuf; 56 uaddr = (unsigned long) ubuf;
57 if (blk_rq_aligned(q, ubuf, len) && !map_data) 57 if (blk_rq_aligned(q, uaddr, len) && !map_data)
58 bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask); 58 bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask);
59 else 59 else
60 bio = bio_copy_user(q, map_data, uaddr, len, reading, gfp_mask); 60 bio = bio_copy_user(q, map_data, uaddr, len, reading, gfp_mask);
@@ -205,6 +205,8 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
205 unaligned = 1; 205 unaligned = 1;
206 break; 206 break;
207 } 207 }
208 if (!iov[i].iov_len)
209 return -EINVAL;
208 } 210 }
209 211
210 if (unaligned || (q->dma_pad_mask & len) || map_data) 212 if (unaligned || (q->dma_pad_mask & len) || map_data)
@@ -288,6 +290,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
288 unsigned int len, gfp_t gfp_mask) 290 unsigned int len, gfp_t gfp_mask)
289{ 291{
290 int reading = rq_data_dir(rq) == READ; 292 int reading = rq_data_dir(rq) == READ;
293 unsigned long addr = (unsigned long) kbuf;
291 int do_copy = 0; 294 int do_copy = 0;
292 struct bio *bio; 295 struct bio *bio;
293 int ret; 296 int ret;
@@ -297,7 +300,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
297 if (!len || !kbuf) 300 if (!len || !kbuf)
298 return -EINVAL; 301 return -EINVAL;
299 302
300 do_copy = !blk_rq_aligned(q, kbuf, len) || object_is_on_stack(kbuf); 303 do_copy = !blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf);
301 if (do_copy) 304 if (do_copy)
302 bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading); 305 bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading);
303 else 306 else
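The blk-map.c changes reject zero-length iovecs up front and base the direct-map versus bounce-copy decision on the numeric buffer address, since blk_rq_aligned() now takes an unsigned long. A hypothetical user-space sketch of that per-iovec check, using a made-up 511-byte DMA alignment mask:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

static bool rq_aligned(unsigned long addr, unsigned long len,
		       unsigned long dma_align_mask)
{
	return !((addr | len) & dma_align_mask);
}

struct iovec_ish { void *base; size_t len; };

/* Zero-length entries fail outright; misaligned ones force a bounce copy. */
static int check_iov(const struct iovec_ish *iov, int n,
		     unsigned long dma_align_mask, bool *need_copy)
{
	int i;

	*need_copy = false;
	for (i = 0; i < n; i++) {
		if (!iov[i].len)
			return -1;	/* -EINVAL in the kernel */
		if (!rq_aligned((unsigned long)iov[i].base, iov[i].len,
				dma_align_mask))
			*need_copy = true;
	}
	return 0;
}

int main(void)
{
	static char buf[1024] __attribute__((aligned(512)));
	struct iovec_ish iov[] = { { buf, 512 }, { buf + 1, 512 } };
	bool copy;
	int ret = check_iov(iov, 2, 511, &copy);

	printf("ret=%d copy=%d\n", ret, copy);	/* ret=0 copy=1 */
	return 0;
}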
diff --git a/block/blk-merge.c b/block/blk-merge.c
index eafc94f68d79..77b7c26df6b5 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -205,12 +205,11 @@ static inline int ll_new_hw_segment(struct request_queue *q,
205{ 205{
206 int nr_phys_segs = bio_phys_segments(q, bio); 206 int nr_phys_segs = bio_phys_segments(q, bio);
207 207
208 if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) { 208 if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q))
209 req->cmd_flags |= REQ_NOMERGE; 209 goto no_merge;
210 if (req == q->last_merge) 210
211 q->last_merge = NULL; 211 if (bio_integrity(bio) && blk_integrity_merge_bio(q, req, bio))
212 return 0; 212 goto no_merge;
213 }
214 213
215 /* 214 /*
216 * This will form the start of a new hw segment. Bump both 215 * This will form the start of a new hw segment. Bump both
@@ -218,6 +217,12 @@ static inline int ll_new_hw_segment(struct request_queue *q,
218 */ 217 */
219 req->nr_phys_segments += nr_phys_segs; 218 req->nr_phys_segments += nr_phys_segs;
220 return 1; 219 return 1;
220
221no_merge:
222 req->cmd_flags |= REQ_NOMERGE;
223 if (req == q->last_merge)
224 q->last_merge = NULL;
225 return 0;
221} 226}
222 227
223int ll_back_merge_fn(struct request_queue *q, struct request *req, 228int ll_back_merge_fn(struct request_queue *q, struct request *req,
@@ -301,6 +306,9 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
301 if (total_phys_segments > queue_max_segments(q)) 306 if (total_phys_segments > queue_max_segments(q))
302 return 0; 307 return 0;
303 308
309 if (blk_integrity_rq(req) && blk_integrity_merge_rq(q, req, next))
310 return 0;
311
304 /* Merge is OK... */ 312 /* Merge is OK... */
305 req->nr_phys_segments = total_phys_segments; 313 req->nr_phys_segments = total_phys_segments;
306 return 1; 314 return 1;
@@ -384,9 +392,6 @@ static int attempt_merge(struct request_queue *q, struct request *req,
384 || next->special) 392 || next->special)
385 return 0; 393 return 0;
386 394
387 if (blk_integrity_rq(req) != blk_integrity_rq(next))
388 return 0;
389
390 /* 395 /*
391 * If we are allowed to merge, then append bio list 396 * If we are allowed to merge, then append bio list
392 * from next to rq and release next. merge_requests_fn 397 * from next to rq and release next. merge_requests_fn
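ll_new_hw_segment() is restructured so that both the data segment count and the new integrity segment count are checked against the queue limits before a bio is merged into a request, with a single no_merge exit path. A toy sketch of the gating condition, with illustrative limit values standing in for queue_max_segments() and q->limits.max_integrity_segments:

#include <stdbool.h>
#include <stdio.h>

struct limits { unsigned int max_segments; unsigned int max_integrity_segments; };

/* Merge only if neither segment budget would be exceeded. */
static bool can_merge(const struct limits *lim,
		      unsigned int req_segs, unsigned int bio_segs,
		      unsigned int req_int_segs, unsigned int bio_int_segs)
{
	if (req_segs + bio_segs > lim->max_segments)
		return false;
	if (req_int_segs + bio_int_segs > lim->max_integrity_segments)
		return false;
	return true;
}

int main(void)
{
	struct limits lim = { .max_segments = 128, .max_integrity_segments = 1 };

	printf("%d\n", can_merge(&lim, 10, 2, 0, 1));	/* 1: fits */
	printf("%d\n", can_merge(&lim, 10, 2, 1, 1));	/* 0: integrity limit */
	return 0;
}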
diff --git a/block/blk-settings.c b/block/blk-settings.c
index a234f4bf1d6f..701859fb9647 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -111,6 +111,7 @@ EXPORT_SYMBOL_GPL(blk_queue_lld_busy);
111void blk_set_default_limits(struct queue_limits *lim) 111void blk_set_default_limits(struct queue_limits *lim)
112{ 112{
113 lim->max_segments = BLK_MAX_SEGMENTS; 113 lim->max_segments = BLK_MAX_SEGMENTS;
114 lim->max_integrity_segments = 0;
114 lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; 115 lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
115 lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; 116 lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
116 lim->max_sectors = BLK_DEF_MAX_SECTORS; 117 lim->max_sectors = BLK_DEF_MAX_SECTORS;
@@ -213,7 +214,7 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask)
213 */ 214 */
214 if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) 215 if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT))
215 dma = 1; 216 dma = 1;
216 q->limits.bounce_pfn = max_low_pfn; 217 q->limits.bounce_pfn = max(max_low_pfn, b_pfn);
217#else 218#else
218 if (b_pfn < blk_max_low_pfn) 219 if (b_pfn < blk_max_low_pfn)
219 dma = 1; 220 dma = 1;
@@ -343,7 +344,7 @@ EXPORT_SYMBOL(blk_queue_logical_block_size);
343 * hardware can operate on without reverting to read-modify-write 344 * hardware can operate on without reverting to read-modify-write
344 * operations. 345 * operations.
345 */ 346 */
346void blk_queue_physical_block_size(struct request_queue *q, unsigned short size) 347void blk_queue_physical_block_size(struct request_queue *q, unsigned int size)
347{ 348{
348 q->limits.physical_block_size = size; 349 q->limits.physical_block_size = size;
349 350
@@ -455,11 +456,6 @@ void blk_queue_io_opt(struct request_queue *q, unsigned int opt)
455} 456}
456EXPORT_SYMBOL(blk_queue_io_opt); 457EXPORT_SYMBOL(blk_queue_io_opt);
457 458
458/*
459 * Returns the minimum that is _not_ zero, unless both are zero.
460 */
461#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
462
463/** 459/**
464 * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers 460 * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers
465 * @t: the stacking driver (top) 461 * @t: the stacking driver (top)
@@ -514,6 +510,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
514 b->seg_boundary_mask); 510 b->seg_boundary_mask);
515 511
516 t->max_segments = min_not_zero(t->max_segments, b->max_segments); 512 t->max_segments = min_not_zero(t->max_segments, b->max_segments);
513 t->max_integrity_segments = min_not_zero(t->max_integrity_segments,
514 b->max_integrity_segments);
517 515
518 t->max_segment_size = min_not_zero(t->max_segment_size, 516 t->max_segment_size = min_not_zero(t->max_segment_size,
519 b->max_segment_size); 517 b->max_segment_size);
@@ -794,6 +792,26 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask)
794} 792}
795EXPORT_SYMBOL(blk_queue_update_dma_alignment); 793EXPORT_SYMBOL(blk_queue_update_dma_alignment);
796 794
795/**
796 * blk_queue_flush - configure queue's cache flush capability
797 * @q: the request queue for the device
798 * @flush: 0, REQ_FLUSH or REQ_FLUSH | REQ_FUA
799 *
800 * Tell block layer cache flush capability of @q. If it supports
801 * flushing, REQ_FLUSH should be set. If it supports bypassing
802 * write cache for individual writes, REQ_FUA should be set.
803 */
804void blk_queue_flush(struct request_queue *q, unsigned int flush)
805{
806 WARN_ON_ONCE(flush & ~(REQ_FLUSH | REQ_FUA));
807
808 if (WARN_ON_ONCE(!(flush & REQ_FLUSH) && (flush & REQ_FUA)))
809 flush &= ~REQ_FUA;
810
811 q->flush_flags = flush & (REQ_FLUSH | REQ_FUA);
812}
813EXPORT_SYMBOL_GPL(blk_queue_flush);
814
797static int __init blk_settings_init(void) 815static int __init blk_settings_init(void)
798{ 816{
799 blk_max_low_pfn = max_low_pfn - 1; 817 blk_max_low_pfn = max_low_pfn - 1;
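blk_queue_flush() sanity-checks the driver-supplied capability bits (FUA without FLUSH makes no sense and is dropped), and limit stacking now propagates max_integrity_segments with the shared min_not_zero() helper instead of the local macro removed above. A brief sketch of both rules, using illustrative flag values and a local copy of the macro:

#include <stdio.h>

#define REQ_FLUSH (1u << 0)	/* illustrative bit values */
#define REQ_FUA   (1u << 1)

/* zero means "no limit set", so the non-zero value wins */
#define min_not_zero(l, r) \
	((l) == 0 ? (r) : ((r) == 0 ? (l) : ((l) < (r) ? (l) : (r))))

/* blk_queue_flush()'s rule: FUA without FLUSH is dropped. */
static unsigned int sanitize_flush(unsigned int flush)
{
	flush &= REQ_FLUSH | REQ_FUA;
	if (!(flush & REQ_FLUSH) && (flush & REQ_FUA))
		flush &= ~REQ_FUA;
	return flush;
}

int main(void)
{
	unsigned int top_int_segs = 0, bottom_int_segs = 4;

	printf("flush=%#x\n", sanitize_flush(REQ_FUA));			/* 0 */
	printf("flush=%#x\n", sanitize_flush(REQ_FLUSH | REQ_FUA));	/* 0x3 */
	printf("stacked=%u\n", min_not_zero(top_int_segs, bottom_int_segs)); /* 4 */
	return 0;
}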
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 0749b89c6885..013457f47fdc 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -112,6 +112,11 @@ static ssize_t queue_max_segments_show(struct request_queue *q, char *page)
112 return queue_var_show(queue_max_segments(q), (page)); 112 return queue_var_show(queue_max_segments(q), (page));
113} 113}
114 114
115static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page)
116{
117 return queue_var_show(q->limits.max_integrity_segments, (page));
118}
119
115static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page) 120static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page)
116{ 121{
117 if (test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags)) 122 if (test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags))
@@ -288,6 +293,11 @@ static struct queue_sysfs_entry queue_max_segments_entry = {
288 .show = queue_max_segments_show, 293 .show = queue_max_segments_show,
289}; 294};
290 295
296static struct queue_sysfs_entry queue_max_integrity_segments_entry = {
297 .attr = {.name = "max_integrity_segments", .mode = S_IRUGO },
298 .show = queue_max_integrity_segments_show,
299};
300
291static struct queue_sysfs_entry queue_max_segment_size_entry = { 301static struct queue_sysfs_entry queue_max_segment_size_entry = {
292 .attr = {.name = "max_segment_size", .mode = S_IRUGO }, 302 .attr = {.name = "max_segment_size", .mode = S_IRUGO },
293 .show = queue_max_segment_size_show, 303 .show = queue_max_segment_size_show,
@@ -375,6 +385,7 @@ static struct attribute *default_attrs[] = {
375 &queue_max_hw_sectors_entry.attr, 385 &queue_max_hw_sectors_entry.attr,
376 &queue_max_sectors_entry.attr, 386 &queue_max_sectors_entry.attr,
377 &queue_max_segments_entry.attr, 387 &queue_max_segments_entry.attr,
388 &queue_max_integrity_segments_entry.attr,
378 &queue_max_segment_size_entry.attr, 389 &queue_max_segment_size_entry.attr,
379 &queue_iosched_entry.attr, 390 &queue_iosched_entry.attr,
380 &queue_hw_sector_size_entry.attr, 391 &queue_hw_sector_size_entry.attr,
@@ -460,6 +471,8 @@ static void blk_release_queue(struct kobject *kobj)
460 471
461 blk_sync_queue(q); 472 blk_sync_queue(q);
462 473
474 blk_throtl_exit(q);
475
463 if (rl->rq_pool) 476 if (rl->rq_pool)
464 mempool_destroy(rl->rq_pool); 477 mempool_destroy(rl->rq_pool);
465 478
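The new blk-throttle.c added below charges each bio against per-group bps and iops budgets computed over 100 ms slices; tg_with_in_bps_limit() derives how many bytes the elapsed (rounded-up) slice time allows and how long to wait when a bio does not fit. A stand-alone sketch of that arithmetic in plain C99, without the kernel's do_div()/div64_u64() helpers; the values in main() are just an example.

#include <stdio.h>

typedef unsigned long long u64;

#define HZ 1000u
static const unsigned long throtl_slice = HZ / 10;	/* 100 ms */

/*
 * Bytes allowed so far in the rounded-up elapsed slice time, and the
 * extra wait (in jiffies) before a bio of bio_size bytes fits under
 * the bps limit.
 */
static unsigned long bps_wait(u64 bps, u64 bytes_dispatched, u64 bio_size,
			      unsigned long jiffy_elapsed)
{
	unsigned long jiffy_elapsed_rnd =
		jiffy_elapsed ? jiffy_elapsed : throtl_slice;
	u64 bytes_allowed, extra_bytes;
	unsigned long wait;

	/* round elapsed time up to a whole number of slices */
	jiffy_elapsed_rnd = ((jiffy_elapsed_rnd + throtl_slice - 1) /
			     throtl_slice) * throtl_slice;

	bytes_allowed = bps * jiffy_elapsed_rnd / HZ;
	if (bytes_dispatched + bio_size <= bytes_allowed)
		return 0;				/* dispatch now */

	extra_bytes = bytes_dispatched + bio_size - bytes_allowed;
	wait = (unsigned long)(extra_bytes * HZ / bps);
	if (!wait)
		wait = 1;
	/* add back the time we rounded up by */
	return wait + (jiffy_elapsed_rnd - jiffy_elapsed);
}

int main(void)
{
	/* 1 MiB/s limit, 100 KiB already sent, 64 KiB bio, 50 ms into the slice */
	printf("wait %lu jiffies\n",
	       bps_wait(1024 * 1024, 100 * 1024, 64 * 1024, 50));
	return 0;
}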
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
new file mode 100644
index 000000000000..56ad4531b412
--- /dev/null
+++ b/block/blk-throttle.c
@@ -0,0 +1,1123 @@
1/*
2 * Interface for controlling IO bandwidth on a request queue
3 *
4 * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
5 */
6
7#include <linux/module.h>
8#include <linux/slab.h>
9#include <linux/blkdev.h>
10#include <linux/bio.h>
11#include <linux/blktrace_api.h>
12#include "blk-cgroup.h"
13
14/* Max dispatch from a group in 1 round */
15static int throtl_grp_quantum = 8;
16
17/* Total max dispatch from all groups in one round */
18static int throtl_quantum = 32;
19
20/* Throttling is performed over 100ms slice and after that slice is renewed */
21static unsigned long throtl_slice = HZ/10; /* 100 ms */
22
23struct throtl_rb_root {
24 struct rb_root rb;
25 struct rb_node *left;
26 unsigned int count;
27 unsigned long min_disptime;
28};
29
30#define THROTL_RB_ROOT (struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \
31 .count = 0, .min_disptime = 0}
32
33#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
34
35struct throtl_grp {
36 /* List of throtl groups on the request queue*/
37 struct hlist_node tg_node;
38
39 /* active throtl group service_tree member */
40 struct rb_node rb_node;
41
42 /*
43 * Dispatch time in jiffies. This is the estimated time when group
44 * will unthrottle and is ready to dispatch more bio. It is used as
45 * key to sort active groups in service tree.
46 */
47 unsigned long disptime;
48
49 struct blkio_group blkg;
50 atomic_t ref;
51 unsigned int flags;
52
53 /* Two lists for READ and WRITE */
54 struct bio_list bio_lists[2];
55
56 /* Number of queued bios on READ and WRITE lists */
57 unsigned int nr_queued[2];
58
59 /* bytes per second rate limits */
60 uint64_t bps[2];
61
62 /* IOPS limits */
63 unsigned int iops[2];
64
65 /* Number of bytes dispatched in current slice */
66 uint64_t bytes_disp[2];
67 /* Number of bio's dispatched in current slice */
68 unsigned int io_disp[2];
69
70 /* When did we start a new slice */
71 unsigned long slice_start[2];
72 unsigned long slice_end[2];
73
74 /* Some throttle limits got updated for the group */
75 bool limits_changed;
76};
77
78struct throtl_data
79{
80 /* List of throtl groups */
81 struct hlist_head tg_list;
82
83 /* service tree for active throtl groups */
84 struct throtl_rb_root tg_service_tree;
85
86 struct throtl_grp root_tg;
87 struct request_queue *queue;
88
89 /* Total Number of queued bios on READ and WRITE lists */
90 unsigned int nr_queued[2];
91
92 /*
93 * number of total undestroyed groups
94 */
95 unsigned int nr_undestroyed_grps;
96
97 /* Work for dispatching throttled bios */
98 struct delayed_work throtl_work;
99
100 atomic_t limits_changed;
101};
102
103enum tg_state_flags {
104 THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */
105};
106
107#define THROTL_TG_FNS(name) \
108static inline void throtl_mark_tg_##name(struct throtl_grp *tg) \
109{ \
110 (tg)->flags |= (1 << THROTL_TG_FLAG_##name); \
111} \
112static inline void throtl_clear_tg_##name(struct throtl_grp *tg) \
113{ \
114 (tg)->flags &= ~(1 << THROTL_TG_FLAG_##name); \
115} \
116static inline int throtl_tg_##name(const struct throtl_grp *tg) \
117{ \
118 return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0; \
119}
120
121THROTL_TG_FNS(on_rr);
122
123#define throtl_log_tg(td, tg, fmt, args...) \
124 blk_add_trace_msg((td)->queue, "throtl %s " fmt, \
125 blkg_path(&(tg)->blkg), ##args); \
126
127#define throtl_log(td, fmt, args...) \
128 blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
129
130static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg)
131{
132 if (blkg)
133 return container_of(blkg, struct throtl_grp, blkg);
134
135 return NULL;
136}
137
138static inline int total_nr_queued(struct throtl_data *td)
139{
140 return (td->nr_queued[0] + td->nr_queued[1]);
141}
142
143static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg)
144{
145 atomic_inc(&tg->ref);
146 return tg;
147}
148
149static void throtl_put_tg(struct throtl_grp *tg)
150{
151 BUG_ON(atomic_read(&tg->ref) <= 0);
152 if (!atomic_dec_and_test(&tg->ref))
153 return;
154 kfree(tg);
155}
156
157static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td,
158 struct cgroup *cgroup)
159{
160 struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
161 struct throtl_grp *tg = NULL;
162 void *key = td;
163 struct backing_dev_info *bdi = &td->queue->backing_dev_info;
164 unsigned int major, minor;
165
166 /*
167 * TODO: Speed up blkiocg_lookup_group() by maintaining a radix
168 * tree of blkg (instead of traversing through hash list all
169 * the time.
170 */
171 tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
172
173 /* Fill in device details for root group */
174 if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
175 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
176 tg->blkg.dev = MKDEV(major, minor);
177 goto done;
178 }
179
180 if (tg)
181 goto done;
182
183 tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
184 if (!tg)
185 goto done;
186
187 INIT_HLIST_NODE(&tg->tg_node);
188 RB_CLEAR_NODE(&tg->rb_node);
189 bio_list_init(&tg->bio_lists[0]);
190 bio_list_init(&tg->bio_lists[1]);
191
192 /*
193 * Take the initial reference that will be released on destroy.
194 * This can be thought of as a joint reference held by the cgroup and
195 * the request queue, which will be dropped by either the request
196 * queue exit or the cgroup deletion path, depending on who exits first.
197 */
198 atomic_set(&tg->ref, 1);
199
200 /* Add group onto cgroup list */
201 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
202 blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td,
203 MKDEV(major, minor), BLKIO_POLICY_THROTL);
204
205 tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
206 tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev);
207 tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev);
208 tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev);
209
210 hlist_add_head(&tg->tg_node, &td->tg_list);
211 td->nr_undestroyed_grps++;
212done:
213 return tg;
214}
215
216static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
217{
218 struct cgroup *cgroup;
219 struct throtl_grp *tg = NULL;
220
221 rcu_read_lock();
222 cgroup = task_cgroup(current, blkio_subsys_id);
223 tg = throtl_find_alloc_tg(td, cgroup);
224 if (!tg)
225 tg = &td->root_tg;
226 rcu_read_unlock();
227 return tg;
228}
229
230static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root)
231{
232 /* Service tree is empty */
233 if (!root->count)
234 return NULL;
235
236 if (!root->left)
237 root->left = rb_first(&root->rb);
238
239 if (root->left)
240 return rb_entry_tg(root->left);
241
242 return NULL;
243}
244
245static void rb_erase_init(struct rb_node *n, struct rb_root *root)
246{
247 rb_erase(n, root);
248 RB_CLEAR_NODE(n);
249}
250
251static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root)
252{
253 if (root->left == n)
254 root->left = NULL;
255 rb_erase_init(n, &root->rb);
256 --root->count;
257}
258
259static void update_min_dispatch_time(struct throtl_rb_root *st)
260{
261 struct throtl_grp *tg;
262
263 tg = throtl_rb_first(st);
264 if (!tg)
265 return;
266
267 st->min_disptime = tg->disptime;
268}
269
270static void
271tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
272{
273 struct rb_node **node = &st->rb.rb_node;
274 struct rb_node *parent = NULL;
275 struct throtl_grp *__tg;
276 unsigned long key = tg->disptime;
277 int left = 1;
278
279 while (*node != NULL) {
280 parent = *node;
281 __tg = rb_entry_tg(parent);
282
283 if (time_before(key, __tg->disptime))
284 node = &parent->rb_left;
285 else {
286 node = &parent->rb_right;
287 left = 0;
288 }
289 }
290
291 if (left)
292 st->left = &tg->rb_node;
293
294 rb_link_node(&tg->rb_node, parent, node);
295 rb_insert_color(&tg->rb_node, &st->rb);
296}
297
298static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
299{
300 struct throtl_rb_root *st = &td->tg_service_tree;
301
302 tg_service_tree_add(st, tg);
303 throtl_mark_tg_on_rr(tg);
304 st->count++;
305}
306
307static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
308{
309 if (!throtl_tg_on_rr(tg))
310 __throtl_enqueue_tg(td, tg);
311}
312
313static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
314{
315 throtl_rb_erase(&tg->rb_node, &td->tg_service_tree);
316 throtl_clear_tg_on_rr(tg);
317}
318
319static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
320{
321 if (throtl_tg_on_rr(tg))
322 __throtl_dequeue_tg(td, tg);
323}
324
325static void throtl_schedule_next_dispatch(struct throtl_data *td)
326{
327 struct throtl_rb_root *st = &td->tg_service_tree;
328
329 /*
330 * If there are more bios pending, schedule more work.
331 */
332 if (!total_nr_queued(td))
333 return;
334
335 BUG_ON(!st->count);
336
337 update_min_dispatch_time(st);
338
339 if (time_before_eq(st->min_disptime, jiffies))
340 throtl_schedule_delayed_work(td->queue, 0);
341 else
342 throtl_schedule_delayed_work(td->queue,
343 (st->min_disptime - jiffies));
344}
345
346static inline void
347throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
348{
349 tg->bytes_disp[rw] = 0;
350 tg->io_disp[rw] = 0;
351 tg->slice_start[rw] = jiffies;
352 tg->slice_end[rw] = jiffies + throtl_slice;
353 throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
354 rw == READ ? 'R' : 'W', tg->slice_start[rw],
355 tg->slice_end[rw], jiffies);
356}
357
358static inline void throtl_extend_slice(struct throtl_data *td,
359 struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
360{
361 tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
362 throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
363 rw == READ ? 'R' : 'W', tg->slice_start[rw],
364 tg->slice_end[rw], jiffies);
365}
366
367/* Determine if previously allocated or extended slice is complete or not */
368static bool
369throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
370{
371 if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
372 return 0;
373
374 return 1;
375}
376
377/* Trim the used slices and adjust slice start accordingly */
378static inline void
379throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
380{
381 unsigned long nr_slices, time_elapsed, io_trim;
382 u64 bytes_trim, tmp;
383
384 BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));
385
386 /*
387 * If bps is unlimited (-1), then the time slice doesn't get
388 * renewed. Don't try to trim a slice that has already been used up.
389 * A new slice will start when appropriate.
390 */
391 if (throtl_slice_used(td, tg, rw))
392 return;
393
394 time_elapsed = jiffies - tg->slice_start[rw];
395
396 nr_slices = time_elapsed / throtl_slice;
397
398 if (!nr_slices)
399 return;
400 tmp = tg->bps[rw] * throtl_slice * nr_slices;
401 do_div(tmp, HZ);
402 bytes_trim = tmp;
403
404 io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;
405
406 if (!bytes_trim && !io_trim)
407 return;
408
409 if (tg->bytes_disp[rw] >= bytes_trim)
410 tg->bytes_disp[rw] -= bytes_trim;
411 else
412 tg->bytes_disp[rw] = 0;
413
414 if (tg->io_disp[rw] >= io_trim)
415 tg->io_disp[rw] -= io_trim;
416 else
417 tg->io_disp[rw] = 0;
418
419 tg->slice_start[rw] += nr_slices * throtl_slice;
420
421 throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu"
422 " start=%lu end=%lu jiffies=%lu",
423 rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
424 tg->slice_start[rw], tg->slice_end[rw], jiffies);
425}
426
427static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
428 struct bio *bio, unsigned long *wait)
429{
430 bool rw = bio_data_dir(bio);
431 unsigned int io_allowed;
432 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
433 u64 tmp;
434
435 jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
436
437 /* Slice has just started. Consider one slice interval */
438 if (!jiffy_elapsed)
439 jiffy_elapsed_rnd = throtl_slice;
440
441 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
442
443 /*
444 * jiffy_elapsed_rnd should not be a big value: the minimum iops can
445 * be 1, in which case the elapsed jiffies are at most the equivalent
446 * of 1 second, since we allow dispatch after 1 second and the slice
447 * should have been trimmed by then.
448 */
449
450 tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd;
451 do_div(tmp, HZ);
452
453 if (tmp > UINT_MAX)
454 io_allowed = UINT_MAX;
455 else
456 io_allowed = tmp;
457
458 if (tg->io_disp[rw] + 1 <= io_allowed) {
459 if (wait)
460 *wait = 0;
461 return 1;
462 }
463
464 /* Calc approx time to dispatch */
465 jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1;
466
467 if (jiffy_wait > jiffy_elapsed)
468 jiffy_wait = jiffy_wait - jiffy_elapsed;
469 else
470 jiffy_wait = 1;
471
472 if (wait)
473 *wait = jiffy_wait;
474 return 0;
475}
476
477static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
478 struct bio *bio, unsigned long *wait)
479{
480 bool rw = bio_data_dir(bio);
481 u64 bytes_allowed, extra_bytes, tmp;
482 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
483
484 jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
485
486 /* Slice has just started. Consider one slice interval */
487 if (!jiffy_elapsed)
488 jiffy_elapsed_rnd = throtl_slice;
489
490 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
491
492 tmp = tg->bps[rw] * jiffy_elapsed_rnd;
493 do_div(tmp, HZ);
494 bytes_allowed = tmp;
495
496 if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) {
497 if (wait)
498 *wait = 0;
499 return 1;
500 }
501
502 /* Calc approx time to dispatch */
503 extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed;
504 jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);
505
506 if (!jiffy_wait)
507 jiffy_wait = 1;
508
509 /*
510 * This wait time is without taking into consideration the rounding
511 * up we did. Add that time also.
512 */
513 jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
514 if (wait)
515 *wait = jiffy_wait;
516 return 0;
517}
518
519/*
520 * Returns whether a bio can be dispatched now. Also returns the approximate
521 * number of jiffies to wait before this bio is within the IO rate and can be dispatched
522 */
523static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
524 struct bio *bio, unsigned long *wait)
525{
526 bool rw = bio_data_dir(bio);
527 unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
528
529 /*
530 * Currently the whole state machine of the group depends on the first
531 * bio queued in the group's bio list. So one should not call
532 * this function with a different bio if there are other bios
533 * queued.
534 */
535 BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw]));
536
537 /* If tg->bps = -1, then BW is unlimited */
538 if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
539 if (wait)
540 *wait = 0;
541 return 1;
542 }
543
544 /*
545 * If previous slice expired, start a new one otherwise renew/extend
546 * existing slice to make sure it is at least throtl_slice interval
547 * long since now.
548 */
549 if (throtl_slice_used(td, tg, rw))
550 throtl_start_new_slice(td, tg, rw);
551 else {
552 if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
553 throtl_extend_slice(td, tg, rw, jiffies + throtl_slice);
554 }
555
556 if (tg_with_in_bps_limit(td, tg, bio, &bps_wait)
557 && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) {
558 if (wait)
559 *wait = 0;
560 return 1;
561 }
562
563 max_wait = max(bps_wait, iops_wait);
564
565 if (wait)
566 *wait = max_wait;
567
568 if (time_before(tg->slice_end[rw], jiffies + max_wait))
569 throtl_extend_slice(td, tg, rw, jiffies + max_wait);
570
571 return 0;
572}
573
574static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
575{
576 bool rw = bio_data_dir(bio);
577 bool sync = bio->bi_rw & REQ_SYNC;
578
579 /* Charge the bio to the group */
580 tg->bytes_disp[rw] += bio->bi_size;
581 tg->io_disp[rw]++;
582
583 /*
584 * TODO: This will take blkg->stats_lock. Figure out a way
585 * to avoid this cost.
586 */
587 blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync);
588}
589
590static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
591 struct bio *bio)
592{
593 bool rw = bio_data_dir(bio);
594
595 bio_list_add(&tg->bio_lists[rw], bio);
596 /* Take a bio reference on tg */
597 throtl_ref_get_tg(tg);
598 tg->nr_queued[rw]++;
599 td->nr_queued[rw]++;
600 throtl_enqueue_tg(td, tg);
601}
602
603static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg)
604{
605 unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
606 struct bio *bio;
607
608 if ((bio = bio_list_peek(&tg->bio_lists[READ])))
609 tg_may_dispatch(td, tg, bio, &read_wait);
610
611 if ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
612 tg_may_dispatch(td, tg, bio, &write_wait);
613
614 min_wait = min(read_wait, write_wait);
615 disptime = jiffies + min_wait;
616
617 /* Update dispatch time */
618 throtl_dequeue_tg(td, tg);
619 tg->disptime = disptime;
620 throtl_enqueue_tg(td, tg);
621}
622
623static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
624 bool rw, struct bio_list *bl)
625{
626 struct bio *bio;
627
628 bio = bio_list_pop(&tg->bio_lists[rw]);
629 tg->nr_queued[rw]--;
630 /* Drop bio reference on tg */
631 throtl_put_tg(tg);
632
633 BUG_ON(td->nr_queued[rw] <= 0);
634 td->nr_queued[rw]--;
635
636 throtl_charge_bio(tg, bio);
637 bio_list_add(bl, bio);
638 bio->bi_rw |= REQ_THROTTLED;
639
640 throtl_trim_slice(td, tg, rw);
641}
642
643static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
644 struct bio_list *bl)
645{
646 unsigned int nr_reads = 0, nr_writes = 0;
647 unsigned int max_nr_reads = throtl_grp_quantum*3/4;
648 unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
649 struct bio *bio;
650
651 /* Try to dispatch 75% READS and 25% WRITES */
652
653 while ((bio = bio_list_peek(&tg->bio_lists[READ]))
654 && tg_may_dispatch(td, tg, bio, NULL)) {
655
656 tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
657 nr_reads++;
658
659 if (nr_reads >= max_nr_reads)
660 break;
661 }
662
663 while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))
664 && tg_may_dispatch(td, tg, bio, NULL)) {
665
666 tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
667 nr_writes++;
668
669 if (nr_writes >= max_nr_writes)
670 break;
671 }
672
673 return nr_reads + nr_writes;
674}
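
For reference, with the per-group quantum defined earlier in this file (assumed here to be 8), the 75/25 split comes out to 6 reads and 2 writes per pass; this is also why the write cap above should be derived from max_nr_reads rather than nr_reads, which is still zero at that point. A tiny worked example:

#include <stdio.h>

int main(void)
{
	unsigned int quantum = 8;			/* assumed throtl_grp_quantum */
	unsigned int max_reads = quantum * 3 / 4;	/* 6 */
	unsigned int max_writes = quantum - max_reads;	/* 2 */

	printf("reads=%u writes=%u per pass\n", max_reads, max_writes);
	return 0;
}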
675
676static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
677{
678 unsigned int nr_disp = 0;
679 struct throtl_grp *tg;
680 struct throtl_rb_root *st = &td->tg_service_tree;
681
682 while (1) {
683 tg = throtl_rb_first(st);
684
685 if (!tg)
686 break;
687
688 if (time_before(jiffies, tg->disptime))
689 break;
690
691 throtl_dequeue_tg(td, tg);
692
693 nr_disp += throtl_dispatch_tg(td, tg, bl);
694
695 if (tg->nr_queued[0] || tg->nr_queued[1]) {
696 tg_update_disptime(td, tg);
697 throtl_enqueue_tg(td, tg);
698 }
699
700 if (nr_disp >= throtl_quantum)
701 break;
702 }
703
704 return nr_disp;
705}
706
707static void throtl_process_limit_change(struct throtl_data *td)
708{
709 struct throtl_grp *tg;
710 struct hlist_node *pos, *n;
711
712 /*
713 * Make sure the atomic_inc() done by the
714 * throtl_update_blkio_group_*() family of functions is
715 * visible here.
716 * TODO: is this smp_rmb() required, or is the smp_mb__after_atomic_inc()
717 * following their atomic_inc() already sufficient?
718 */
719 smp_rmb();
720 if (!atomic_read(&td->limits_changed))
721 return;
722
723 throtl_log(td, "limit changed =%d", atomic_read(&td->limits_changed));
724
725 hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
726 /*
727 * Do we need an smp_rmb() here to make sure the tg->limits_changed
728 * update is visible? We rely on the smp_rmb() at the beginning of
729 * this function and do not add another one here.
730 */
731
732 if (throtl_tg_on_rr(tg) && tg->limits_changed) {
733 throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
734 " riops=%u wiops=%u", tg->bps[READ],
735 tg->bps[WRITE], tg->iops[READ],
736 tg->iops[WRITE]);
737 tg_update_disptime(td, tg);
738 tg->limits_changed = false;
739 }
740 }
741
742 smp_mb__before_atomic_dec();
743 atomic_dec(&td->limits_changed);
744 smp_mb__after_atomic_dec();
745}
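
The limits_changed handshake this function consumes works like this: the update side stores the new limit, sets tg->limits_changed behind a write barrier, and bumps an atomic counter; the dispatch side reads the counter, issues a read barrier, and scans the groups. A rough userspace analogue of that ordering, using C11 atomics as stand-ins for atomic_t and the smp_*mb() primitives (illustration only, not the kernel API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_GROUPS 4

struct group {
	unsigned long long bps;
	_Atomic bool limits_changed;	/* stands in for tg->limits_changed */
};

static struct group groups[NR_GROUPS];
static atomic_int limits_changed;	/* stands in for td->limits_changed */

/* Update side: publish the new limit before raising the flag and counter. */
static void update_bps(struct group *g, unsigned long long bps)
{
	g->bps = bps;
	/* release ordering plays the role of smp_wmb() and
	 * smp_mb__before_atomic_inc() here */
	atomic_store_explicit(&g->limits_changed, true, memory_order_release);
	atomic_fetch_add_explicit(&limits_changed, 1, memory_order_release);
}

/* Dispatch side: the acquire load pairs with the release stores above,
 * so once the counter is seen, the new bps values are visible too. */
static void process_limit_change(void)
{
	if (atomic_load_explicit(&limits_changed, memory_order_acquire) == 0)
		return;

	for (int i = 0; i < NR_GROUPS; i++) {
		if (atomic_exchange_explicit(&groups[i].limits_changed, false,
					     memory_order_acq_rel))
			printf("group %d: new bps %llu\n", i, groups[i].bps);
	}
	atomic_fetch_sub_explicit(&limits_changed, 1, memory_order_release);
}

int main(void)
{
	update_bps(&groups[2], 1048576);
	process_limit_change();
	return 0;
}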
746
747/* Dispatch throttled bios. Should be called without queue lock held. */
748static int throtl_dispatch(struct request_queue *q)
749{
750 struct throtl_data *td = q->td;
751 unsigned int nr_disp = 0;
752 struct bio_list bio_list_on_stack;
753 struct bio *bio;
754
755 spin_lock_irq(q->queue_lock);
756
757 throtl_process_limit_change(td);
758
759 if (!total_nr_queued(td))
760 goto out;
761
762 bio_list_init(&bio_list_on_stack);
763
764 throtl_log(td, "dispatch nr_queued=%lu read=%u write=%u",
765 total_nr_queued(td), td->nr_queued[READ],
766 td->nr_queued[WRITE]);
767
768 nr_disp = throtl_select_dispatch(td, &bio_list_on_stack);
769
770 if (nr_disp)
771 throtl_log(td, "bios disp=%u", nr_disp);
772
773 throtl_schedule_next_dispatch(td);
774out:
775 spin_unlock_irq(q->queue_lock);
776
777 /*
778 * If we dispatched some bios, submit them and unplug the queue to
779 * make sure they are dispatched immediately.
780 */
781 if (nr_disp) {
782 while ((bio = bio_list_pop(&bio_list_on_stack)))
783 generic_make_request(bio);
784 blk_unplug(q);
785 }
786 return nr_disp;
787}
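
throtl_dispatch() gathers ready bios into an on-stack list while holding the queue lock and only calls generic_make_request() after dropping it. The underlying "drain under a lock into a private list, then do the slow work unlocked" pattern, as a small standalone sketch (a pthread mutex and a toy linked list stand in for the queue lock and bio lists; not the kernel code):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct item {
	int id;
	struct item *next;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct item *queued;		/* protected by lock */

static void enqueue(int id)
{
	struct item *it = malloc(sizeof(*it));

	it->id = id;
	pthread_mutex_lock(&lock);
	it->next = queued;
	queued = it;
	pthread_mutex_unlock(&lock);
}

/* Move everything to a private list under the lock, then do the slow
 * per-item work (here just a printf) with the lock dropped. */
static void dispatch(void)
{
	struct item *local, *it;

	pthread_mutex_lock(&lock);
	local = queued;
	queued = NULL;
	pthread_mutex_unlock(&lock);

	while ((it = local)) {
		local = it->next;
		printf("dispatching item %d outside the lock\n", it->id);
		free(it);
	}
}

int main(void)
{
	enqueue(1);
	enqueue(2);
	dispatch();
	return 0;
}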
788
789void blk_throtl_work(struct work_struct *work)
790{
791 struct throtl_data *td = container_of(work, struct throtl_data,
792 throtl_work.work);
793 struct request_queue *q = td->queue;
794
795 throtl_dispatch(q);
796}
797
798/* Call with queue lock held */
799void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay)
800{
801
802 struct throtl_data *td = q->td;
803 struct delayed_work *dwork = &td->throtl_work;
804
805 if (total_nr_queued(td) > 0) {
806 /*
807 * We might have a work scheduled to be executed in future.
808 * Cancel that and schedule a new one.
809 */
810 __cancel_delayed_work(dwork);
811 kblockd_schedule_delayed_work(q, dwork, delay);
812 throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
813 delay, jiffies);
814 }
815}
816EXPORT_SYMBOL(throtl_schedule_delayed_work);
817
818static void
819throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg)
820{
821 /* Something is wrong if we are trying to remove the same group twice */
822 BUG_ON(hlist_unhashed(&tg->tg_node));
823
824 hlist_del_init(&tg->tg_node);
825
826 /*
827 * Put the reference taken at the time of creation so that when all
828 * queues are gone, group can be destroyed.
829 */
830 throtl_put_tg(tg);
831 td->nr_undestroyed_grps--;
832}
833
834static void throtl_release_tgs(struct throtl_data *td)
835{
836 struct hlist_node *pos, *n;
837 struct throtl_grp *tg;
838
839 hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
840 /*
841 * If the cgroup removal path got to the blkio_group first and removed
842 * it from the cgroup list, then it will also take care of destroying
843 * the throtl_grp.
844 */
845 if (!blkiocg_del_blkio_group(&tg->blkg))
846 throtl_destroy_tg(td, tg);
847 }
848}
849
850static void throtl_td_free(struct throtl_data *td)
851{
852 kfree(td);
853}
854
855/*
856 * Blk cgroup controller notification saying that the blkio_group object is
857 * being delinked as the associated cgroup object is going away. That also
858 * means that no new IO will come in for this group. So get rid of this group
859 * as soon as any pending IO in the group has finished.
860 *
861 * This function is called under rcu_read_lock(). key is the rcu-protected
862 * pointer. That means "key" is a valid throtl_data pointer as long as we
863 * hold the rcu read lock.
864 *
865 * "key" was fetched from the blkio_group under blkio_cgroup->lock. That means
866 * it should not be NULL: even if the queue was going away, the cgroup
867 * deletion path got to it first.
868 */
869void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg)
870{
871 unsigned long flags;
872 struct throtl_data *td = key;
873
874 spin_lock_irqsave(td->queue->queue_lock, flags);
875 throtl_destroy_tg(td, tg_of_blkg(blkg));
876 spin_unlock_irqrestore(td->queue->queue_lock, flags);
877}
878
879/*
880 * For all update functions, key should be a valid pointer because these
881 * update functions are called under blkcg_lock; that means blkg is valid
882 * and in turn key is valid. The queue exit path cannot race because of
883 * blkcg_lock.
884 *
885 * We cannot take the queue lock in the update functions, as taking the queue
886 * lock under blkcg_lock is not allowed. Other paths take blkcg_lock under queue_lock.
887 */
888static void throtl_update_blkio_group_read_bps(void *key,
889 struct blkio_group *blkg, u64 read_bps)
890{
891 struct throtl_data *td = key;
892
893 tg_of_blkg(blkg)->bps[READ] = read_bps;
894 /* Make sure read_bps is updated before setting limits_changed */
895 smp_wmb();
896 tg_of_blkg(blkg)->limits_changed = true;
897
898 /* Make sure tg->limits_changed is updated before td->limits_changed */
899 smp_mb__before_atomic_inc();
900 atomic_inc(&td->limits_changed);
901 smp_mb__after_atomic_inc();
902
903 /* Schedule a work now to process the limit change */
904 throtl_schedule_delayed_work(td->queue, 0);
905}
906
907static void throtl_update_blkio_group_write_bps(void *key,
908 struct blkio_group *blkg, u64 write_bps)
909{
910 struct throtl_data *td = key;
911
912 tg_of_blkg(blkg)->bps[WRITE] = write_bps;
913 smp_wmb();
914 tg_of_blkg(blkg)->limits_changed = true;
915 smp_mb__before_atomic_inc();
916 atomic_inc(&td->limits_changed);
917 smp_mb__after_atomic_inc();
918 throtl_schedule_delayed_work(td->queue, 0);
919}
920
921static void throtl_update_blkio_group_read_iops(void *key,
922 struct blkio_group *blkg, unsigned int read_iops)
923{
924 struct throtl_data *td = key;
925
926 tg_of_blkg(blkg)->iops[READ] = read_iops;
927 smp_wmb();
928 tg_of_blkg(blkg)->limits_changed = true;
929 smp_mb__before_atomic_inc();
930 atomic_inc(&td->limits_changed);
931 smp_mb__after_atomic_inc();
932 throtl_schedule_delayed_work(td->queue, 0);
933}
934
935static void throtl_update_blkio_group_write_iops(void *key,
936 struct blkio_group *blkg, unsigned int write_iops)
937{
938 struct throtl_data *td = key;
939
940 tg_of_blkg(blkg)->iops[WRITE] = write_iops;
941 smp_wmb();
942 tg_of_blkg(blkg)->limits_changed = true;
943 smp_mb__before_atomic_inc();
944 atomic_inc(&td->limits_changed);
945 smp_mb__after_atomic_inc();
946 throtl_schedule_delayed_work(td->queue, 0);
947}
948
949void throtl_shutdown_timer_wq(struct request_queue *q)
950{
951 struct throtl_data *td = q->td;
952
953 cancel_delayed_work_sync(&td->throtl_work);
954}
955
956static struct blkio_policy_type blkio_policy_throtl = {
957 .ops = {
958 .blkio_unlink_group_fn = throtl_unlink_blkio_group,
959 .blkio_update_group_read_bps_fn =
960 throtl_update_blkio_group_read_bps,
961 .blkio_update_group_write_bps_fn =
962 throtl_update_blkio_group_write_bps,
963 .blkio_update_group_read_iops_fn =
964 throtl_update_blkio_group_read_iops,
965 .blkio_update_group_write_iops_fn =
966 throtl_update_blkio_group_write_iops,
967 },
968 .plid = BLKIO_POLICY_THROTL,
969};
970
971int blk_throtl_bio(struct request_queue *q, struct bio **biop)
972{
973 struct throtl_data *td = q->td;
974 struct throtl_grp *tg;
975 struct bio *bio = *biop;
976 bool rw = bio_data_dir(bio), update_disptime = true;
977
978 if (bio->bi_rw & REQ_THROTTLED) {
979 bio->bi_rw &= ~REQ_THROTTLED;
980 return 0;
981 }
982
983 spin_lock_irq(q->queue_lock);
984 tg = throtl_get_tg(td);
985
986 if (tg->nr_queued[rw]) {
987 /*
988 * There is already another bio queued in the same direction.
989 * No need to update the dispatch time.
990 * Still update disptime if the rate limits on this group
991 * were changed.
992 */
993 if (!tg->limits_changed)
994 update_disptime = false;
995 else
996 tg->limits_changed = false;
997
998 goto queue_bio;
999 }
1000
1001 /* Bio is within the rate limit of the group */
1002 if (tg_may_dispatch(td, tg, bio, NULL)) {
1003 throtl_charge_bio(tg, bio);
1004 goto out;
1005 }
1006
1007queue_bio:
1008 throtl_log_tg(td, tg, "[%c] bio. bdisp=%u sz=%u bps=%llu"
1009 " iodisp=%u iops=%u queued=%d/%d",
1010 rw == READ ? 'R' : 'W',
1011 tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
1012 tg->io_disp[rw], tg->iops[rw],
1013 tg->nr_queued[READ], tg->nr_queued[WRITE]);
1014
1015 throtl_add_bio_tg(q->td, tg, bio);
1016 *biop = NULL;
1017
1018 if (update_disptime) {
1019 tg_update_disptime(td, tg);
1020 throtl_schedule_next_dispatch(td);
1021 }
1022
1023out:
1024 spin_unlock_irq(q->queue_lock);
1025 return 0;
1026}
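
The struct bio **biop parameter lets the hook consume the bio: when it is queued for later dispatch, *biop is set to NULL and the caller must stop touching it. A toy caller that follows the same convention (everything here, including the type contents, toy_throtl_bio() and submit(), is invented for illustration; only the *biop == NULL rule comes from the function above):

#include <stdio.h>
#include <stdlib.h>

/* Toy stand-ins for the kernel types. */
struct bio { int sector; };
struct request_queue { int dummy; };

/* Pretend throttle hook: consume every other bio. */
static int toy_throtl_bio(struct request_queue *q, struct bio **biop)
{
	static int n;

	(void)q;
	if (n++ % 2) {
		printf("queued bio for sector %d\n", (*biop)->sector);
		free(*biop);
		*biop = NULL;		/* caller must not touch it anymore */
	}
	return 0;
}

static void submit(struct request_queue *q, struct bio *bio)
{
	if (toy_throtl_bio(q, &bio))
		return;
	if (!bio)
		return;			/* consumed by the throttle layer */
	printf("dispatching bio for sector %d\n", bio->sector);
	free(bio);
}

int main(void)
{
	struct request_queue q = { 0 };

	for (int i = 0; i < 4; i++) {
		struct bio *b = malloc(sizeof(*b));

		b->sector = i;
		submit(&q, b);
	}
	return 0;
}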
1027
1028int blk_throtl_init(struct request_queue *q)
1029{
1030 struct throtl_data *td;
1031 struct throtl_grp *tg;
1032
1033 td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
1034 if (!td)
1035 return -ENOMEM;
1036
1037 INIT_HLIST_HEAD(&td->tg_list);
1038 td->tg_service_tree = THROTL_RB_ROOT;
1039 atomic_set(&td->limits_changed, 0);
1040
1041 /* Init root group */
1042 tg = &td->root_tg;
1043 INIT_HLIST_NODE(&tg->tg_node);
1044 RB_CLEAR_NODE(&tg->rb_node);
1045 bio_list_init(&tg->bio_lists[0]);
1046 bio_list_init(&tg->bio_lists[1]);
1047
1048 /* Practically unlimited BW */
1049 tg->bps[0] = tg->bps[1] = -1;
1050 tg->iops[0] = tg->iops[1] = -1;
1051
1052 /*
1053 * Set the root group reference count to 2. One reference will be dropped
1054 * when all groups on tg_list are deleted during queue exit. The other
1055 * reference remains, as we don't want to delete this group: it is
1056 * statically allocated and is destroyed when throtl_data
1057 * goes away.
1058 */
1059 atomic_set(&tg->ref, 2);
1060 hlist_add_head(&tg->tg_node, &td->tg_list);
1061 td->nr_undestroyed_grps++;
1062
1063 INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
1064
1065 rcu_read_lock();
1066 blkiocg_add_blkio_group(&blkio_root_cgroup, &tg->blkg, (void *)td,
1067 0, BLKIO_POLICY_THROTL);
1068 rcu_read_unlock();
1069
1070 /* Attach throtl data to request queue */
1071 td->queue = q;
1072 q->td = td;
1073 return 0;
1074}
1075
1076void blk_throtl_exit(struct request_queue *q)
1077{
1078 struct throtl_data *td = q->td;
1079 bool wait = false;
1080
1081 BUG_ON(!td);
1082
1083 throtl_shutdown_timer_wq(q);
1084
1085 spin_lock_irq(q->queue_lock);
1086 throtl_release_tgs(td);
1087
1088 /* If there are undestroyed groups other than the root, we must wait */
1089 if (td->nr_undestroyed_grps > 0)
1090 wait = true;
1091
1092 spin_unlock_irq(q->queue_lock);
1093
1094 /*
1095 * Wait for tg->blkg->key accessors to exit their grace periods.
1096 * Do this wait only if there are other undestroyed groups out
1097 * there (other than root group). This can happen if cgroup deletion
1098 * path claimed the responsibility of cleaning up a group before
1099 * the queue cleanup code got to the group.
1100 *
1101 * Do not call synchronize_rcu() unconditionally as there are drivers
1102 * which create/delete request queue hundreds of times during scan/boot
1103 * and synchronize_rcu() can take significant time and slow down boot.
1104 */
1105 if (wait)
1106 synchronize_rcu();
1107
1108 /*
1109 * Just to be safe: if somebody updated the limits through the cgroup
1110 * interface after the previous flush and another work item got queued,
1111 * cancel it.
1112 */
1113 throtl_shutdown_timer_wq(q);
1114 throtl_td_free(td);
1115}
1116
1117static int __init throtl_init(void)
1118{
1119 blkio_policy_register(&blkio_policy_throtl);
1120 return 0;
1121}
1122
1123module_init(throtl_init);
diff --git a/block/blk.h b/block/blk.h
index d6b911ac002c..2db8f32838e7 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -51,6 +51,8 @@ static inline void blk_clear_rq_complete(struct request *rq)
51 */ 51 */
52#define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) 52#define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash))
53 53
54struct request *blk_do_flush(struct request_queue *q, struct request *rq);
55
54static inline struct request *__elv_next_request(struct request_queue *q) 56static inline struct request *__elv_next_request(struct request_queue *q)
55{ 57{
56 struct request *rq; 58 struct request *rq;
@@ -58,7 +60,11 @@ static inline struct request *__elv_next_request(struct request_queue *q)
58 while (1) { 60 while (1) {
59 while (!list_empty(&q->queue_head)) { 61 while (!list_empty(&q->queue_head)) {
60 rq = list_entry_rq(q->queue_head.next); 62 rq = list_entry_rq(q->queue_head.next);
61 if (blk_do_ordered(q, &rq)) 63 if (!(rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) ||
64 rq == &q->flush_rq)
65 return rq;
66 rq = blk_do_flush(q, rq);
67 if (rq)
62 return rq; 68 return rq;
63 } 69 }
64 70
@@ -132,14 +138,6 @@ static inline int queue_congestion_off_threshold(struct request_queue *q)
132 return q->nr_congestion_off; 138 return q->nr_congestion_off;
133} 139}
134 140
135#if defined(CONFIG_BLK_DEV_INTEGRITY)
136
137#define rq_for_each_integrity_segment(bvl, _rq, _iter) \
138 __rq_for_each_bio(_iter.bio, _rq) \
139 bip_for_each_vec(bvl, _iter.bio->bi_integrity, _iter.i)
140
141#endif /* BLK_DEV_INTEGRITY */
142
143static inline int blk_cpu_to_group(int cpu) 141static inline int blk_cpu_to_group(int cpu)
144{ 142{
145 int group = NR_CPUS; 143 int group = NR_CPUS;
diff --git a/block/bsg.c b/block/bsg.c
index 0c00870553a3..f20d6a789d48 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -20,7 +20,6 @@
20#include <linux/uio.h> 20#include <linux/uio.h>
21#include <linux/idr.h> 21#include <linux/idr.h>
22#include <linux/bsg.h> 22#include <linux/bsg.h>
23#include <linux/smp_lock.h>
24#include <linux/slab.h> 23#include <linux/slab.h>
25 24
26#include <scsi/scsi.h> 25#include <scsi/scsi.h>
@@ -843,9 +842,7 @@ static int bsg_open(struct inode *inode, struct file *file)
843{ 842{
844 struct bsg_device *bd; 843 struct bsg_device *bd;
845 844
846 lock_kernel();
847 bd = bsg_get_device(inode, file); 845 bd = bsg_get_device(inode, file);
848 unlock_kernel();
849 846
850 if (IS_ERR(bd)) 847 if (IS_ERR(bd))
851 return PTR_ERR(bd); 848 return PTR_ERR(bd);
@@ -968,6 +965,7 @@ static const struct file_operations bsg_fops = {
968 .release = bsg_release, 965 .release = bsg_release,
969 .unlocked_ioctl = bsg_ioctl, 966 .unlocked_ioctl = bsg_ioctl,
970 .owner = THIS_MODULE, 967 .owner = THIS_MODULE,
968 .llseek = default_llseek,
971}; 969};
972 970
973void bsg_unregister_queue(struct request_queue *q) 971void bsg_unregister_queue(struct request_queue *q)
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index f90519430be6..73a58628f54a 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -160,6 +160,7 @@ enum wl_prio_t {
160 BE_WORKLOAD = 0, 160 BE_WORKLOAD = 0,
161 RT_WORKLOAD = 1, 161 RT_WORKLOAD = 1,
162 IDLE_WORKLOAD = 2, 162 IDLE_WORKLOAD = 2,
163 CFQ_PRIO_NR,
163}; 164};
164 165
165/* 166/*
@@ -184,10 +185,19 @@ struct cfq_group {
184 /* number of cfqq currently on this group */ 185 /* number of cfqq currently on this group */
185 int nr_cfqq; 186 int nr_cfqq;
186 187
187 /* Per group busy queus average. Useful for workload slice calc. */
188 unsigned int busy_queues_avg[2];
189 /* 188 /*
190 * rr lists of queues with requests, onle rr for each priority class. 189 * Per group busy queues average. Useful for workload slice calc. We
190 * create the array for each prio class but at run time it is used
191 * only for RT and BE class and slot for IDLE class remains unused.
192 * This is primarily done to avoid confusion and a gcc warning.
193 */
194 unsigned int busy_queues_avg[CFQ_PRIO_NR];
195 /*
196 * rr lists of queues with requests. We maintain service trees for
197 * RT and BE classes. These trees are subdivided in subclasses
198 * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE
199 * class there is no subclassification and all the cfq queues go on
200 * a single tree service_tree_idle.
191 * Counts are embedded in the cfq_rb_root 201 * Counts are embedded in the cfq_rb_root
192 */ 202 */
193 struct cfq_rb_root service_trees[2][3]; 203 struct cfq_rb_root service_trees[2][3];
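
The new CFQ_PRIO_NR enumerator exists only to size busy_queues_avg[] so every priority class has a slot and gcc stops warning about indexing with IDLE_WORKLOAD. The terminal-count-member idiom in isolation (generic names, not taken from the patch):

#include <stdio.h>

enum wl_class {
	WL_BE = 0,
	WL_RT,
	WL_IDLE,
	WL_NR,			/* count of classes, keeps arrays in sync */
};

int main(void)
{
	/* Adding a class before WL_NR automatically grows the array. */
	unsigned int avg[WL_NR] = { 0 };

	avg[WL_IDLE] = 1;	/* indexable even if unused at run time */
	printf("%zu slots, idle=%u\n",
	       sizeof(avg) / sizeof(avg[0]), avg[WL_IDLE]);
	return 0;
}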
@@ -221,7 +231,6 @@ struct cfq_data {
221 enum wl_type_t serving_type; 231 enum wl_type_t serving_type;
222 unsigned long workload_expires; 232 unsigned long workload_expires;
223 struct cfq_group *serving_group; 233 struct cfq_group *serving_group;
224 bool noidle_tree_requires_idle;
225 234
226 /* 235 /*
227 * Each priority tree is sorted by next_request position. These 236 * Each priority tree is sorted by next_request position. These
@@ -977,8 +986,8 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
977 return NULL; 986 return NULL;
978} 987}
979 988
980void 989void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
981cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight) 990 unsigned int weight)
982{ 991{
983 cfqg_of_blkg(blkg)->weight = weight; 992 cfqg_of_blkg(blkg)->weight = weight;
984} 993}
@@ -2180,7 +2189,6 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
2180 slice = max_t(unsigned, slice, CFQ_MIN_TT); 2189 slice = max_t(unsigned, slice, CFQ_MIN_TT);
2181 cfq_log(cfqd, "workload slice:%d", slice); 2190 cfq_log(cfqd, "workload slice:%d", slice);
2182 cfqd->workload_expires = jiffies + slice; 2191 cfqd->workload_expires = jiffies + slice;
2183 cfqd->noidle_tree_requires_idle = false;
2184} 2192}
2185 2193
2186static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) 2194static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
@@ -3188,7 +3196,9 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3188 if (cfqq->queued[0] + cfqq->queued[1] >= 4) 3196 if (cfqq->queued[0] + cfqq->queued[1] >= 4)
3189 cfq_mark_cfqq_deep(cfqq); 3197 cfq_mark_cfqq_deep(cfqq);
3190 3198
3191 if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || 3199 if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))
3200 enable_idle = 0;
3201 else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
3192 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) 3202 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
3193 enable_idle = 0; 3203 enable_idle = 0;
3194 else if (sample_valid(cic->ttime_samples)) { 3204 else if (sample_valid(cic->ttime_samples)) {
@@ -3509,17 +3519,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
3509 cfq_slice_expired(cfqd, 1); 3519 cfq_slice_expired(cfqd, 1);
3510 else if (sync && cfqq_empty && 3520 else if (sync && cfqq_empty &&
3511 !cfq_close_cooperator(cfqd, cfqq)) { 3521 !cfq_close_cooperator(cfqd, cfqq)) {
3512 cfqd->noidle_tree_requires_idle |= 3522 cfq_arm_slice_timer(cfqd);
3513 !(rq->cmd_flags & REQ_NOIDLE);
3514 /*
3515 * Idling is enabled for SYNC_WORKLOAD.
3516 * SYNC_NOIDLE_WORKLOAD idles at the end of the tree
3517 * only if we processed at least one !REQ_NOIDLE request
3518 */
3519 if (cfqd->serving_type == SYNC_WORKLOAD
3520 || cfqd->noidle_tree_requires_idle
3521 || cfqq->cfqg->nr_cfqq == 1)
3522 cfq_arm_slice_timer(cfqd);
3523 } 3523 }
3524 } 3524 }
3525 3525
@@ -4105,6 +4105,7 @@ static struct blkio_policy_type blkio_policy_cfq = {
4105 .blkio_unlink_group_fn = cfq_unlink_blkio_group, 4105 .blkio_unlink_group_fn = cfq_unlink_blkio_group,
4106 .blkio_update_group_weight_fn = cfq_update_blkio_group_weight, 4106 .blkio_update_group_weight_fn = cfq_update_blkio_group_weight,
4107 }, 4107 },
4108 .plid = BLKIO_POLICY_PROP,
4108}; 4109};
4109#else 4110#else
4110static struct blkio_policy_type blkio_policy_cfq; 4111static struct blkio_policy_type blkio_policy_cfq;
diff --git a/block/cfq.h b/block/cfq.h
index 93448e5a2e41..54a6d90f8e8c 100644
--- a/block/cfq.h
+++ b/block/cfq.h
@@ -69,7 +69,7 @@ static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg,
69 69
70static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 70static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
71 struct blkio_group *blkg, void *key, dev_t dev) { 71 struct blkio_group *blkg, void *key, dev_t dev) {
72 blkiocg_add_blkio_group(blkcg, blkg, key, dev); 72 blkiocg_add_blkio_group(blkcg, blkg, key, dev, BLKIO_POLICY_PROP);
73} 73}
74 74
75static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg) 75static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg)
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index 119f07b74dc0..58c6ee5b010c 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -744,13 +744,13 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
744 bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE; 744 bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE;
745 return 0; 745 return 0;
746 case BLKGETSIZE: 746 case BLKGETSIZE:
747 size = bdev->bd_inode->i_size; 747 size = i_size_read(bdev->bd_inode);
748 if ((size >> 9) > ~0UL) 748 if ((size >> 9) > ~0UL)
749 return -EFBIG; 749 return -EFBIG;
750 return compat_put_ulong(arg, size >> 9); 750 return compat_put_ulong(arg, size >> 9);
751 751
752 case BLKGETSIZE64_32: 752 case BLKGETSIZE64_32:
753 return compat_put_u64(arg, bdev->bd_inode->i_size); 753 return compat_put_u64(arg, i_size_read(bdev->bd_inode));
754 754
755 case BLKTRACESETUP32: 755 case BLKTRACESETUP32:
756 case BLKTRACESTART: /* compatible */ 756 case BLKTRACESTART: /* compatible */
diff --git a/block/elevator.c b/block/elevator.c
index 4e11559aa2b0..2569512830d3 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -429,7 +429,7 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq)
429 q->nr_sorted--; 429 q->nr_sorted--;
430 430
431 boundary = q->end_sector; 431 boundary = q->end_sector;
432 stop_flags = REQ_SOFTBARRIER | REQ_HARDBARRIER | REQ_STARTED; 432 stop_flags = REQ_SOFTBARRIER | REQ_STARTED;
433 list_for_each_prev(entry, &q->queue_head) { 433 list_for_each_prev(entry, &q->queue_head) {
434 struct request *pos = list_entry_rq(entry); 434 struct request *pos = list_entry_rq(entry);
435 435
@@ -617,8 +617,6 @@ void elv_quiesce_end(struct request_queue *q)
617 617
618void elv_insert(struct request_queue *q, struct request *rq, int where) 618void elv_insert(struct request_queue *q, struct request *rq, int where)
619{ 619{
620 struct list_head *pos;
621 unsigned ordseq;
622 int unplug_it = 1; 620 int unplug_it = 1;
623 621
624 trace_block_rq_insert(q, rq); 622 trace_block_rq_insert(q, rq);
@@ -626,9 +624,16 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
626 rq->q = q; 624 rq->q = q;
627 625
628 switch (where) { 626 switch (where) {
627 case ELEVATOR_INSERT_REQUEUE:
628 /*
629 * Most requeues happen because of a busy condition,
630 * don't force unplug of the queue for that case.
631 * Clear unplug_it and fall through.
632 */
633 unplug_it = 0;
634
629 case ELEVATOR_INSERT_FRONT: 635 case ELEVATOR_INSERT_FRONT:
630 rq->cmd_flags |= REQ_SOFTBARRIER; 636 rq->cmd_flags |= REQ_SOFTBARRIER;
631
632 list_add(&rq->queuelist, &q->queue_head); 637 list_add(&rq->queuelist, &q->queue_head);
633 break; 638 break;
634 639
@@ -668,36 +673,6 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
668 q->elevator->ops->elevator_add_req_fn(q, rq); 673 q->elevator->ops->elevator_add_req_fn(q, rq);
669 break; 674 break;
670 675
671 case ELEVATOR_INSERT_REQUEUE:
672 /*
673 * If ordered flush isn't in progress, we do front
674 * insertion; otherwise, requests should be requeued
675 * in ordseq order.
676 */
677 rq->cmd_flags |= REQ_SOFTBARRIER;
678
679 /*
680 * Most requeues happen because of a busy condition,
681 * don't force unplug of the queue for that case.
682 */
683 unplug_it = 0;
684
685 if (q->ordseq == 0) {
686 list_add(&rq->queuelist, &q->queue_head);
687 break;
688 }
689
690 ordseq = blk_ordered_req_seq(rq);
691
692 list_for_each(pos, &q->queue_head) {
693 struct request *pos_rq = list_entry_rq(pos);
694 if (ordseq <= blk_ordered_req_seq(pos_rq))
695 break;
696 }
697
698 list_add_tail(&rq->queuelist, pos);
699 break;
700
701 default: 676 default:
702 printk(KERN_ERR "%s: bad insertion point %d\n", 677 printk(KERN_ERR "%s: bad insertion point %d\n",
703 __func__, where); 678 __func__, where);
@@ -716,26 +691,8 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
716void __elv_add_request(struct request_queue *q, struct request *rq, int where, 691void __elv_add_request(struct request_queue *q, struct request *rq, int where,
717 int plug) 692 int plug)
718{ 693{
719 if (q->ordcolor) 694 if (rq->cmd_flags & REQ_SOFTBARRIER) {
720 rq->cmd_flags |= REQ_ORDERED_COLOR; 695 /* barriers are scheduling boundary, update end_sector */
721
722 if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) {
723 /*
724 * toggle ordered color
725 */
726 if (rq->cmd_flags & REQ_HARDBARRIER)
727 q->ordcolor ^= 1;
728
729 /*
730 * barriers implicitly indicate back insertion
731 */
732 if (where == ELEVATOR_INSERT_SORT)
733 where = ELEVATOR_INSERT_BACK;
734
735 /*
736 * this request is scheduling boundary, update
737 * end_sector
738 */
739 if (rq->cmd_type == REQ_TYPE_FS || 696 if (rq->cmd_type == REQ_TYPE_FS ||
740 (rq->cmd_flags & REQ_DISCARD)) { 697 (rq->cmd_flags & REQ_DISCARD)) {
741 q->end_sector = rq_end_sector(rq); 698 q->end_sector = rq_end_sector(rq);
@@ -855,24 +812,6 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
855 e->ops->elevator_completed_req_fn) 812 e->ops->elevator_completed_req_fn)
856 e->ops->elevator_completed_req_fn(q, rq); 813 e->ops->elevator_completed_req_fn(q, rq);
857 } 814 }
858
859 /*
860 * Check if the queue is waiting for fs requests to be
861 * drained for flush sequence.
862 */
863 if (unlikely(q->ordseq)) {
864 struct request *next = NULL;
865
866 if (!list_empty(&q->queue_head))
867 next = list_entry_rq(q->queue_head.next);
868
869 if (!queue_in_flight(q) &&
870 blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN &&
871 (!next || blk_ordered_req_seq(next) > QUEUE_ORDSEQ_DRAIN)) {
872 blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0);
873 __blk_run_queue(q);
874 }
875 }
876} 815}
877 816
878#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr) 817#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)
diff --git a/block/genhd.c b/block/genhd.c
index 59a2db6fecef..5fa2b44a72ff 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -22,9 +22,7 @@
22#include "blk.h" 22#include "blk.h"
23 23
24static DEFINE_MUTEX(block_class_lock); 24static DEFINE_MUTEX(block_class_lock);
25#ifndef CONFIG_SYSFS_DEPRECATED
26struct kobject *block_depr; 25struct kobject *block_depr;
27#endif
28 26
29/* for extended dynamic devt allocation, currently only one major is used */ 27/* for extended dynamic devt allocation, currently only one major is used */
30#define MAX_EXT_DEVT (1 << MINORBITS) 28#define MAX_EXT_DEVT (1 << MINORBITS)
@@ -541,13 +539,15 @@ void add_disk(struct gendisk *disk)
541 disk->major = MAJOR(devt); 539 disk->major = MAJOR(devt);
542 disk->first_minor = MINOR(devt); 540 disk->first_minor = MINOR(devt);
543 541
542 /* Register BDI before referencing it from bdev */
543 bdi = &disk->queue->backing_dev_info;
544 bdi_register_dev(bdi, disk_devt(disk));
545
544 blk_register_region(disk_devt(disk), disk->minors, NULL, 546 blk_register_region(disk_devt(disk), disk->minors, NULL,
545 exact_match, exact_lock, disk); 547 exact_match, exact_lock, disk);
546 register_disk(disk); 548 register_disk(disk);
547 blk_register_queue(disk); 549 blk_register_queue(disk);
548 550
549 bdi = &disk->queue->backing_dev_info;
550 bdi_register_dev(bdi, disk_devt(disk));
551 retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, 551 retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
552 "bdi"); 552 "bdi");
553 WARN_ON(retval); 553 WARN_ON(retval);
@@ -642,6 +642,7 @@ void __init printk_all_partitions(void)
642 struct hd_struct *part; 642 struct hd_struct *part;
643 char name_buf[BDEVNAME_SIZE]; 643 char name_buf[BDEVNAME_SIZE];
644 char devt_buf[BDEVT_SIZE]; 644 char devt_buf[BDEVT_SIZE];
645 u8 uuid[PARTITION_META_INFO_UUIDLTH * 2 + 1];
645 646
646 /* 647 /*
647 * Don't show empty devices or things that have been 648 * Don't show empty devices or things that have been
@@ -660,10 +661,14 @@ void __init printk_all_partitions(void)
660 while ((part = disk_part_iter_next(&piter))) { 661 while ((part = disk_part_iter_next(&piter))) {
661 bool is_part0 = part == &disk->part0; 662 bool is_part0 = part == &disk->part0;
662 663
663 printk("%s%s %10llu %s", is_part0 ? "" : " ", 664 uuid[0] = 0;
665 if (part->info)
666 part_unpack_uuid(part->info->uuid, uuid);
667
668 printk("%s%s %10llu %s %s", is_part0 ? "" : " ",
664 bdevt_str(part_devt(part), devt_buf), 669 bdevt_str(part_devt(part), devt_buf),
665 (unsigned long long)part->nr_sects >> 1, 670 (unsigned long long)part->nr_sects >> 1,
666 disk_name(disk, part->partno, name_buf)); 671 disk_name(disk, part->partno, name_buf), uuid);
667 if (is_part0) { 672 if (is_part0) {
668 if (disk->driverfs_dev != NULL && 673 if (disk->driverfs_dev != NULL &&
669 disk->driverfs_dev->driver != NULL) 674 disk->driverfs_dev->driver != NULL)
@@ -803,10 +808,9 @@ static int __init genhd_device_init(void)
803 808
804 register_blkdev(BLOCK_EXT_MAJOR, "blkext"); 809 register_blkdev(BLOCK_EXT_MAJOR, "blkext");
805 810
806#ifndef CONFIG_SYSFS_DEPRECATED
807 /* create top-level block dir */ 811 /* create top-level block dir */
808 block_depr = kobject_create_and_add("block", NULL); 812 if (!sysfs_deprecated)
809#endif 813 block_depr = kobject_create_and_add("block", NULL);
810 return 0; 814 return 0;
811} 815}
812 816
@@ -1004,6 +1008,7 @@ static void disk_release(struct device *dev)
1004 kfree(disk->random); 1008 kfree(disk->random);
1005 disk_replace_part_tbl(disk, NULL); 1009 disk_replace_part_tbl(disk, NULL);
1006 free_part_stats(&disk->part0); 1010 free_part_stats(&disk->part0);
1011 free_part_info(&disk->part0);
1007 kfree(disk); 1012 kfree(disk);
1008} 1013}
1009struct class block_class = { 1014struct class block_class = {
diff --git a/block/ioctl.c b/block/ioctl.c
index d8052f0dabd3..3d866d0037f2 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -62,7 +62,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
62 62
63 /* all seems OK */ 63 /* all seems OK */
64 part = add_partition(disk, partno, start, length, 64 part = add_partition(disk, partno, start, length,
65 ADDPART_FLAG_NONE); 65 ADDPART_FLAG_NONE, NULL);
66 mutex_unlock(&bdev->bd_mutex); 66 mutex_unlock(&bdev->bd_mutex);
67 return IS_ERR(part) ? PTR_ERR(part) : 0; 67 return IS_ERR(part) ? PTR_ERR(part) : 0;
68 case BLKPG_DEL_PARTITION: 68 case BLKPG_DEL_PARTITION:
@@ -116,7 +116,7 @@ static int blkdev_reread_part(struct block_device *bdev)
116static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, 116static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
117 uint64_t len, int secure) 117 uint64_t len, int secure)
118{ 118{
119 unsigned long flags = BLKDEV_IFL_WAIT; 119 unsigned long flags = 0;
120 120
121 if (start & 511) 121 if (start & 511)
122 return -EINVAL; 122 return -EINVAL;
@@ -125,10 +125,10 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
125 start >>= 9; 125 start >>= 9;
126 len >>= 9; 126 len >>= 9;
127 127
128 if (start + len > (bdev->bd_inode->i_size >> 9)) 128 if (start + len > (i_size_read(bdev->bd_inode) >> 9))
129 return -EINVAL; 129 return -EINVAL;
130 if (secure) 130 if (secure)
131 flags |= BLKDEV_IFL_SECURE; 131 flags |= BLKDEV_DISCARD_SECURE;
132 return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags); 132 return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags);
133} 133}
134 134
@@ -242,6 +242,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
242 * We need to set the startsect first, the driver may 242 * We need to set the startsect first, the driver may
243 * want to override it. 243 * want to override it.
244 */ 244 */
245 memset(&geo, 0, sizeof(geo));
245 geo.start = get_start_sect(bdev); 246 geo.start = get_start_sect(bdev);
246 ret = disk->fops->getgeo(bdev, &geo); 247 ret = disk->fops->getgeo(bdev, &geo);
247 if (ret) 248 if (ret)
@@ -307,12 +308,12 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
307 ret = blkdev_reread_part(bdev); 308 ret = blkdev_reread_part(bdev);
308 break; 309 break;
309 case BLKGETSIZE: 310 case BLKGETSIZE:
310 size = bdev->bd_inode->i_size; 311 size = i_size_read(bdev->bd_inode);
311 if ((size >> 9) > ~0UL) 312 if ((size >> 9) > ~0UL)
312 return -EFBIG; 313 return -EFBIG;
313 return put_ulong(arg, size >> 9); 314 return put_ulong(arg, size >> 9);
314 case BLKGETSIZE64: 315 case BLKGETSIZE64:
315 return put_u64(arg, bdev->bd_inode->i_size); 316 return put_u64(arg, i_size_read(bdev->bd_inode));
316 case BLKTRACESTART: 317 case BLKTRACESTART:
317 case BLKTRACESTOP: 318 case BLKTRACESTOP:
318 case BLKTRACESETUP: 319 case BLKTRACESETUP:
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index a8b5a10eb5b0..4f4230b79bb6 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -321,33 +321,47 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
321 if (hdr->iovec_count) { 321 if (hdr->iovec_count) {
322 const int size = sizeof(struct sg_iovec) * hdr->iovec_count; 322 const int size = sizeof(struct sg_iovec) * hdr->iovec_count;
323 size_t iov_data_len; 323 size_t iov_data_len;
324 struct sg_iovec *iov; 324 struct sg_iovec *sg_iov;
325 struct iovec *iov;
326 int i;
325 327
326 iov = kmalloc(size, GFP_KERNEL); 328 sg_iov = kmalloc(size, GFP_KERNEL);
327 if (!iov) { 329 if (!sg_iov) {
328 ret = -ENOMEM; 330 ret = -ENOMEM;
329 goto out; 331 goto out;
330 } 332 }
331 333
332 if (copy_from_user(iov, hdr->dxferp, size)) { 334 if (copy_from_user(sg_iov, hdr->dxferp, size)) {
333 kfree(iov); 335 kfree(sg_iov);
334 ret = -EFAULT; 336 ret = -EFAULT;
335 goto out; 337 goto out;
336 } 338 }
337 339
340 /*
341 * Sum up the vecs, making sure they don't overflow
342 */
343 iov = (struct iovec *) sg_iov;
344 iov_data_len = 0;
345 for (i = 0; i < hdr->iovec_count; i++) {
346 if (iov_data_len + iov[i].iov_len < iov_data_len) {
347 kfree(sg_iov);
348 ret = -EINVAL;
349 goto out;
350 }
351 iov_data_len += iov[i].iov_len;
352 }
353
338 /* SG_IO howto says that the shorter of the two wins */ 354 /* SG_IO howto says that the shorter of the two wins */
339 iov_data_len = iov_length((struct iovec *)iov,
340 hdr->iovec_count);
341 if (hdr->dxfer_len < iov_data_len) { 355 if (hdr->dxfer_len < iov_data_len) {
342 hdr->iovec_count = iov_shorten((struct iovec *)iov, 356 hdr->iovec_count = iov_shorten(iov,
343 hdr->iovec_count, 357 hdr->iovec_count,
344 hdr->dxfer_len); 358 hdr->dxfer_len);
345 iov_data_len = hdr->dxfer_len; 359 iov_data_len = hdr->dxfer_len;
346 } 360 }
347 361
348 ret = blk_rq_map_user_iov(q, rq, NULL, iov, hdr->iovec_count, 362 ret = blk_rq_map_user_iov(q, rq, NULL, sg_iov, hdr->iovec_count,
349 iov_data_len, GFP_KERNEL); 363 iov_data_len, GFP_KERNEL);
350 kfree(iov); 364 kfree(sg_iov);
351 } else if (hdr->dxfer_len) 365 } else if (hdr->dxfer_len)
352 ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len, 366 ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len,
353 GFP_KERNEL); 367 GFP_KERNEL);
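
The added loop rejects iovecs whose lengths wrap around when summed, something the earlier iov_length() call could not report. The unsigned wrap-around test it relies on, shown in isolation with generic names (not the sg_io() code):

#include <stddef.h>
#include <stdio.h>

/* Returns 0 and stores the total in *sum, or -1 if the lengths overflow. */
static int sum_lengths(const size_t *len, int n, size_t *sum)
{
	size_t total = 0;

	for (int i = 0; i < n; i++) {
		if (total + len[i] < total)	/* unsigned wrap-around */
			return -1;
		total += len[i];
	}
	*sum = total;
	return 0;
}

int main(void)
{
	size_t lens[] = { 4096, (size_t)-1 };	/* second entry forces a wrap */
	size_t total;

	if (sum_lengths(lens, 2, &total))
		printf("rejected: iovec lengths overflow\n");
	else
		printf("total %zu\n", total);
	return 0;
}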