Diffstat (limited to 'block')
-rw-r--r--  block/Kconfig             12
-rw-r--r--  block/Makefile             3
-rw-r--r--  block/blk-barrier.c      350
-rw-r--r--  block/blk-cgroup.c       806
-rw-r--r--  block/blk-cgroup.h        87
-rw-r--r--  block/blk-core.c         134
-rw-r--r--  block/blk-exec.c           9
-rw-r--r--  block/blk-flush.c        262
-rw-r--r--  block/blk-integrity.c     94
-rw-r--r--  block/blk-lib.c           41
-rw-r--r--  block/blk-map.c            7
-rw-r--r--  block/blk-merge.c         37
-rw-r--r--  block/blk-settings.c      32
-rw-r--r--  block/blk-sysfs.c         14
-rw-r--r--  block/blk-throttle.c    1123
-rw-r--r--  block/blk.h               28
-rw-r--r--  block/bsg.c                6
-rw-r--r--  block/cfq-iosched.c      158
-rw-r--r--  block/cfq.h                2
-rw-r--r--  block/compat_ioctl.c       1
-rw-r--r--  block/elevator.c         137
-rw-r--r--  block/genhd.c             37
-rw-r--r--  block/ioctl.c             17
23 files changed, 2608 insertions, 789 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 9be0b56eaee1..6c9213ef15a1 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -77,6 +77,18 @@ config BLK_DEV_INTEGRITY
77 T10/SCSI Data Integrity Field or the T13/ATA External Path 77 T10/SCSI Data Integrity Field or the T13/ATA External Path
78 Protection. If in doubt, say N. 78 Protection. If in doubt, say N.
79 79
80config BLK_DEV_THROTTLING
81 bool "Block layer bio throttling support"
82 depends on BLK_CGROUP=y && EXPERIMENTAL
83 default n
84 ---help---
85 Block layer bio throttling support. It can be used to limit
86 the IO rate to a device. IO rate policies are per cgroup and
87 one needs to mount and use blkio cgroup controller for creating
88 cgroups and specifying per device IO rate policies.
89
90 See Documentation/cgroups/blkio-controller.txt for more information.
91
80endif # BLOCK 92endif # BLOCK
81 93
82config BLOCK_COMPAT 94config BLOCK_COMPAT
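
The help text above tells users to mount the blkio cgroup controller and set per-device IO rate policies. As a rough, hedged illustration of the interface this series adds (the cgroup mount point, group name and device numbers below are assumptions, not part of the patch), a userspace program limits reads on one device by writing a "major:minor value" line into the new throttle file; writing a value of 0 removes the rule:

/* Sketch only: cap reads on device 8:16 at 1 MiB/s through the
 * blkio.throttle.read_bps_device file added by this series.  The
 * mount point and device numbers are illustrative assumptions. */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *path =
		"/cgroup/blkio/grp1/blkio.throttle.read_bps_device";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror("fopen");
		return EXIT_FAILURE;
	}
	/* Format parsed by blkio_policy_parse_and_set(): "major:minor value" */
	fprintf(f, "8:16 1048576\n");
	return fclose(f) ? EXIT_FAILURE : EXIT_SUCCESS;
}
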
diff --git a/block/Makefile b/block/Makefile
index 0bb499a739cd..0fec4b3fab51 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -3,12 +3,13 @@
3# 3#
4 4
5obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ 5obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
6 blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \ 6 blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
7 blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ 7 blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
8 blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o 8 blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o
9 9
10obj-$(CONFIG_BLK_DEV_BSG) += bsg.o 10obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
11obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o 11obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
12obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
12obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o 13obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
13obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o 14obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
14obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o 15obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
diff --git a/block/blk-barrier.c b/block/blk-barrier.c
deleted file mode 100644
index f0faefca032f..000000000000
--- a/block/blk-barrier.c
+++ /dev/null
@@ -1,350 +0,0 @@
1/*
2 * Functions related to barrier IO handling
3 */
4#include <linux/kernel.h>
5#include <linux/module.h>
6#include <linux/bio.h>
7#include <linux/blkdev.h>
8#include <linux/gfp.h>
9
10#include "blk.h"
11
12/**
13 * blk_queue_ordered - does this queue support ordered writes
14 * @q: the request queue
15 * @ordered: one of QUEUE_ORDERED_*
16 *
17 * Description:
18 * For journalled file systems, doing ordered writes on a commit
19 * block instead of explicitly doing wait_on_buffer (which is bad
20 * for performance) can be a big win. Block drivers supporting this
21 * feature should call this function and indicate so.
22 *
23 **/
24int blk_queue_ordered(struct request_queue *q, unsigned ordered)
25{
26 if (ordered != QUEUE_ORDERED_NONE &&
27 ordered != QUEUE_ORDERED_DRAIN &&
28 ordered != QUEUE_ORDERED_DRAIN_FLUSH &&
29 ordered != QUEUE_ORDERED_DRAIN_FUA &&
30 ordered != QUEUE_ORDERED_TAG &&
31 ordered != QUEUE_ORDERED_TAG_FLUSH &&
32 ordered != QUEUE_ORDERED_TAG_FUA) {
33 printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered);
34 return -EINVAL;
35 }
36
37 q->ordered = ordered;
38 q->next_ordered = ordered;
39
40 return 0;
41}
42EXPORT_SYMBOL(blk_queue_ordered);
43
44/*
45 * Cache flushing for ordered writes handling
46 */
47unsigned blk_ordered_cur_seq(struct request_queue *q)
48{
49 if (!q->ordseq)
50 return 0;
51 return 1 << ffz(q->ordseq);
52}
53
54unsigned blk_ordered_req_seq(struct request *rq)
55{
56 struct request_queue *q = rq->q;
57
58 BUG_ON(q->ordseq == 0);
59
60 if (rq == &q->pre_flush_rq)
61 return QUEUE_ORDSEQ_PREFLUSH;
62 if (rq == &q->bar_rq)
63 return QUEUE_ORDSEQ_BAR;
64 if (rq == &q->post_flush_rq)
65 return QUEUE_ORDSEQ_POSTFLUSH;
66
67 /*
68 * !fs requests don't need to follow barrier ordering. Always
69 * put them at the front. This fixes the following deadlock.
70 *
71 * http://thread.gmane.org/gmane.linux.kernel/537473
72 */
73 if (rq->cmd_type != REQ_TYPE_FS)
74 return QUEUE_ORDSEQ_DRAIN;
75
76 if ((rq->cmd_flags & REQ_ORDERED_COLOR) ==
77 (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR))
78 return QUEUE_ORDSEQ_DRAIN;
79 else
80 return QUEUE_ORDSEQ_DONE;
81}
82
83bool blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error)
84{
85 struct request *rq;
86
87 if (error && !q->orderr)
88 q->orderr = error;
89
90 BUG_ON(q->ordseq & seq);
91 q->ordseq |= seq;
92
93 if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE)
94 return false;
95
96 /*
97 * Okay, sequence complete.
98 */
99 q->ordseq = 0;
100 rq = q->orig_bar_rq;
101 __blk_end_request_all(rq, q->orderr);
102 return true;
103}
104
105static void pre_flush_end_io(struct request *rq, int error)
106{
107 elv_completed_request(rq->q, rq);
108 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error);
109}
110
111static void bar_end_io(struct request *rq, int error)
112{
113 elv_completed_request(rq->q, rq);
114 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error);
115}
116
117static void post_flush_end_io(struct request *rq, int error)
118{
119 elv_completed_request(rq->q, rq);
120 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error);
121}
122
123static void queue_flush(struct request_queue *q, unsigned which)
124{
125 struct request *rq;
126 rq_end_io_fn *end_io;
127
128 if (which == QUEUE_ORDERED_DO_PREFLUSH) {
129 rq = &q->pre_flush_rq;
130 end_io = pre_flush_end_io;
131 } else {
132 rq = &q->post_flush_rq;
133 end_io = post_flush_end_io;
134 }
135
136 blk_rq_init(q, rq);
137 rq->cmd_type = REQ_TYPE_FS;
138 rq->cmd_flags = REQ_HARDBARRIER | REQ_FLUSH;
139 rq->rq_disk = q->orig_bar_rq->rq_disk;
140 rq->end_io = end_io;
141
142 elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
143}
144
145static inline bool start_ordered(struct request_queue *q, struct request **rqp)
146{
147 struct request *rq = *rqp;
148 unsigned skip = 0;
149
150 q->orderr = 0;
151 q->ordered = q->next_ordered;
152 q->ordseq |= QUEUE_ORDSEQ_STARTED;
153
154 /*
155 * For an empty barrier, there's no actual BAR request, which
156 * in turn makes POSTFLUSH unnecessary. Mask them off.
157 */
158 if (!blk_rq_sectors(rq)) {
159 q->ordered &= ~(QUEUE_ORDERED_DO_BAR |
160 QUEUE_ORDERED_DO_POSTFLUSH);
161 /*
162 * Empty barrier on a write-through device w/ ordered
163 * tag has no command to issue and without any command
164 * to issue, ordering by tag can't be used. Drain
165 * instead.
166 */
167 if ((q->ordered & QUEUE_ORDERED_BY_TAG) &&
168 !(q->ordered & QUEUE_ORDERED_DO_PREFLUSH)) {
169 q->ordered &= ~QUEUE_ORDERED_BY_TAG;
170 q->ordered |= QUEUE_ORDERED_BY_DRAIN;
171 }
172 }
173
174 /* stash away the original request */
175 blk_dequeue_request(rq);
176 q->orig_bar_rq = rq;
177 rq = NULL;
178
179 /*
180 * Queue ordered sequence. As we stack them at the head, we
181 * need to queue in reverse order. Note that we rely on that
182 * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs
183 * request gets inbetween ordered sequence.
184 */
185 if (q->ordered & QUEUE_ORDERED_DO_POSTFLUSH) {
186 queue_flush(q, QUEUE_ORDERED_DO_POSTFLUSH);
187 rq = &q->post_flush_rq;
188 } else
189 skip |= QUEUE_ORDSEQ_POSTFLUSH;
190
191 if (q->ordered & QUEUE_ORDERED_DO_BAR) {
192 rq = &q->bar_rq;
193
194 /* initialize proxy request and queue it */
195 blk_rq_init(q, rq);
196 if (bio_data_dir(q->orig_bar_rq->bio) == WRITE)
197 rq->cmd_flags |= REQ_WRITE;
198 if (q->ordered & QUEUE_ORDERED_DO_FUA)
199 rq->cmd_flags |= REQ_FUA;
200 init_request_from_bio(rq, q->orig_bar_rq->bio);
201 rq->end_io = bar_end_io;
202
203 elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
204 } else
205 skip |= QUEUE_ORDSEQ_BAR;
206
207 if (q->ordered & QUEUE_ORDERED_DO_PREFLUSH) {
208 queue_flush(q, QUEUE_ORDERED_DO_PREFLUSH);
209 rq = &q->pre_flush_rq;
210 } else
211 skip |= QUEUE_ORDSEQ_PREFLUSH;
212
213 if ((q->ordered & QUEUE_ORDERED_BY_DRAIN) && queue_in_flight(q))
214 rq = NULL;
215 else
216 skip |= QUEUE_ORDSEQ_DRAIN;
217
218 *rqp = rq;
219
220 /*
221 * Complete skipped sequences. If whole sequence is complete,
222 * return false to tell elevator that this request is gone.
223 */
224 return !blk_ordered_complete_seq(q, skip, 0);
225}
226
227bool blk_do_ordered(struct request_queue *q, struct request **rqp)
228{
229 struct request *rq = *rqp;
230 const int is_barrier = rq->cmd_type == REQ_TYPE_FS &&
231 (rq->cmd_flags & REQ_HARDBARRIER);
232
233 if (!q->ordseq) {
234 if (!is_barrier)
235 return true;
236
237 if (q->next_ordered != QUEUE_ORDERED_NONE)
238 return start_ordered(q, rqp);
239 else {
240 /*
241 * Queue ordering not supported. Terminate
242 * with prejudice.
243 */
244 blk_dequeue_request(rq);
245 __blk_end_request_all(rq, -EOPNOTSUPP);
246 *rqp = NULL;
247 return false;
248 }
249 }
250
251 /*
252 * Ordered sequence in progress
253 */
254
255 /* Special requests are not subject to ordering rules. */
256 if (rq->cmd_type != REQ_TYPE_FS &&
257 rq != &q->pre_flush_rq && rq != &q->post_flush_rq)
258 return true;
259
260 if (q->ordered & QUEUE_ORDERED_BY_TAG) {
261 /* Ordered by tag. Blocking the next barrier is enough. */
262 if (is_barrier && rq != &q->bar_rq)
263 *rqp = NULL;
264 } else {
265 /* Ordered by draining. Wait for turn. */
266 WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q));
267 if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q))
268 *rqp = NULL;
269 }
270
271 return true;
272}
273
274static void bio_end_empty_barrier(struct bio *bio, int err)
275{
276 if (err) {
277 if (err == -EOPNOTSUPP)
278 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
279 clear_bit(BIO_UPTODATE, &bio->bi_flags);
280 }
281 if (bio->bi_private)
282 complete(bio->bi_private);
283 bio_put(bio);
284}
285
286/**
287 * blkdev_issue_flush - queue a flush
288 * @bdev: blockdev to issue flush for
289 * @gfp_mask: memory allocation flags (for bio_alloc)
290 * @error_sector: error sector
291 * @flags: BLKDEV_IFL_* flags to control behaviour
292 *
293 * Description:
294 * Issue a flush for the block device in question. Caller can supply
295 * room for storing the error offset in case of a flush error, if they
296 * wish to. If WAIT flag is not passed then caller may check only what
297 * request was pushed in some internal queue for later handling.
298 */
299int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
300 sector_t *error_sector, unsigned long flags)
301{
302 DECLARE_COMPLETION_ONSTACK(wait);
303 struct request_queue *q;
304 struct bio *bio;
305 int ret = 0;
306
307 if (bdev->bd_disk == NULL)
308 return -ENXIO;
309
310 q = bdev_get_queue(bdev);
311 if (!q)
312 return -ENXIO;
313
314 /*
315 * some block devices may not have their queue correctly set up here
316 * (e.g. loop device without a backing file) and so issuing a flush
317 * here will panic. Ensure there is a request function before issuing
318 * the barrier.
319 */
320 if (!q->make_request_fn)
321 return -ENXIO;
322
323 bio = bio_alloc(gfp_mask, 0);
324 bio->bi_end_io = bio_end_empty_barrier;
325 bio->bi_bdev = bdev;
326 if (test_bit(BLKDEV_WAIT, &flags))
327 bio->bi_private = &wait;
328
329 bio_get(bio);
330 submit_bio(WRITE_BARRIER, bio);
331 if (test_bit(BLKDEV_WAIT, &flags)) {
332 wait_for_completion(&wait);
333 /*
334 * The driver must store the error location in ->bi_sector, if
335 * it supports it. For non-stacked drivers, this should be
336 * copied from blk_rq_pos(rq).
337 */
338 if (error_sector)
339 *error_sector = bio->bi_sector;
340 }
341
342 if (bio_flagged(bio, BIO_EOPNOTSUPP))
343 ret = -EOPNOTSUPP;
344 else if (!bio_flagged(bio, BIO_UPTODATE))
345 ret = -EIO;
346
347 bio_put(bio);
348 return ret;
349}
350EXPORT_SYMBOL(blkdev_issue_flush);
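
For context on the interfaces deleted above: the kernel-doc comments describe how a driver advertised its ordering capability and how a filesystem issued a cache flush. A minimal, hedged sketch of the two call sites as they looked before this removal (placeholder functions; the BLKDEV_IFL_WAIT name follows the BLKDEV_IFL_* convention mentioned in the comment and should be treated as an assumption):

/* Illustrative only -- call patterns for the removed interfaces. */
#include <linux/blkdev.h>

static void example_driver_setup(struct request_queue *q)
{
	/* Advertise: drain the queue and flush the cache around barriers. */
	blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH);
}

static int example_fs_commit(struct block_device *bdev)
{
	/* Flush the device's volatile write cache and wait for it. */
	int ret = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);

	if (ret == -EOPNOTSUPP)
		ret = 0;	/* no cache to flush; not an error */
	return ret;
}
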
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index a6809645d212..b1febd0f6d2a 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -37,6 +37,12 @@ static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
37static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *); 37static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
38static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *); 38static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
39 39
40/* for encoding cft->private value on file */
41#define BLKIOFILE_PRIVATE(x, val) (((x) << 16) | (val))
42/* What policy owns the file, proportional or throttle */
43#define BLKIOFILE_POLICY(val) (((val) >> 16) & 0xffff)
44#define BLKIOFILE_ATTR(val) ((val) & 0xffff)
45
40struct cgroup_subsys blkio_subsys = { 46struct cgroup_subsys blkio_subsys = {
41 .name = "blkio", 47 .name = "blkio",
42 .create = blkiocg_create, 48 .create = blkiocg_create,
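
The hunk above packs two values into cft->private: the owning policy id in the upper 16 bits and the per-policy file id in the lower 16. A small stand-alone sketch of the round trip (the enum values mirror those added to blk-cgroup.h later in this series):

/* Stand-alone illustration of the cft->private encoding used above. */
#include <assert.h>

#define BLKIOFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define BLKIOFILE_POLICY(val)		(((val) >> 16) & 0xffff)
#define BLKIOFILE_ATTR(val)		((val) & 0xffff)

enum blkio_policy_id {
	BLKIO_POLICY_PROP = 0,		/* proportional bandwidth division */
	BLKIO_POLICY_THROTL,		/* throttling */
};

enum blkcg_file_name_throtl {
	BLKIO_THROTL_read_bps_device,
	BLKIO_THROTL_write_bps_device,
};

int main(void)
{
	int priv = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				     BLKIO_THROTL_write_bps_device);

	/* Each cgroup file can recover both halves from cft->private. */
	assert(BLKIOFILE_POLICY(priv) == BLKIO_POLICY_THROTL);
	assert(BLKIOFILE_ATTR(priv) == BLKIO_THROTL_write_bps_device);
	return 0;
}
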
@@ -59,6 +65,27 @@ static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
59 list_add(&pn->node, &blkcg->policy_list); 65 list_add(&pn->node, &blkcg->policy_list);
60} 66}
61 67
68static inline bool cftype_blkg_same_policy(struct cftype *cft,
69 struct blkio_group *blkg)
70{
71 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
72
73 if (blkg->plid == plid)
74 return 1;
75
76 return 0;
77}
78
79/* Determines if policy node matches cgroup file being accessed */
80static inline bool pn_matches_cftype(struct cftype *cft,
81 struct blkio_policy_node *pn)
82{
83 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
84 int fileid = BLKIOFILE_ATTR(cft->private);
85
86 return (plid == pn->plid && fileid == pn->fileid);
87}
88
62/* Must be called with blkcg->lock held */ 89/* Must be called with blkcg->lock held */
63static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) 90static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
64{ 91{
@@ -67,12 +94,13 @@ static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
67 94
68/* Must be called with blkcg->lock held */ 95/* Must be called with blkcg->lock held */
69static struct blkio_policy_node * 96static struct blkio_policy_node *
70blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev) 97blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev,
98 enum blkio_policy_id plid, int fileid)
71{ 99{
72 struct blkio_policy_node *pn; 100 struct blkio_policy_node *pn;
73 101
74 list_for_each_entry(pn, &blkcg->policy_list, node) { 102 list_for_each_entry(pn, &blkcg->policy_list, node) {
75 if (pn->dev == dev) 103 if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid)
76 return pn; 104 return pn;
77 } 105 }
78 106
@@ -86,6 +114,67 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
86} 114}
87EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); 115EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
88 116
117static inline void
118blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight)
119{
120 struct blkio_policy_type *blkiop;
121
122 list_for_each_entry(blkiop, &blkio_list, list) {
123 /* If this policy does not own the blkg, do not send updates */
124 if (blkiop->plid != blkg->plid)
125 continue;
126 if (blkiop->ops.blkio_update_group_weight_fn)
127 blkiop->ops.blkio_update_group_weight_fn(blkg->key,
128 blkg, weight);
129 }
130}
131
132static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps,
133 int fileid)
134{
135 struct blkio_policy_type *blkiop;
136
137 list_for_each_entry(blkiop, &blkio_list, list) {
138
139 /* If this policy does not own the blkg, do not send updates */
140 if (blkiop->plid != blkg->plid)
141 continue;
142
143 if (fileid == BLKIO_THROTL_read_bps_device
144 && blkiop->ops.blkio_update_group_read_bps_fn)
145 blkiop->ops.blkio_update_group_read_bps_fn(blkg->key,
146 blkg, bps);
147
148 if (fileid == BLKIO_THROTL_write_bps_device
149 && blkiop->ops.blkio_update_group_write_bps_fn)
150 blkiop->ops.blkio_update_group_write_bps_fn(blkg->key,
151 blkg, bps);
152 }
153}
154
155static inline void blkio_update_group_iops(struct blkio_group *blkg,
156 unsigned int iops, int fileid)
157{
158 struct blkio_policy_type *blkiop;
159
160 list_for_each_entry(blkiop, &blkio_list, list) {
161
162 /* If this policy does not own the blkg, do not send updates */
163 if (blkiop->plid != blkg->plid)
164 continue;
165
166 if (fileid == BLKIO_THROTL_read_iops_device
167 && blkiop->ops.blkio_update_group_read_iops_fn)
168 blkiop->ops.blkio_update_group_read_iops_fn(blkg->key,
169 blkg, iops);
170
171 if (fileid == BLKIO_THROTL_write_iops_device
172 && blkiop->ops.blkio_update_group_write_iops_fn)
173 blkiop->ops.blkio_update_group_write_iops_fn(blkg->key,
174 blkg,iops);
175 }
176}
177
89/* 178/*
90 * Add to the appropriate stat variable depending on the request type. 179 * Add to the appropriate stat variable depending on the request type.
91 * This should be called with the blkg->stats_lock held. 180 * This should be called with the blkg->stats_lock held.
@@ -341,7 +430,8 @@ void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
341EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); 430EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
342 431
343void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 432void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
344 struct blkio_group *blkg, void *key, dev_t dev) 433 struct blkio_group *blkg, void *key, dev_t dev,
434 enum blkio_policy_id plid)
345{ 435{
346 unsigned long flags; 436 unsigned long flags;
347 437
@@ -350,6 +440,7 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
350 rcu_assign_pointer(blkg->key, key); 440 rcu_assign_pointer(blkg->key, key);
351 blkg->blkcg_id = css_id(&blkcg->css); 441 blkg->blkcg_id = css_id(&blkcg->css);
352 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); 442 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
443 blkg->plid = plid;
353 spin_unlock_irqrestore(&blkcg->lock, flags); 444 spin_unlock_irqrestore(&blkcg->lock, flags);
354 /* Need to take css reference ? */ 445 /* Need to take css reference ? */
355 cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); 446 cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
@@ -408,51 +499,6 @@ struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
408} 499}
409EXPORT_SYMBOL_GPL(blkiocg_lookup_group); 500EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
410 501
411#define SHOW_FUNCTION(__VAR) \
412static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup, \
413 struct cftype *cftype) \
414{ \
415 struct blkio_cgroup *blkcg; \
416 \
417 blkcg = cgroup_to_blkio_cgroup(cgroup); \
418 return (u64)blkcg->__VAR; \
419}
420
421SHOW_FUNCTION(weight);
422#undef SHOW_FUNCTION
423
424static int
425blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
426{
427 struct blkio_cgroup *blkcg;
428 struct blkio_group *blkg;
429 struct hlist_node *n;
430 struct blkio_policy_type *blkiop;
431 struct blkio_policy_node *pn;
432
433 if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
434 return -EINVAL;
435
436 blkcg = cgroup_to_blkio_cgroup(cgroup);
437 spin_lock(&blkio_list_lock);
438 spin_lock_irq(&blkcg->lock);
439 blkcg->weight = (unsigned int)val;
440
441 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
442 pn = blkio_policy_search_node(blkcg, blkg->dev);
443
444 if (pn)
445 continue;
446
447 list_for_each_entry(blkiop, &blkio_list, list)
448 blkiop->ops.blkio_update_group_weight_fn(blkg,
449 blkcg->weight);
450 }
451 spin_unlock_irq(&blkcg->lock);
452 spin_unlock(&blkio_list_lock);
453 return 0;
454}
455
456static int 502static int
457blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) 503blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
458{ 504{
@@ -593,52 +639,6 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg,
593 return disk_total; 639 return disk_total;
594} 640}
595 641
596#define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total) \
597static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \
598 struct cftype *cftype, struct cgroup_map_cb *cb) \
599{ \
600 struct blkio_cgroup *blkcg; \
601 struct blkio_group *blkg; \
602 struct hlist_node *n; \
603 uint64_t cgroup_total = 0; \
604 \
605 if (!cgroup_lock_live_group(cgroup)) \
606 return -ENODEV; \
607 \
608 blkcg = cgroup_to_blkio_cgroup(cgroup); \
609 rcu_read_lock(); \
610 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\
611 if (blkg->dev) { \
612 spin_lock_irq(&blkg->stats_lock); \
613 cgroup_total += blkio_get_stat(blkg, cb, \
614 blkg->dev, type); \
615 spin_unlock_irq(&blkg->stats_lock); \
616 } \
617 } \
618 if (show_total) \
619 cb->fill(cb, "Total", cgroup_total); \
620 rcu_read_unlock(); \
621 cgroup_unlock(); \
622 return 0; \
623}
624
625SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0);
626SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0);
627SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1);
628SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1);
629SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1);
630SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1);
631SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1);
632SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1);
633#ifdef CONFIG_DEBUG_BLK_CGROUP
634SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0);
635SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0);
636SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0);
637SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0);
638SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0);
639#endif
640#undef SHOW_FUNCTION_PER_GROUP
641
642static int blkio_check_dev_num(dev_t dev) 642static int blkio_check_dev_num(dev_t dev)
643{ 643{
644 int part = 0; 644 int part = 0;
@@ -652,13 +652,14 @@ static int blkio_check_dev_num(dev_t dev)
652} 652}
653 653
654static int blkio_policy_parse_and_set(char *buf, 654static int blkio_policy_parse_and_set(char *buf,
655 struct blkio_policy_node *newpn) 655 struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid)
656{ 656{
657 char *s[4], *p, *major_s = NULL, *minor_s = NULL; 657 char *s[4], *p, *major_s = NULL, *minor_s = NULL;
658 int ret; 658 int ret;
659 unsigned long major, minor, temp; 659 unsigned long major, minor, temp;
660 int i = 0; 660 int i = 0;
661 dev_t dev; 661 dev_t dev;
662 u64 bps, iops;
662 663
663 memset(s, 0, sizeof(s)); 664 memset(s, 0, sizeof(s));
664 665
@@ -705,12 +706,47 @@ static int blkio_policy_parse_and_set(char *buf,
705 if (s[1] == NULL) 706 if (s[1] == NULL)
706 return -EINVAL; 707 return -EINVAL;
707 708
708 ret = strict_strtoul(s[1], 10, &temp); 709 switch (plid) {
709 if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) || 710 case BLKIO_POLICY_PROP:
710 temp > BLKIO_WEIGHT_MAX) 711 ret = strict_strtoul(s[1], 10, &temp);
711 return -EINVAL; 712 if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) ||
713 temp > BLKIO_WEIGHT_MAX)
714 return -EINVAL;
712 715
713 newpn->weight = temp; 716 newpn->plid = plid;
717 newpn->fileid = fileid;
718 newpn->val.weight = temp;
719 break;
720 case BLKIO_POLICY_THROTL:
721 switch(fileid) {
722 case BLKIO_THROTL_read_bps_device:
723 case BLKIO_THROTL_write_bps_device:
724 ret = strict_strtoull(s[1], 10, &bps);
725 if (ret)
726 return -EINVAL;
727
728 newpn->plid = plid;
729 newpn->fileid = fileid;
730 newpn->val.bps = bps;
731 break;
732 case BLKIO_THROTL_read_iops_device:
733 case BLKIO_THROTL_write_iops_device:
734 ret = strict_strtoull(s[1], 10, &iops);
735 if (ret)
736 return -EINVAL;
737
738 if (iops > THROTL_IOPS_MAX)
739 return -EINVAL;
740
741 newpn->plid = plid;
742 newpn->fileid = fileid;
743 newpn->val.iops = (unsigned int)iops;
744 break;
745 }
746 break;
747 default:
748 BUG();
749 }
714 750
715 return 0; 751 return 0;
716} 752}
@@ -720,26 +756,180 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
720{ 756{
721 struct blkio_policy_node *pn; 757 struct blkio_policy_node *pn;
722 758
723 pn = blkio_policy_search_node(blkcg, dev); 759 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP,
760 BLKIO_PROP_weight_device);
724 if (pn) 761 if (pn)
725 return pn->weight; 762 return pn->val.weight;
726 else 763 else
727 return blkcg->weight; 764 return blkcg->weight;
728} 765}
729EXPORT_SYMBOL_GPL(blkcg_get_weight); 766EXPORT_SYMBOL_GPL(blkcg_get_weight);
730 767
768uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
769{
770 struct blkio_policy_node *pn;
771
772 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
773 BLKIO_THROTL_read_bps_device);
774 if (pn)
775 return pn->val.bps;
776 else
777 return -1;
778}
779
780uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev)
781{
782 struct blkio_policy_node *pn;
783 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
784 BLKIO_THROTL_write_bps_device);
785 if (pn)
786 return pn->val.bps;
787 else
788 return -1;
789}
790
791unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev)
792{
793 struct blkio_policy_node *pn;
794
795 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
796 BLKIO_THROTL_read_iops_device);
797 if (pn)
798 return pn->val.iops;
799 else
800 return -1;
801}
802
803unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev)
804{
805 struct blkio_policy_node *pn;
806 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
807 BLKIO_THROTL_write_iops_device);
808 if (pn)
809 return pn->val.iops;
810 else
811 return -1;
812}
813
814/* Checks whether user asked for deleting a policy rule */
815static bool blkio_delete_rule_command(struct blkio_policy_node *pn)
816{
817 switch(pn->plid) {
818 case BLKIO_POLICY_PROP:
819 if (pn->val.weight == 0)
820 return 1;
821 break;
822 case BLKIO_POLICY_THROTL:
823 switch(pn->fileid) {
824 case BLKIO_THROTL_read_bps_device:
825 case BLKIO_THROTL_write_bps_device:
826 if (pn->val.bps == 0)
827 return 1;
828 break;
829 case BLKIO_THROTL_read_iops_device:
830 case BLKIO_THROTL_write_iops_device:
831 if (pn->val.iops == 0)
832 return 1;
833 }
834 break;
835 default:
836 BUG();
837 }
838
839 return 0;
840}
841
842static void blkio_update_policy_rule(struct blkio_policy_node *oldpn,
843 struct blkio_policy_node *newpn)
844{
845 switch(oldpn->plid) {
846 case BLKIO_POLICY_PROP:
847 oldpn->val.weight = newpn->val.weight;
848 break;
849 case BLKIO_POLICY_THROTL:
850 switch(newpn->fileid) {
851 case BLKIO_THROTL_read_bps_device:
852 case BLKIO_THROTL_write_bps_device:
853 oldpn->val.bps = newpn->val.bps;
854 break;
855 case BLKIO_THROTL_read_iops_device:
856 case BLKIO_THROTL_write_iops_device:
857 oldpn->val.iops = newpn->val.iops;
858 }
859 break;
860 default:
861 BUG();
862 }
863}
864
865/*
866 * Some rules/values in blkg have changed. Propogate those to respective
867 * policies.
868 */
869static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg,
870 struct blkio_group *blkg, struct blkio_policy_node *pn)
871{
872 unsigned int weight, iops;
873 u64 bps;
874
875 switch(pn->plid) {
876 case BLKIO_POLICY_PROP:
877 weight = pn->val.weight ? pn->val.weight :
878 blkcg->weight;
879 blkio_update_group_weight(blkg, weight);
880 break;
881 case BLKIO_POLICY_THROTL:
882 switch(pn->fileid) {
883 case BLKIO_THROTL_read_bps_device:
884 case BLKIO_THROTL_write_bps_device:
885 bps = pn->val.bps ? pn->val.bps : (-1);
886 blkio_update_group_bps(blkg, bps, pn->fileid);
887 break;
888 case BLKIO_THROTL_read_iops_device:
889 case BLKIO_THROTL_write_iops_device:
890 iops = pn->val.iops ? pn->val.iops : (-1);
891 blkio_update_group_iops(blkg, iops, pn->fileid);
892 break;
893 }
894 break;
895 default:
896 BUG();
897 }
898}
899
900/*
901 * A policy node rule has been updated. Propogate this update to all the
902 * block groups which might be affected by this update.
903 */
904static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg,
905 struct blkio_policy_node *pn)
906{
907 struct blkio_group *blkg;
908 struct hlist_node *n;
909
910 spin_lock(&blkio_list_lock);
911 spin_lock_irq(&blkcg->lock);
912
913 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
914 if (pn->dev != blkg->dev || pn->plid != blkg->plid)
915 continue;
916 blkio_update_blkg_policy(blkcg, blkg, pn);
917 }
918
919 spin_unlock_irq(&blkcg->lock);
920 spin_unlock(&blkio_list_lock);
921}
731 922
732static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, 923static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
733 const char *buffer) 924 const char *buffer)
734{ 925{
735 int ret = 0; 926 int ret = 0;
736 char *buf; 927 char *buf;
737 struct blkio_policy_node *newpn, *pn; 928 struct blkio_policy_node *newpn, *pn;
738 struct blkio_cgroup *blkcg; 929 struct blkio_cgroup *blkcg;
739 struct blkio_group *blkg;
740 int keep_newpn = 0; 930 int keep_newpn = 0;
741 struct hlist_node *n; 931 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
742 struct blkio_policy_type *blkiop; 932 int fileid = BLKIOFILE_ATTR(cft->private);
743 933
744 buf = kstrdup(buffer, GFP_KERNEL); 934 buf = kstrdup(buffer, GFP_KERNEL);
745 if (!buf) 935 if (!buf)
@@ -751,7 +941,7 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
751 goto free_buf; 941 goto free_buf;
752 } 942 }
753 943
754 ret = blkio_policy_parse_and_set(buf, newpn); 944 ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid);
755 if (ret) 945 if (ret)
756 goto free_newpn; 946 goto free_newpn;
757 947
@@ -759,9 +949,9 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
759 949
760 spin_lock_irq(&blkcg->lock); 950 spin_lock_irq(&blkcg->lock);
761 951
762 pn = blkio_policy_search_node(blkcg, newpn->dev); 952 pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid);
763 if (!pn) { 953 if (!pn) {
764 if (newpn->weight != 0) { 954 if (!blkio_delete_rule_command(newpn)) {
765 blkio_policy_insert_node(blkcg, newpn); 955 blkio_policy_insert_node(blkcg, newpn);
766 keep_newpn = 1; 956 keep_newpn = 1;
767 } 957 }
@@ -769,33 +959,17 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
769 goto update_io_group; 959 goto update_io_group;
770 } 960 }
771 961
772 if (newpn->weight == 0) { 962 if (blkio_delete_rule_command(newpn)) {
773 /* weight == 0 means deleteing a specific weight */
774 blkio_policy_delete_node(pn); 963 blkio_policy_delete_node(pn);
775 spin_unlock_irq(&blkcg->lock); 964 spin_unlock_irq(&blkcg->lock);
776 goto update_io_group; 965 goto update_io_group;
777 } 966 }
778 spin_unlock_irq(&blkcg->lock); 967 spin_unlock_irq(&blkcg->lock);
779 968
780 pn->weight = newpn->weight; 969 blkio_update_policy_rule(pn, newpn);
781 970
782update_io_group: 971update_io_group:
783 /* update weight for each cfqg */ 972 blkio_update_policy_node_blkg(blkcg, newpn);
784 spin_lock(&blkio_list_lock);
785 spin_lock_irq(&blkcg->lock);
786
787 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
788 if (newpn->dev == blkg->dev) {
789 list_for_each_entry(blkiop, &blkio_list, list)
790 blkiop->ops.blkio_update_group_weight_fn(blkg,
791 newpn->weight ?
792 newpn->weight :
793 blkcg->weight);
794 }
795 }
796
797 spin_unlock_irq(&blkcg->lock);
798 spin_unlock(&blkio_list_lock);
799 973
800free_newpn: 974free_newpn:
801 if (!keep_newpn) 975 if (!keep_newpn)
@@ -805,23 +979,256 @@ free_buf:
805 return ret; 979 return ret;
806} 980}
807 981
808static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft, 982static void
809 struct seq_file *m) 983blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn)
810{ 984{
811 struct blkio_cgroup *blkcg; 985 switch(pn->plid) {
812 struct blkio_policy_node *pn; 986 case BLKIO_POLICY_PROP:
987 if (pn->fileid == BLKIO_PROP_weight_device)
988 seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
989 MINOR(pn->dev), pn->val.weight);
990 break;
991 case BLKIO_POLICY_THROTL:
992 switch(pn->fileid) {
993 case BLKIO_THROTL_read_bps_device:
994 case BLKIO_THROTL_write_bps_device:
995 seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev),
996 MINOR(pn->dev), pn->val.bps);
997 break;
998 case BLKIO_THROTL_read_iops_device:
999 case BLKIO_THROTL_write_iops_device:
1000 seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
1001 MINOR(pn->dev), pn->val.iops);
1002 break;
1003 }
1004 break;
1005 default:
1006 BUG();
1007 }
1008}
813 1009
814 seq_printf(m, "dev\tweight\n"); 1010/* cgroup files which read their data from policy nodes end up here */
1011static void blkio_read_policy_node_files(struct cftype *cft,
1012 struct blkio_cgroup *blkcg, struct seq_file *m)
1013{
1014 struct blkio_policy_node *pn;
815 1015
816 blkcg = cgroup_to_blkio_cgroup(cgrp);
817 if (!list_empty(&blkcg->policy_list)) { 1016 if (!list_empty(&blkcg->policy_list)) {
818 spin_lock_irq(&blkcg->lock); 1017 spin_lock_irq(&blkcg->lock);
819 list_for_each_entry(pn, &blkcg->policy_list, node) { 1018 list_for_each_entry(pn, &blkcg->policy_list, node) {
820 seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), 1019 if (!pn_matches_cftype(cft, pn))
821 MINOR(pn->dev), pn->weight); 1020 continue;
1021 blkio_print_policy_node(m, pn);
822 } 1022 }
823 spin_unlock_irq(&blkcg->lock); 1023 spin_unlock_irq(&blkcg->lock);
824 } 1024 }
1025}
1026
1027static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
1028 struct seq_file *m)
1029{
1030 struct blkio_cgroup *blkcg;
1031 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1032 int name = BLKIOFILE_ATTR(cft->private);
1033
1034 blkcg = cgroup_to_blkio_cgroup(cgrp);
1035
1036 switch(plid) {
1037 case BLKIO_POLICY_PROP:
1038 switch(name) {
1039 case BLKIO_PROP_weight_device:
1040 blkio_read_policy_node_files(cft, blkcg, m);
1041 return 0;
1042 default:
1043 BUG();
1044 }
1045 break;
1046 case BLKIO_POLICY_THROTL:
1047 switch(name){
1048 case BLKIO_THROTL_read_bps_device:
1049 case BLKIO_THROTL_write_bps_device:
1050 case BLKIO_THROTL_read_iops_device:
1051 case BLKIO_THROTL_write_iops_device:
1052 blkio_read_policy_node_files(cft, blkcg, m);
1053 return 0;
1054 default:
1055 BUG();
1056 }
1057 break;
1058 default:
1059 BUG();
1060 }
1061
1062 return 0;
1063}
1064
1065static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
1066 struct cftype *cft, struct cgroup_map_cb *cb, enum stat_type type,
1067 bool show_total)
1068{
1069 struct blkio_group *blkg;
1070 struct hlist_node *n;
1071 uint64_t cgroup_total = 0;
1072
1073 rcu_read_lock();
1074 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
1075 if (blkg->dev) {
1076 if (!cftype_blkg_same_policy(cft, blkg))
1077 continue;
1078 spin_lock_irq(&blkg->stats_lock);
1079 cgroup_total += blkio_get_stat(blkg, cb, blkg->dev,
1080 type);
1081 spin_unlock_irq(&blkg->stats_lock);
1082 }
1083 }
1084 if (show_total)
1085 cb->fill(cb, "Total", cgroup_total);
1086 rcu_read_unlock();
1087 return 0;
1088}
1089
1090/* All map kind of cgroup file get serviced by this function */
1091static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
1092 struct cgroup_map_cb *cb)
1093{
1094 struct blkio_cgroup *blkcg;
1095 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1096 int name = BLKIOFILE_ATTR(cft->private);
1097
1098 blkcg = cgroup_to_blkio_cgroup(cgrp);
1099
1100 switch(plid) {
1101 case BLKIO_POLICY_PROP:
1102 switch(name) {
1103 case BLKIO_PROP_time:
1104 return blkio_read_blkg_stats(blkcg, cft, cb,
1105 BLKIO_STAT_TIME, 0);
1106 case BLKIO_PROP_sectors:
1107 return blkio_read_blkg_stats(blkcg, cft, cb,
1108 BLKIO_STAT_SECTORS, 0);
1109 case BLKIO_PROP_io_service_bytes:
1110 return blkio_read_blkg_stats(blkcg, cft, cb,
1111 BLKIO_STAT_SERVICE_BYTES, 1);
1112 case BLKIO_PROP_io_serviced:
1113 return blkio_read_blkg_stats(blkcg, cft, cb,
1114 BLKIO_STAT_SERVICED, 1);
1115 case BLKIO_PROP_io_service_time:
1116 return blkio_read_blkg_stats(blkcg, cft, cb,
1117 BLKIO_STAT_SERVICE_TIME, 1);
1118 case BLKIO_PROP_io_wait_time:
1119 return blkio_read_blkg_stats(blkcg, cft, cb,
1120 BLKIO_STAT_WAIT_TIME, 1);
1121 case BLKIO_PROP_io_merged:
1122 return blkio_read_blkg_stats(blkcg, cft, cb,
1123 BLKIO_STAT_MERGED, 1);
1124 case BLKIO_PROP_io_queued:
1125 return blkio_read_blkg_stats(blkcg, cft, cb,
1126 BLKIO_STAT_QUEUED, 1);
1127#ifdef CONFIG_DEBUG_BLK_CGROUP
1128 case BLKIO_PROP_dequeue:
1129 return blkio_read_blkg_stats(blkcg, cft, cb,
1130 BLKIO_STAT_DEQUEUE, 0);
1131 case BLKIO_PROP_avg_queue_size:
1132 return blkio_read_blkg_stats(blkcg, cft, cb,
1133 BLKIO_STAT_AVG_QUEUE_SIZE, 0);
1134 case BLKIO_PROP_group_wait_time:
1135 return blkio_read_blkg_stats(blkcg, cft, cb,
1136 BLKIO_STAT_GROUP_WAIT_TIME, 0);
1137 case BLKIO_PROP_idle_time:
1138 return blkio_read_blkg_stats(blkcg, cft, cb,
1139 BLKIO_STAT_IDLE_TIME, 0);
1140 case BLKIO_PROP_empty_time:
1141 return blkio_read_blkg_stats(blkcg, cft, cb,
1142 BLKIO_STAT_EMPTY_TIME, 0);
1143#endif
1144 default:
1145 BUG();
1146 }
1147 break;
1148 case BLKIO_POLICY_THROTL:
1149 switch(name){
1150 case BLKIO_THROTL_io_service_bytes:
1151 return blkio_read_blkg_stats(blkcg, cft, cb,
1152 BLKIO_STAT_SERVICE_BYTES, 1);
1153 case BLKIO_THROTL_io_serviced:
1154 return blkio_read_blkg_stats(blkcg, cft, cb,
1155 BLKIO_STAT_SERVICED, 1);
1156 default:
1157 BUG();
1158 }
1159 break;
1160 default:
1161 BUG();
1162 }
1163
1164 return 0;
1165}
1166
1167static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val)
1168{
1169 struct blkio_group *blkg;
1170 struct hlist_node *n;
1171 struct blkio_policy_node *pn;
1172
1173 if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
1174 return -EINVAL;
1175
1176 spin_lock(&blkio_list_lock);
1177 spin_lock_irq(&blkcg->lock);
1178 blkcg->weight = (unsigned int)val;
1179
1180 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
1181 pn = blkio_policy_search_node(blkcg, blkg->dev,
1182 BLKIO_POLICY_PROP, BLKIO_PROP_weight_device);
1183 if (pn)
1184 continue;
1185
1186 blkio_update_group_weight(blkg, blkcg->weight);
1187 }
1188 spin_unlock_irq(&blkcg->lock);
1189 spin_unlock(&blkio_list_lock);
1190 return 0;
1191}
1192
1193static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
1194 struct blkio_cgroup *blkcg;
1195 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1196 int name = BLKIOFILE_ATTR(cft->private);
1197
1198 blkcg = cgroup_to_blkio_cgroup(cgrp);
1199
1200 switch(plid) {
1201 case BLKIO_POLICY_PROP:
1202 switch(name) {
1203 case BLKIO_PROP_weight:
1204 return (u64)blkcg->weight;
1205 }
1206 break;
1207 default:
1208 BUG();
1209 }
1210 return 0;
1211}
1212
1213static int
1214blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1215{
1216 struct blkio_cgroup *blkcg;
1217 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1218 int name = BLKIOFILE_ATTR(cft->private);
1219
1220 blkcg = cgroup_to_blkio_cgroup(cgrp);
1221
1222 switch(plid) {
1223 case BLKIO_POLICY_PROP:
1224 switch(name) {
1225 case BLKIO_PROP_weight:
1226 return blkio_weight_write(blkcg, val);
1227 }
1228 break;
1229 default:
1230 BUG();
1231 }
825 1232
826 return 0; 1233 return 0;
827} 1234}
@@ -829,71 +1236,151 @@ static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft,
829struct cftype blkio_files[] = { 1236struct cftype blkio_files[] = {
830 { 1237 {
831 .name = "weight_device", 1238 .name = "weight_device",
832 .read_seq_string = blkiocg_weight_device_read, 1239 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
833 .write_string = blkiocg_weight_device_write, 1240 BLKIO_PROP_weight_device),
1241 .read_seq_string = blkiocg_file_read,
1242 .write_string = blkiocg_file_write,
834 .max_write_len = 256, 1243 .max_write_len = 256,
835 }, 1244 },
836 { 1245 {
837 .name = "weight", 1246 .name = "weight",
838 .read_u64 = blkiocg_weight_read, 1247 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
839 .write_u64 = blkiocg_weight_write, 1248 BLKIO_PROP_weight),
1249 .read_u64 = blkiocg_file_read_u64,
1250 .write_u64 = blkiocg_file_write_u64,
840 }, 1251 },
841 { 1252 {
842 .name = "time", 1253 .name = "time",
843 .read_map = blkiocg_time_read, 1254 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1255 BLKIO_PROP_time),
1256 .read_map = blkiocg_file_read_map,
844 }, 1257 },
845 { 1258 {
846 .name = "sectors", 1259 .name = "sectors",
847 .read_map = blkiocg_sectors_read, 1260 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1261 BLKIO_PROP_sectors),
1262 .read_map = blkiocg_file_read_map,
848 }, 1263 },
849 { 1264 {
850 .name = "io_service_bytes", 1265 .name = "io_service_bytes",
851 .read_map = blkiocg_io_service_bytes_read, 1266 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1267 BLKIO_PROP_io_service_bytes),
1268 .read_map = blkiocg_file_read_map,
852 }, 1269 },
853 { 1270 {
854 .name = "io_serviced", 1271 .name = "io_serviced",
855 .read_map = blkiocg_io_serviced_read, 1272 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1273 BLKIO_PROP_io_serviced),
1274 .read_map = blkiocg_file_read_map,
856 }, 1275 },
857 { 1276 {
858 .name = "io_service_time", 1277 .name = "io_service_time",
859 .read_map = blkiocg_io_service_time_read, 1278 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1279 BLKIO_PROP_io_service_time),
1280 .read_map = blkiocg_file_read_map,
860 }, 1281 },
861 { 1282 {
862 .name = "io_wait_time", 1283 .name = "io_wait_time",
863 .read_map = blkiocg_io_wait_time_read, 1284 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1285 BLKIO_PROP_io_wait_time),
1286 .read_map = blkiocg_file_read_map,
864 }, 1287 },
865 { 1288 {
866 .name = "io_merged", 1289 .name = "io_merged",
867 .read_map = blkiocg_io_merged_read, 1290 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1291 BLKIO_PROP_io_merged),
1292 .read_map = blkiocg_file_read_map,
868 }, 1293 },
869 { 1294 {
870 .name = "io_queued", 1295 .name = "io_queued",
871 .read_map = blkiocg_io_queued_read, 1296 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1297 BLKIO_PROP_io_queued),
1298 .read_map = blkiocg_file_read_map,
872 }, 1299 },
873 { 1300 {
874 .name = "reset_stats", 1301 .name = "reset_stats",
875 .write_u64 = blkiocg_reset_stats, 1302 .write_u64 = blkiocg_reset_stats,
876 }, 1303 },
1304#ifdef CONFIG_BLK_DEV_THROTTLING
1305 {
1306 .name = "throttle.read_bps_device",
1307 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1308 BLKIO_THROTL_read_bps_device),
1309 .read_seq_string = blkiocg_file_read,
1310 .write_string = blkiocg_file_write,
1311 .max_write_len = 256,
1312 },
1313
1314 {
1315 .name = "throttle.write_bps_device",
1316 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1317 BLKIO_THROTL_write_bps_device),
1318 .read_seq_string = blkiocg_file_read,
1319 .write_string = blkiocg_file_write,
1320 .max_write_len = 256,
1321 },
1322
1323 {
1324 .name = "throttle.read_iops_device",
1325 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1326 BLKIO_THROTL_read_iops_device),
1327 .read_seq_string = blkiocg_file_read,
1328 .write_string = blkiocg_file_write,
1329 .max_write_len = 256,
1330 },
1331
1332 {
1333 .name = "throttle.write_iops_device",
1334 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1335 BLKIO_THROTL_write_iops_device),
1336 .read_seq_string = blkiocg_file_read,
1337 .write_string = blkiocg_file_write,
1338 .max_write_len = 256,
1339 },
1340 {
1341 .name = "throttle.io_service_bytes",
1342 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1343 BLKIO_THROTL_io_service_bytes),
1344 .read_map = blkiocg_file_read_map,
1345 },
1346 {
1347 .name = "throttle.io_serviced",
1348 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1349 BLKIO_THROTL_io_serviced),
1350 .read_map = blkiocg_file_read_map,
1351 },
1352#endif /* CONFIG_BLK_DEV_THROTTLING */
1353
877#ifdef CONFIG_DEBUG_BLK_CGROUP 1354#ifdef CONFIG_DEBUG_BLK_CGROUP
878 { 1355 {
879 .name = "avg_queue_size", 1356 .name = "avg_queue_size",
880 .read_map = blkiocg_avg_queue_size_read, 1357 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1358 BLKIO_PROP_avg_queue_size),
1359 .read_map = blkiocg_file_read_map,
881 }, 1360 },
882 { 1361 {
883 .name = "group_wait_time", 1362 .name = "group_wait_time",
884 .read_map = blkiocg_group_wait_time_read, 1363 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1364 BLKIO_PROP_group_wait_time),
1365 .read_map = blkiocg_file_read_map,
885 }, 1366 },
886 { 1367 {
887 .name = "idle_time", 1368 .name = "idle_time",
888 .read_map = blkiocg_idle_time_read, 1369 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1370 BLKIO_PROP_idle_time),
1371 .read_map = blkiocg_file_read_map,
889 }, 1372 },
890 { 1373 {
891 .name = "empty_time", 1374 .name = "empty_time",
892 .read_map = blkiocg_empty_time_read, 1375 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1376 BLKIO_PROP_empty_time),
1377 .read_map = blkiocg_file_read_map,
893 }, 1378 },
894 { 1379 {
895 .name = "dequeue", 1380 .name = "dequeue",
896 .read_map = blkiocg_dequeue_read, 1381 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1382 BLKIO_PROP_dequeue),
1383 .read_map = blkiocg_file_read_map,
897 }, 1384 },
898#endif 1385#endif
899}; 1386};
@@ -932,13 +1419,14 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
932 /* 1419 /*
933 * This blkio_group is being unlinked as associated cgroup is 1420 * This blkio_group is being unlinked as associated cgroup is
934 * going away. Let all the IO controlling policies know about 1421 * going away. Let all the IO controlling policies know about
935 * this event. Currently this is static call to one io 1422 * this event.
936 * controlling policy. Once we have more policies in place, we
937 * need some dynamic registration of callback function.
938 */ 1423 */
939 spin_lock(&blkio_list_lock); 1424 spin_lock(&blkio_list_lock);
940 list_for_each_entry(blkiop, &blkio_list, list) 1425 list_for_each_entry(blkiop, &blkio_list, list) {
1426 if (blkiop->plid != blkg->plid)
1427 continue;
941 blkiop->ops.blkio_unlink_group_fn(key, blkg); 1428 blkiop->ops.blkio_unlink_group_fn(key, blkg);
1429 }
942 spin_unlock(&blkio_list_lock); 1430 spin_unlock(&blkio_list_lock);
943 } while (1); 1431 } while (1);
944 1432
@@ -966,7 +1454,7 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
966 1454
967 /* Currently we do not support hierarchy deeper than two level (0,1) */ 1455 /* Currently we do not support hierarchy deeper than two level (0,1) */
968 if (parent != cgroup->top_cgroup) 1456 if (parent != cgroup->top_cgroup)
969 return ERR_PTR(-EINVAL); 1457 return ERR_PTR(-EPERM);
970 1458
971 blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); 1459 blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
972 if (!blkcg) 1460 if (!blkcg)
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 2b866ec1dcea..ea4861bdd549 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -15,6 +15,14 @@
15 15
16#include <linux/cgroup.h> 16#include <linux/cgroup.h>
17 17
18enum blkio_policy_id {
19 BLKIO_POLICY_PROP = 0, /* Proportional Bandwidth division */
20 BLKIO_POLICY_THROTL, /* Throttling */
21};
22
23/* Max limits for throttle policy */
24#define THROTL_IOPS_MAX UINT_MAX
25
18#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) 26#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
19 27
20#ifndef CONFIG_BLK_CGROUP 28#ifndef CONFIG_BLK_CGROUP
@@ -65,6 +73,35 @@ enum blkg_state_flags {
65 BLKG_empty, 73 BLKG_empty,
66}; 74};
67 75
76/* cgroup files owned by proportional weight policy */
77enum blkcg_file_name_prop {
78 BLKIO_PROP_weight = 1,
79 BLKIO_PROP_weight_device,
80 BLKIO_PROP_io_service_bytes,
81 BLKIO_PROP_io_serviced,
82 BLKIO_PROP_time,
83 BLKIO_PROP_sectors,
84 BLKIO_PROP_io_service_time,
85 BLKIO_PROP_io_wait_time,
86 BLKIO_PROP_io_merged,
87 BLKIO_PROP_io_queued,
88 BLKIO_PROP_avg_queue_size,
89 BLKIO_PROP_group_wait_time,
90 BLKIO_PROP_idle_time,
91 BLKIO_PROP_empty_time,
92 BLKIO_PROP_dequeue,
93};
94
95/* cgroup files owned by throttle policy */
96enum blkcg_file_name_throtl {
97 BLKIO_THROTL_read_bps_device,
98 BLKIO_THROTL_write_bps_device,
99 BLKIO_THROTL_read_iops_device,
100 BLKIO_THROTL_write_iops_device,
101 BLKIO_THROTL_io_service_bytes,
102 BLKIO_THROTL_io_serviced,
103};
104
68struct blkio_cgroup { 105struct blkio_cgroup {
69 struct cgroup_subsys_state css; 106 struct cgroup_subsys_state css;
70 unsigned int weight; 107 unsigned int weight;
@@ -112,6 +149,8 @@ struct blkio_group {
112 char path[128]; 149 char path[128];
113 /* The device MKDEV(major, minor), this group has been created for */ 150 /* The device MKDEV(major, minor), this group has been created for */
114 dev_t dev; 151 dev_t dev;
152 /* policy which owns this blk group */
153 enum blkio_policy_id plid;
115 154
116 /* Need to serialize the stats in the case of reset/update */ 155 /* Need to serialize the stats in the case of reset/update */
117 spinlock_t stats_lock; 156 spinlock_t stats_lock;
@@ -121,24 +160,60 @@ struct blkio_group {
121struct blkio_policy_node { 160struct blkio_policy_node {
122 struct list_head node; 161 struct list_head node;
123 dev_t dev; 162 dev_t dev;
124 unsigned int weight; 163 /* This node belongs to max bw policy or porportional weight policy */
164 enum blkio_policy_id plid;
165 /* cgroup file to which this rule belongs to */
166 int fileid;
167
168 union {
169 unsigned int weight;
170 /*
171 * Rate read/write in terms of byptes per second
172 * Whether this rate represents read or write is determined
173 * by file type "fileid".
174 */
175 u64 bps;
176 unsigned int iops;
177 } val;
125}; 178};
126 179
127extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, 180extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
128 dev_t dev); 181 dev_t dev);
182extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg,
183 dev_t dev);
184extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg,
185 dev_t dev);
186extern unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg,
187 dev_t dev);
188extern unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg,
189 dev_t dev);
129 190
130typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); 191typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg);
131typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg, 192
132 unsigned int weight); 193typedef void (blkio_update_group_weight_fn) (void *key,
194 struct blkio_group *blkg, unsigned int weight);
195typedef void (blkio_update_group_read_bps_fn) (void * key,
196 struct blkio_group *blkg, u64 read_bps);
197typedef void (blkio_update_group_write_bps_fn) (void *key,
198 struct blkio_group *blkg, u64 write_bps);
199typedef void (blkio_update_group_read_iops_fn) (void *key,
200 struct blkio_group *blkg, unsigned int read_iops);
201typedef void (blkio_update_group_write_iops_fn) (void *key,
202 struct blkio_group *blkg, unsigned int write_iops);
133 203
134struct blkio_policy_ops { 204struct blkio_policy_ops {
135 blkio_unlink_group_fn *blkio_unlink_group_fn; 205 blkio_unlink_group_fn *blkio_unlink_group_fn;
136 blkio_update_group_weight_fn *blkio_update_group_weight_fn; 206 blkio_update_group_weight_fn *blkio_update_group_weight_fn;
207 blkio_update_group_read_bps_fn *blkio_update_group_read_bps_fn;
208 blkio_update_group_write_bps_fn *blkio_update_group_write_bps_fn;
209 blkio_update_group_read_iops_fn *blkio_update_group_read_iops_fn;
210 blkio_update_group_write_iops_fn *blkio_update_group_write_iops_fn;
137}; 211};
138 212
139struct blkio_policy_type { 213struct blkio_policy_type {
140 struct list_head list; 214 struct list_head list;
141 struct blkio_policy_ops ops; 215 struct blkio_policy_ops ops;
216 enum blkio_policy_id plid;
142}; 217};
143 218
144/* Blkio controller policy registration */ 219/* Blkio controller policy registration */
@@ -212,7 +287,8 @@ static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {}
212extern struct blkio_cgroup blkio_root_cgroup; 287extern struct blkio_cgroup blkio_root_cgroup;
213extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); 288extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
214extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 289extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
215 struct blkio_group *blkg, void *key, dev_t dev); 290 struct blkio_group *blkg, void *key, dev_t dev,
291 enum blkio_policy_id plid);
216extern int blkiocg_del_blkio_group(struct blkio_group *blkg); 292extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
217extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, 293extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
218 void *key); 294 void *key);
@@ -234,7 +310,8 @@ static inline struct blkio_cgroup *
234cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } 310cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
235 311
236static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 312static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
237 struct blkio_group *blkg, void *key, dev_t dev) {} 313 struct blkio_group *blkg, void *key, dev_t dev,
314 enum blkio_policy_id plid) {}
238 315
239static inline int 316static inline int
240blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } 317blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
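
The header changes above give struct blkio_policy_type a plid and grow blkio_policy_ops with per-direction bps/iops callbacks, all keyed by the (void *key, blkg) pair. A hedged sketch of how a throttling policy would presumably register against this interface (blk-throttle.c itself is outside this section, so the names and bodies here are placeholders):

/* Sketch of a policy registration against the extended ops table.
 * Function bodies are placeholders; the real ones live in
 * blk-throttle.c, which is not shown in this section. */
#include <linux/module.h>
#include "blk-cgroup.h"

static void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg)
{
	/* drop the group when its cgroup goes away */
}

static void throtl_update_blkio_group_read_bps(void *key,
		struct blkio_group *blkg, u64 read_bps)
{
	/* apply the new read bytes-per-second limit to this group */
}

static struct blkio_policy_type blkio_policy_throtl = {
	.ops = {
		.blkio_unlink_group_fn = throtl_unlink_blkio_group,
		.blkio_update_group_read_bps_fn =
					throtl_update_blkio_group_read_bps,
	},
	.plid = BLKIO_POLICY_THROTL,
};

static int __init throtl_init(void)
{
	blkio_policy_register(&blkio_policy_throtl);
	return 0;
}

module_init(throtl_init);
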
diff --git a/block/blk-core.c b/block/blk-core.c
index 77411486b111..881fe44ec7da 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -64,13 +64,15 @@ static void drive_stat_acct(struct request *rq, int new_io)
64 return; 64 return;
65 65
66 cpu = part_stat_lock(); 66 cpu = part_stat_lock();
67 part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
68 67
69 if (!new_io) 68 if (!new_io) {
69 part = rq->part;
70 part_stat_inc(cpu, part, merges[rw]); 70 part_stat_inc(cpu, part, merges[rw]);
71 else { 71 } else {
72 part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
72 part_round_stats(cpu, part); 73 part_round_stats(cpu, part);
73 part_inc_in_flight(part, rw); 74 part_inc_in_flight(part, rw);
75 rq->part = part;
74 } 76 }
75 77
76 part_stat_unlock(); 78 part_stat_unlock();
@@ -128,6 +130,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
128 rq->ref_count = 1; 130 rq->ref_count = 1;
129 rq->start_time = jiffies; 131 rq->start_time = jiffies;
130 set_start_time_ns(rq); 132 set_start_time_ns(rq);
133 rq->part = NULL;
131} 134}
132EXPORT_SYMBOL(blk_rq_init); 135EXPORT_SYMBOL(blk_rq_init);
133 136
@@ -136,7 +139,7 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
136{ 139{
137 struct request_queue *q = rq->q; 140 struct request_queue *q = rq->q;
138 141
139 if (&q->bar_rq != rq) { 142 if (&q->flush_rq != rq) {
140 if (error) 143 if (error)
141 clear_bit(BIO_UPTODATE, &bio->bi_flags); 144 clear_bit(BIO_UPTODATE, &bio->bi_flags);
142 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 145 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
@@ -160,13 +163,12 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
160 if (bio->bi_size == 0) 163 if (bio->bi_size == 0)
161 bio_endio(bio, error); 164 bio_endio(bio, error);
162 } else { 165 } else {
163
164 /* 166 /*
165 * Okay, this is the barrier request in progress, just 167 * Okay, this is the sequenced flush request in
166 * record the error; 168 * progress, just record the error;
167 */ 169 */
168 if (error && !q->orderr) 170 if (error && !q->flush_err)
169 q->orderr = error; 171 q->flush_err = error;
170 } 172 }
171} 173}
172 174
@@ -382,6 +384,7 @@ void blk_sync_queue(struct request_queue *q)
382 del_timer_sync(&q->unplug_timer); 384 del_timer_sync(&q->unplug_timer);
383 del_timer_sync(&q->timeout); 385 del_timer_sync(&q->timeout);
384 cancel_work_sync(&q->unplug_work); 386 cancel_work_sync(&q->unplug_work);
387 throtl_shutdown_timer_wq(q);
385} 388}
386EXPORT_SYMBOL(blk_sync_queue); 389EXPORT_SYMBOL(blk_sync_queue);
387 390
@@ -515,11 +518,17 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
515 return NULL; 518 return NULL;
516 } 519 }
517 520
521 if (blk_throtl_init(q)) {
522 kmem_cache_free(blk_requestq_cachep, q);
523 return NULL;
524 }
525
518 setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, 526 setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
519 laptop_mode_timer_fn, (unsigned long) q); 527 laptop_mode_timer_fn, (unsigned long) q);
520 init_timer(&q->unplug_timer); 528 init_timer(&q->unplug_timer);
521 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); 529 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
522 INIT_LIST_HEAD(&q->timeout_list); 530 INIT_LIST_HEAD(&q->timeout_list);
531 INIT_LIST_HEAD(&q->pending_flushes);
523 INIT_WORK(&q->unplug_work, blk_unplug_work); 532 INIT_WORK(&q->unplug_work, blk_unplug_work);
524 533
525 kobject_init(&q->kobj, &blk_queue_ktype); 534 kobject_init(&q->kobj, &blk_queue_ktype);
@@ -796,11 +805,16 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
796 rl->starved[is_sync] = 0; 805 rl->starved[is_sync] = 0;
797 806
798 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); 807 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
799 if (priv) 808 if (priv) {
800 rl->elvpriv++; 809 rl->elvpriv++;
801 810
802 if (blk_queue_io_stat(q)) 811 /*
803 rw_flags |= REQ_IO_STAT; 812 * Don't do stats for non-priv requests
813 */
814 if (blk_queue_io_stat(q))
815 rw_flags |= REQ_IO_STAT;
816 }
817
804 spin_unlock_irq(q->queue_lock); 818 spin_unlock_irq(q->queue_lock);
805 819
806 rq = blk_alloc_request(q, rw_flags, priv, gfp_mask); 820 rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);
@@ -1037,22 +1051,6 @@ void blk_insert_request(struct request_queue *q, struct request *rq,
1037} 1051}
1038EXPORT_SYMBOL(blk_insert_request); 1052EXPORT_SYMBOL(blk_insert_request);
1039 1053
1040/*
1041 * add-request adds a request to the linked list.
1042 * queue lock is held and interrupts disabled, as we muck with the
1043 * request queue list.
1044 */
1045static inline void add_request(struct request_queue *q, struct request *req)
1046{
1047 drive_stat_acct(req, 1);
1048
1049 /*
1050 * elevator indicated where it wants this request to be
1051 * inserted at elevator_merge time
1052 */
1053 __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
1054}
1055
1056static void part_round_stats_single(int cpu, struct hd_struct *part, 1054static void part_round_stats_single(int cpu, struct hd_struct *part,
1057 unsigned long now) 1055 unsigned long now)
1058{ 1056{
@@ -1198,16 +1196,19 @@ static int __make_request(struct request_queue *q, struct bio *bio)
1198 int el_ret; 1196 int el_ret;
1199 unsigned int bytes = bio->bi_size; 1197 unsigned int bytes = bio->bi_size;
1200 const unsigned short prio = bio_prio(bio); 1198 const unsigned short prio = bio_prio(bio);
1201 const bool sync = (bio->bi_rw & REQ_SYNC); 1199 const bool sync = !!(bio->bi_rw & REQ_SYNC);
1202 const bool unplug = (bio->bi_rw & REQ_UNPLUG); 1200 const bool unplug = !!(bio->bi_rw & REQ_UNPLUG);
1203 const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK; 1201 const unsigned long ff = bio->bi_rw & REQ_FAILFAST_MASK;
1202 int where = ELEVATOR_INSERT_SORT;
1204 int rw_flags; 1203 int rw_flags;
1205 1204
1206 if ((bio->bi_rw & REQ_HARDBARRIER) && 1205 /* REQ_HARDBARRIER is no more */
1207 (q->next_ordered == QUEUE_ORDERED_NONE)) { 1206 if (WARN_ONCE(bio->bi_rw & REQ_HARDBARRIER,
1207 "block: HARDBARRIER is deprecated, use FLUSH/FUA instead\n")) {
1208 bio_endio(bio, -EOPNOTSUPP); 1208 bio_endio(bio, -EOPNOTSUPP);
1209 return 0; 1209 return 0;
1210 } 1210 }
1211
1211 /* 1212 /*
1212 * low level driver can indicate that it wants pages above a 1213 * low level driver can indicate that it wants pages above a
1213 * certain limit bounced to low memory (ie for highmem, or even 1214 * certain limit bounced to low memory (ie for highmem, or even
@@ -1217,7 +1218,12 @@ static int __make_request(struct request_queue *q, struct bio *bio)
1217 1218
1218 spin_lock_irq(q->queue_lock); 1219 spin_lock_irq(q->queue_lock);
1219 1220
1220 if (unlikely((bio->bi_rw & REQ_HARDBARRIER)) || elv_queue_empty(q)) 1221 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
1222 where = ELEVATOR_INSERT_FRONT;
1223 goto get_rq;
1224 }
1225
1226 if (elv_queue_empty(q))
1221 goto get_rq; 1227 goto get_rq;
1222 1228
1223 el_ret = elv_merge(q, &req, bio); 1229 el_ret = elv_merge(q, &req, bio);
@@ -1314,7 +1320,10 @@ get_rq:
1314 req->cpu = blk_cpu_to_group(smp_processor_id()); 1320 req->cpu = blk_cpu_to_group(smp_processor_id());
1315 if (queue_should_plug(q) && elv_queue_empty(q)) 1321 if (queue_should_plug(q) && elv_queue_empty(q))
1316 blk_plug_device(q); 1322 blk_plug_device(q);
1317 add_request(q, req); 1323
1324 /* insert the request into the elevator */
1325 drive_stat_acct(req, 1);
1326 __elv_add_request(q, req, where, 0);
1318out: 1327out:
1319 if (unplug || !queue_should_plug(q)) 1328 if (unplug || !queue_should_plug(q))
1320 __generic_unplug_device(q); 1329 __generic_unplug_device(q);
@@ -1514,11 +1523,36 @@ static inline void __generic_make_request(struct bio *bio)
1514 if (bio_check_eod(bio, nr_sectors)) 1523 if (bio_check_eod(bio, nr_sectors))
1515 goto end_io; 1524 goto end_io;
1516 1525
1517 if ((bio->bi_rw & REQ_DISCARD) && !blk_queue_discard(q)) { 1526 /*
1527 * Filter flush bios early so that make_request-based
1528 * drivers without flush support don't have to worry
1529 * about them.
1530 */
1531 if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
1532 bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
1533 if (!nr_sectors) {
1534 err = 0;
1535 goto end_io;
1536 }
1537 }
1538
1539 if ((bio->bi_rw & REQ_DISCARD) &&
1540 (!blk_queue_discard(q) ||
1541 ((bio->bi_rw & REQ_SECURE) &&
1542 !blk_queue_secdiscard(q)))) {
1518 err = -EOPNOTSUPP; 1543 err = -EOPNOTSUPP;
1519 goto end_io; 1544 goto end_io;
1520 } 1545 }
1521 1546
1547 blk_throtl_bio(q, &bio);
1548
1549 /*
1550 * If bio is NULL, it has been throttled and will be submitted
1551 * later.
1552 */
1553 if (!bio)
1554 break;
1555
1522 trace_block_bio_queue(q, bio); 1556 trace_block_bio_queue(q, bio);
1523 1557
1524 ret = q->make_request_fn(q, bio); 1558 ret = q->make_request_fn(q, bio);
@@ -1609,11 +1643,12 @@ void submit_bio(int rw, struct bio *bio)
1609 1643
1610 if (unlikely(block_dump)) { 1644 if (unlikely(block_dump)) {
1611 char b[BDEVNAME_SIZE]; 1645 char b[BDEVNAME_SIZE];
1612 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n", 1646 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
1613 current->comm, task_pid_nr(current), 1647 current->comm, task_pid_nr(current),
1614 (rw & WRITE) ? "WRITE" : "READ", 1648 (rw & WRITE) ? "WRITE" : "READ",
1615 (unsigned long long)bio->bi_sector, 1649 (unsigned long long)bio->bi_sector,
1616 bdevname(bio->bi_bdev, b)); 1650 bdevname(bio->bi_bdev, b),
1651 count);
1617 } 1652 }
1618 } 1653 }
1619 1654
@@ -1756,7 +1791,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
1756 int cpu; 1791 int cpu;
1757 1792
1758 cpu = part_stat_lock(); 1793 cpu = part_stat_lock();
1759 part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); 1794 part = req->part;
1760 part_stat_add(cpu, part, sectors[rw], bytes >> 9); 1795 part_stat_add(cpu, part, sectors[rw], bytes >> 9);
1761 part_stat_unlock(); 1796 part_stat_unlock();
1762 } 1797 }
@@ -1765,18 +1800,18 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
1765static void blk_account_io_done(struct request *req) 1800static void blk_account_io_done(struct request *req)
1766{ 1801{
1767 /* 1802 /*
1768 * Account IO completion. bar_rq isn't accounted as a normal 1803 * Account IO completion. flush_rq isn't accounted as a
1769 * IO on queueing nor completion. Accounting the containing 1804 * normal IO on queueing nor completion. Accounting the
1770 * request is enough. 1805 * containing request is enough.
1771 */ 1806 */
1772 if (blk_do_io_stat(req) && req != &req->q->bar_rq) { 1807 if (blk_do_io_stat(req) && req != &req->q->flush_rq) {
1773 unsigned long duration = jiffies - req->start_time; 1808 unsigned long duration = jiffies - req->start_time;
1774 const int rw = rq_data_dir(req); 1809 const int rw = rq_data_dir(req);
1775 struct hd_struct *part; 1810 struct hd_struct *part;
1776 int cpu; 1811 int cpu;
1777 1812
1778 cpu = part_stat_lock(); 1813 cpu = part_stat_lock();
1779 part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); 1814 part = req->part;
1780 1815
1781 part_stat_inc(cpu, part, ios[rw]); 1816 part_stat_inc(cpu, part, ios[rw]);
1782 part_stat_add(cpu, part, ticks[rw], duration); 1817 part_stat_add(cpu, part, ticks[rw], duration);
@@ -2494,9 +2529,7 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
2494static void __blk_rq_prep_clone(struct request *dst, struct request *src) 2529static void __blk_rq_prep_clone(struct request *dst, struct request *src)
2495{ 2530{
2496 dst->cpu = src->cpu; 2531 dst->cpu = src->cpu;
2497 dst->cmd_flags = (rq_data_dir(src) | REQ_NOMERGE); 2532 dst->cmd_flags = (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE;
2498 if (src->cmd_flags & REQ_DISCARD)
2499 dst->cmd_flags |= REQ_DISCARD;
2500 dst->cmd_type = src->cmd_type; 2533 dst->cmd_type = src->cmd_type;
2501 dst->__sector = blk_rq_pos(src); 2534 dst->__sector = blk_rq_pos(src);
2502 dst->__data_len = blk_rq_bytes(src); 2535 dst->__data_len = blk_rq_bytes(src);
@@ -2576,6 +2609,13 @@ int kblockd_schedule_work(struct request_queue *q, struct work_struct *work)
2576} 2609}
2577EXPORT_SYMBOL(kblockd_schedule_work); 2610EXPORT_SYMBOL(kblockd_schedule_work);
2578 2611
2612int kblockd_schedule_delayed_work(struct request_queue *q,
2613 struct delayed_work *dwork, unsigned long delay)
2614{
2615 return queue_delayed_work(kblockd_workqueue, dwork, delay);
2616}
2617EXPORT_SYMBOL(kblockd_schedule_delayed_work);
2618
2579int __init blk_dev_init(void) 2619int __init blk_dev_init(void)
2580{ 2620{
2581 BUILD_BUG_ON(__REQ_NR_BITS > 8 * 2621 BUILD_BUG_ON(__REQ_NR_BITS > 8 *
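
The newly exported kblockd_schedule_delayed_work() pairs with kblockd_schedule_work() but defers execution, which the new throttling code can use to re-arm its dispatch work. A minimal sketch of a caller; the driver structure, its members and the handler name are illustrative, not taken from this patch:

	/* Hypothetical driver context: re-run the queue ~10ms from now. */
	static void mydrv_delayed_run(struct work_struct *work)
	{
		struct mydrv *drv = container_of(to_delayed_work(work),
						 struct mydrv, run_work);

		spin_lock_irq(drv->queue->queue_lock);
		__blk_run_queue(drv->queue);
		spin_unlock_irq(drv->queue->queue_lock);
	}

	/* at initialisation time */
	INIT_DELAYED_WORK(&drv->run_work, mydrv_delayed_run);

	/* wherever dispatch needs to be retried later */
	kblockd_schedule_delayed_work(drv->queue, &drv->run_work,
				      msecs_to_jiffies(10));
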
diff --git a/block/blk-exec.c b/block/blk-exec.c
index e1672f14840e..cf1456a02acd 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -80,6 +80,7 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
80 DECLARE_COMPLETION_ONSTACK(wait); 80 DECLARE_COMPLETION_ONSTACK(wait);
81 char sense[SCSI_SENSE_BUFFERSIZE]; 81 char sense[SCSI_SENSE_BUFFERSIZE];
82 int err = 0; 82 int err = 0;
83 unsigned long hang_check;
83 84
84 /* 85 /*
85 * we need an extra reference to the request, so we can look at 86 * we need an extra reference to the request, so we can look at
@@ -95,7 +96,13 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
95 96
96 rq->end_io_data = &wait; 97 rq->end_io_data = &wait;
97 blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); 98 blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq);
98 wait_for_completion(&wait); 99
100 /* Prevent hang_check timer from firing at us during very long I/O */
101 hang_check = sysctl_hung_task_timeout_secs;
102 if (hang_check)
103 while (!wait_for_completion_timeout(&wait, hang_check * (HZ/2)));
104 else
105 wait_for_completion(&wait);
99 106
100 if (rq->errors) 107 if (rq->errors)
101 err = -EIO; 108 err = -EIO;
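
Callers of blk_execute_rq() are unaffected by the switch to the timed wait above; they simply stop tripping the hung-task watchdog on legitimately long commands. For context, a minimal illustrative caller (error handling trimmed, q assumed to be the target request queue):

	struct request *rq;
	int err = 0;

	rq = blk_get_request(q, READ, GFP_KERNEL);
	rq->cmd_type = REQ_TYPE_SPECIAL;
	rq->timeout = 60 * HZ;

	blk_execute_rq(q, NULL, rq, 0);	/* sleeps until the request completes */
	if (rq->errors)
		err = -EIO;
	blk_put_request(rq);
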
diff --git a/block/blk-flush.c b/block/blk-flush.c
new file mode 100644
index 000000000000..54b123d6563e
--- /dev/null
+++ b/block/blk-flush.c
@@ -0,0 +1,262 @@
1/*
2 * Functions to sequence FLUSH and FUA writes.
3 */
4#include <linux/kernel.h>
5#include <linux/module.h>
6#include <linux/bio.h>
7#include <linux/blkdev.h>
8#include <linux/gfp.h>
9
10#include "blk.h"
11
12/* FLUSH/FUA sequences */
13enum {
14 QUEUE_FSEQ_STARTED = (1 << 0), /* flushing in progress */
15 QUEUE_FSEQ_PREFLUSH = (1 << 1), /* pre-flushing in progress */
16 QUEUE_FSEQ_DATA = (1 << 2), /* data write in progress */
17 QUEUE_FSEQ_POSTFLUSH = (1 << 3), /* post-flushing in progress */
18 QUEUE_FSEQ_DONE = (1 << 4),
19};
20
21static struct request *queue_next_fseq(struct request_queue *q);
22
23unsigned blk_flush_cur_seq(struct request_queue *q)
24{
25 if (!q->flush_seq)
26 return 0;
27 return 1 << ffz(q->flush_seq);
28}
29
30static struct request *blk_flush_complete_seq(struct request_queue *q,
31 unsigned seq, int error)
32{
33 struct request *next_rq = NULL;
34
35 if (error && !q->flush_err)
36 q->flush_err = error;
37
38 BUG_ON(q->flush_seq & seq);
39 q->flush_seq |= seq;
40
41 if (blk_flush_cur_seq(q) != QUEUE_FSEQ_DONE) {
42 /* not complete yet, queue the next flush sequence */
43 next_rq = queue_next_fseq(q);
44 } else {
45 /* complete this flush request */
46 __blk_end_request_all(q->orig_flush_rq, q->flush_err);
47 q->orig_flush_rq = NULL;
48 q->flush_seq = 0;
49
50 /* dispatch the next flush if there's one */
51 if (!list_empty(&q->pending_flushes)) {
52 next_rq = list_entry_rq(q->pending_flushes.next);
53 list_move(&next_rq->queuelist, &q->queue_head);
54 }
55 }
56 return next_rq;
57}
58
59static void blk_flush_complete_seq_end_io(struct request_queue *q,
60 unsigned seq, int error)
61{
62 bool was_empty = elv_queue_empty(q);
63 struct request *next_rq;
64
65 next_rq = blk_flush_complete_seq(q, seq, error);
66
67 /*
68 * Moving a request silently to empty queue_head may stall the
69 * queue. Kick the queue in those cases.
70 */
71 if (was_empty && next_rq)
72 __blk_run_queue(q);
73}
74
75static void pre_flush_end_io(struct request *rq, int error)
76{
77 elv_completed_request(rq->q, rq);
78 blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_PREFLUSH, error);
79}
80
81static void flush_data_end_io(struct request *rq, int error)
82{
83 elv_completed_request(rq->q, rq);
84 blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_DATA, error);
85}
86
87static void post_flush_end_io(struct request *rq, int error)
88{
89 elv_completed_request(rq->q, rq);
90 blk_flush_complete_seq_end_io(rq->q, QUEUE_FSEQ_POSTFLUSH, error);
91}
92
93static void init_flush_request(struct request *rq, struct gendisk *disk)
94{
95 rq->cmd_type = REQ_TYPE_FS;
96 rq->cmd_flags = WRITE_FLUSH;
97 rq->rq_disk = disk;
98}
99
100static struct request *queue_next_fseq(struct request_queue *q)
101{
102 struct request *orig_rq = q->orig_flush_rq;
103 struct request *rq = &q->flush_rq;
104
105 blk_rq_init(q, rq);
106
107 switch (blk_flush_cur_seq(q)) {
108 case QUEUE_FSEQ_PREFLUSH:
109 init_flush_request(rq, orig_rq->rq_disk);
110 rq->end_io = pre_flush_end_io;
111 break;
112 case QUEUE_FSEQ_DATA:
113 init_request_from_bio(rq, orig_rq->bio);
114 /*
115 * orig_rq->rq_disk may be different from
116 * bio->bi_bdev->bd_disk if orig_rq got here through
117 * remapping drivers. Make sure rq->rq_disk points
118 * to the same one as orig_rq.
119 */
120 rq->rq_disk = orig_rq->rq_disk;
121 rq->cmd_flags &= ~(REQ_FLUSH | REQ_FUA);
122 rq->cmd_flags |= orig_rq->cmd_flags & (REQ_FLUSH | REQ_FUA);
123 rq->end_io = flush_data_end_io;
124 break;
125 case QUEUE_FSEQ_POSTFLUSH:
126 init_flush_request(rq, orig_rq->rq_disk);
127 rq->end_io = post_flush_end_io;
128 break;
129 default:
130 BUG();
131 }
132
133 elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
134 return rq;
135}
136
137struct request *blk_do_flush(struct request_queue *q, struct request *rq)
138{
139 unsigned int fflags = q->flush_flags; /* may change, cache it */
140 bool has_flush = fflags & REQ_FLUSH, has_fua = fflags & REQ_FUA;
141 bool do_preflush = has_flush && (rq->cmd_flags & REQ_FLUSH);
142 bool do_postflush = has_flush && !has_fua && (rq->cmd_flags & REQ_FUA);
143 unsigned skip = 0;
144
145 /*
146 * Special case. If there's data but flush is not necessary,
147 * the request can be issued directly.
148 *
149 * Flush w/o data should be able to be issued directly too, but
150 * currently some drivers assume that rq->bio contains
151 * non-zero data if it isn't NULL, and empty FLUSH requests
152 * getting here usually have bios without data.
153 */
154 if (blk_rq_sectors(rq) && !do_preflush && !do_postflush) {
155 rq->cmd_flags &= ~REQ_FLUSH;
156 if (!has_fua)
157 rq->cmd_flags &= ~REQ_FUA;
158 return rq;
159 }
160
161 /*
162 * Sequenced flushes can't be processed in parallel. If
163 * another one is already in progress, queue for later
164 * processing.
165 */
166 if (q->flush_seq) {
167 list_move_tail(&rq->queuelist, &q->pending_flushes);
168 return NULL;
169 }
170
171 /*
172 * Start a new flush sequence
173 */
174 q->flush_err = 0;
175 q->flush_seq |= QUEUE_FSEQ_STARTED;
176
177 /* adjust FLUSH/FUA of the original request and stash it away */
178 rq->cmd_flags &= ~REQ_FLUSH;
179 if (!has_fua)
180 rq->cmd_flags &= ~REQ_FUA;
181 blk_dequeue_request(rq);
182 q->orig_flush_rq = rq;
183
184 /* skip unneeded sequences and return the first one */
185 if (!do_preflush)
186 skip |= QUEUE_FSEQ_PREFLUSH;
187 if (!blk_rq_sectors(rq))
188 skip |= QUEUE_FSEQ_DATA;
189 if (!do_postflush)
190 skip |= QUEUE_FSEQ_POSTFLUSH;
191 return blk_flush_complete_seq(q, skip, 0);
192}
193
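/*
 * Worked example (annotation, not part of the file): consider a data
 * write tagged REQ_FLUSH | REQ_FUA sent to a queue whose flush_flags
 * advertise REQ_FLUSH only.  has_flush is true and has_fua is false,
 * so do_preflush and do_postflush are both true and no sequence step
 * is skipped.  flush_seq starts as QUEUE_FSEQ_STARTED (0x1), hence
 * blk_flush_cur_seq() = 1 << ffz(0x1) = QUEUE_FSEQ_PREFLUSH and a
 * pre-flush is issued first.  Each completion sets its bit: 0x3
 * selects the data write (with REQ_FLUSH/REQ_FUA already stripped),
 * 0x7 selects the post-flush, and 0xf reaches QUEUE_FSEQ_DONE, at
 * which point the original request is completed via
 * __blk_end_request_all().
 */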
194static void bio_end_flush(struct bio *bio, int err)
195{
196 if (err)
197 clear_bit(BIO_UPTODATE, &bio->bi_flags);
198 if (bio->bi_private)
199 complete(bio->bi_private);
200 bio_put(bio);
201}
202
203/**
204 * blkdev_issue_flush - queue a flush
205 * @bdev: blockdev to issue flush for
206 * @gfp_mask: memory allocation flags (for bio_alloc)
207 * @error_sector: error sector
208 *
209 * Description:
210 * Issue a flush for the block device in question. The caller can supply
211 * room for storing the error offset in case of a flush error, if they
212 * wish to. The function blocks until the flush bio has completed, so
213 * the caller does not need to wait separately.
214 */
215int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
216 sector_t *error_sector)
217{
218 DECLARE_COMPLETION_ONSTACK(wait);
219 struct request_queue *q;
220 struct bio *bio;
221 int ret = 0;
222
223 if (bdev->bd_disk == NULL)
224 return -ENXIO;
225
226 q = bdev_get_queue(bdev);
227 if (!q)
228 return -ENXIO;
229
230 /*
231 * some block devices may not have their queue correctly set up here
232 * (e.g. loop device without a backing file) and so issuing a flush
233 * here will panic. Ensure there is a request function before issuing
234 * the flush.
235 */
236 if (!q->make_request_fn)
237 return -ENXIO;
238
239 bio = bio_alloc(gfp_mask, 0);
240 bio->bi_end_io = bio_end_flush;
241 bio->bi_bdev = bdev;
242 bio->bi_private = &wait;
243
244 bio_get(bio);
245 submit_bio(WRITE_FLUSH, bio);
246 wait_for_completion(&wait);
247
248 /*
249 * The driver must store the error location in ->bi_sector, if
250 * it supports it. For non-stacked drivers, this should be
251 * copied from blk_rq_pos(rq).
252 */
253 if (error_sector)
254 *error_sector = bio->bi_sector;
255
256 if (!bio_flagged(bio, BIO_UPTODATE))
257 ret = -EIO;
258
259 bio_put(bio);
260 return ret;
261}
262EXPORT_SYMBOL(blkdev_issue_flush);
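
A typical caller, for example a filesystem forcing previously written data out of the device's volatile cache, now looks roughly like this (the superblock context is illustrative):

	int err;

	/* NULL: we don't need the error sector reported back. */
	err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
	if (err)
		return err;
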
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index edce1ef7933d..54bcba6c02a7 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -32,24 +32,37 @@ static struct kmem_cache *integrity_cachep;
32 32
33/** 33/**
34 * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements 34 * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements
35 * @rq: request with integrity metadata attached 35 * @q: request queue
36 * @bio: bio with integrity metadata attached
36 * 37 *
37 * Description: Returns the number of elements required in a 38 * Description: Returns the number of elements required in a
38 * scatterlist corresponding to the integrity metadata in a request. 39 * scatterlist corresponding to the integrity metadata in a bio.
39 */ 40 */
40int blk_rq_count_integrity_sg(struct request *rq) 41int blk_rq_count_integrity_sg(struct request_queue *q, struct bio *bio)
41{ 42{
42 struct bio_vec *iv, *ivprv; 43 struct bio_vec *iv, *ivprv = NULL;
43 struct req_iterator iter; 44 unsigned int segments = 0;
44 unsigned int segments; 45 unsigned int seg_size = 0;
46 unsigned int i = 0;
45 47
46 ivprv = NULL; 48 bio_for_each_integrity_vec(iv, bio, i) {
47 segments = 0;
48 49
49 rq_for_each_integrity_segment(iv, rq, iter) { 50 if (ivprv) {
51 if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv))
52 goto new_segment;
53
54 if (!BIOVEC_SEG_BOUNDARY(q, ivprv, iv))
55 goto new_segment;
50 56
51 if (!ivprv || !BIOVEC_PHYS_MERGEABLE(ivprv, iv)) 57 if (seg_size + iv->bv_len > queue_max_segment_size(q))
58 goto new_segment;
59
60 seg_size += iv->bv_len;
61 } else {
62new_segment:
52 segments++; 63 segments++;
64 seg_size = iv->bv_len;
65 }
53 66
54 ivprv = iv; 67 ivprv = iv;
55 } 68 }
@@ -60,30 +73,34 @@ EXPORT_SYMBOL(blk_rq_count_integrity_sg);
60 73
61/** 74/**
62 * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist 75 * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist
63 * @rq: request with integrity metadata attached 76 * @q: request queue
77 * @bio: bio with integrity metadata attached
64 * @sglist: target scatterlist 78 * @sglist: target scatterlist
65 * 79 *
66 * Description: Map the integrity vectors in request into a 80 * Description: Map the integrity vectors in request into a
67 * scatterlist. The scatterlist must be big enough to hold all 81 * scatterlist. The scatterlist must be big enough to hold all
68 * elements. I.e. sized using blk_rq_count_integrity_sg(). 82 * elements. I.e. sized using blk_rq_count_integrity_sg().
69 */ 83 */
70int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist) 84int blk_rq_map_integrity_sg(struct request_queue *q, struct bio *bio,
85 struct scatterlist *sglist)
71{ 86{
72 struct bio_vec *iv, *ivprv; 87 struct bio_vec *iv, *ivprv = NULL;
73 struct req_iterator iter; 88 struct scatterlist *sg = NULL;
74 struct scatterlist *sg; 89 unsigned int segments = 0;
75 unsigned int segments; 90 unsigned int i = 0;
76
77 ivprv = NULL;
78 sg = NULL;
79 segments = 0;
80 91
81 rq_for_each_integrity_segment(iv, rq, iter) { 92 bio_for_each_integrity_vec(iv, bio, i) {
82 93
83 if (ivprv) { 94 if (ivprv) {
84 if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv)) 95 if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv))
85 goto new_segment; 96 goto new_segment;
86 97
98 if (!BIOVEC_SEG_BOUNDARY(q, ivprv, iv))
99 goto new_segment;
100
101 if (sg->length + iv->bv_len > queue_max_segment_size(q))
102 goto new_segment;
103
87 sg->length += iv->bv_len; 104 sg->length += iv->bv_len;
88 } else { 105 } else {
89new_segment: 106new_segment:
@@ -162,6 +179,40 @@ int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2)
162} 179}
163EXPORT_SYMBOL(blk_integrity_compare); 180EXPORT_SYMBOL(blk_integrity_compare);
164 181
182int blk_integrity_merge_rq(struct request_queue *q, struct request *req,
183 struct request *next)
184{
185 if (blk_integrity_rq(req) != blk_integrity_rq(next))
186 return -1;
187
188 if (req->nr_integrity_segments + next->nr_integrity_segments >
189 q->limits.max_integrity_segments)
190 return -1;
191
192 return 0;
193}
194EXPORT_SYMBOL(blk_integrity_merge_rq);
195
196int blk_integrity_merge_bio(struct request_queue *q, struct request *req,
197 struct bio *bio)
198{
199 int nr_integrity_segs;
200 struct bio *next = bio->bi_next;
201
202 bio->bi_next = NULL;
203 nr_integrity_segs = blk_rq_count_integrity_sg(q, bio);
204 bio->bi_next = next;
205
206 if (req->nr_integrity_segments + nr_integrity_segs >
207 q->limits.max_integrity_segments)
208 return -1;
209
210 req->nr_integrity_segments += nr_integrity_segs;
211
212 return 0;
213}
214EXPORT_SYMBOL(blk_integrity_merge_bio);
215
165struct integrity_sysfs_entry { 216struct integrity_sysfs_entry {
166 struct attribute attr; 217 struct attribute attr;
167 ssize_t (*show)(struct blk_integrity *, char *); 218 ssize_t (*show)(struct blk_integrity *, char *);
@@ -381,7 +432,6 @@ void blk_integrity_unregister(struct gendisk *disk)
381 kobject_uevent(&bi->kobj, KOBJ_REMOVE); 432 kobject_uevent(&bi->kobj, KOBJ_REMOVE);
382 kobject_del(&bi->kobj); 433 kobject_del(&bi->kobj);
383 kobject_put(&bi->kobj); 434 kobject_put(&bi->kobj);
384 kmem_cache_free(integrity_cachep, bi);
385 disk->integrity = NULL; 435 disk->integrity = NULL;
386} 436}
387EXPORT_SYMBOL(blk_integrity_unregister); 437EXPORT_SYMBOL(blk_integrity_unregister);
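
The two reworked helpers are intended to be used together by an HBA driver when it builds the integrity scatterlist for a request; a rough sketch, in which everything apart from the two helpers is assumed driver-side context:

	struct scatterlist *isg;
	int nsegs;

	/* Size the table from the bio, honouring the queue's segment limits. */
	nsegs = blk_rq_count_integrity_sg(q, rq->bio);
	isg = kcalloc(nsegs, sizeof(*isg), GFP_ATOMIC);
	if (!isg)
		return -ENOMEM;
	sg_init_table(isg, nsegs);

	/* Fill it in; the return value is the number of entries used. */
	nsegs = blk_rq_map_integrity_sg(q, rq->bio, isg);
	/* hand isg and nsegs to the HBA's DMA setup here */
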
diff --git a/block/blk-lib.c b/block/blk-lib.c
index c1fc55a83ba1..1a320d2406b0 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -39,8 +39,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
39{ 39{
40 DECLARE_COMPLETION_ONSTACK(wait); 40 DECLARE_COMPLETION_ONSTACK(wait);
41 struct request_queue *q = bdev_get_queue(bdev); 41 struct request_queue *q = bdev_get_queue(bdev);
42 int type = flags & BLKDEV_IFL_BARRIER ? 42 int type = REQ_WRITE | REQ_DISCARD;
43 DISCARD_BARRIER : DISCARD_NOBARRIER;
44 unsigned int max_discard_sectors; 43 unsigned int max_discard_sectors;
45 struct bio *bio; 44 struct bio *bio;
46 int ret = 0; 45 int ret = 0;
@@ -62,6 +61,12 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
62 max_discard_sectors &= ~(disc_sects - 1); 61 max_discard_sectors &= ~(disc_sects - 1);
63 } 62 }
64 63
64 if (flags & BLKDEV_DISCARD_SECURE) {
65 if (!blk_queue_secdiscard(q))
66 return -EOPNOTSUPP;
67 type |= REQ_SECURE;
68 }
69
65 while (nr_sects && !ret) { 70 while (nr_sects && !ret) {
66 bio = bio_alloc(gfp_mask, 1); 71 bio = bio_alloc(gfp_mask, 1);
67 if (!bio) { 72 if (!bio) {
@@ -72,8 +77,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
72 bio->bi_sector = sector; 77 bio->bi_sector = sector;
73 bio->bi_end_io = blkdev_discard_end_io; 78 bio->bi_end_io = blkdev_discard_end_io;
74 bio->bi_bdev = bdev; 79 bio->bi_bdev = bdev;
75 if (flags & BLKDEV_IFL_WAIT) 80 bio->bi_private = &wait;
76 bio->bi_private = &wait;
77 81
78 if (nr_sects > max_discard_sectors) { 82 if (nr_sects > max_discard_sectors) {
79 bio->bi_size = max_discard_sectors << 9; 83 bio->bi_size = max_discard_sectors << 9;
@@ -87,8 +91,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
87 bio_get(bio); 91 bio_get(bio);
88 submit_bio(type, bio); 92 submit_bio(type, bio);
89 93
90 if (flags & BLKDEV_IFL_WAIT) 94 wait_for_completion(&wait);
91 wait_for_completion(&wait);
92 95
93 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 96 if (bio_flagged(bio, BIO_EOPNOTSUPP))
94 ret = -EOPNOTSUPP; 97 ret = -EOPNOTSUPP;
@@ -134,7 +137,6 @@ static void bio_batch_end_io(struct bio *bio, int err)
134 * @sector: start sector 137 * @sector: start sector
135 * @nr_sects: number of sectors to write 138 * @nr_sects: number of sectors to write
136 * @gfp_mask: memory allocation flags (for bio_alloc) 139 * @gfp_mask: memory allocation flags (for bio_alloc)
137 * @flags: BLKDEV_IFL_* flags to control behaviour
138 * 140 *
139 * Description: 141 * Description:
140 * Generate and issue a number of bios with zero-filled pages. 142
@@ -143,7 +145,7 @@ static void bio_batch_end_io(struct bio *bio, int err)
143 */ 145 */
144 146
145int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, 147int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
146 sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) 148 sector_t nr_sects, gfp_t gfp_mask)
147{ 149{
148 int ret; 150 int ret;
149 struct bio *bio; 151 struct bio *bio;
@@ -156,12 +158,6 @@ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
156 bb.wait = &wait; 158 bb.wait = &wait;
157 bb.end_io = NULL; 159 bb.end_io = NULL;
158 160
159 if (flags & BLKDEV_IFL_BARRIER) {
160 /* issue async barrier before the data */
161 ret = blkdev_issue_flush(bdev, gfp_mask, NULL, 0);
162 if (ret)
163 return ret;
164 }
165submit: 161submit:
166 ret = 0; 162 ret = 0;
167 while (nr_sects != 0) { 163 while (nr_sects != 0) {
@@ -175,8 +171,7 @@ submit:
175 bio->bi_sector = sector; 171 bio->bi_sector = sector;
176 bio->bi_bdev = bdev; 172 bio->bi_bdev = bdev;
177 bio->bi_end_io = bio_batch_end_io; 173 bio->bi_end_io = bio_batch_end_io;
178 if (flags & BLKDEV_IFL_WAIT) 174 bio->bi_private = &bb;
179 bio->bi_private = &bb;
180 175
181 while (nr_sects != 0) { 176 while (nr_sects != 0) {
182 sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects); 177 sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects);
@@ -193,18 +188,10 @@ submit:
193 issued++; 188 issued++;
194 submit_bio(WRITE, bio); 189 submit_bio(WRITE, bio);
195 } 190 }
196 /*
197 * When all data bios are in flight. Send final barrier if requeted.
198 */
199 if (nr_sects == 0 && flags & BLKDEV_IFL_BARRIER)
200 ret = blkdev_issue_flush(bdev, gfp_mask, NULL,
201 flags & BLKDEV_IFL_WAIT);
202
203 191
204 if (flags & BLKDEV_IFL_WAIT) 192 /* Wait for bios in-flight */
205 /* Wait for bios in-flight */ 193 while (issued != atomic_read(&bb.done))
206 while ( issued != atomic_read(&bb.done)) 194 wait_for_completion(&wait);
207 wait_for_completion(&wait);
208 195
209 if (!test_bit(BIO_UPTODATE, &bb.flags)) 196 if (!test_bit(BIO_UPTODATE, &bb.flags))
210 /* One of bios in the batch was completed with error.*/ 197 /* One of bios in the batch was completed with error.*/
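
With the BLKDEV_IFL_* flags gone, both helpers always wait for completion, and discards take the new BLKDEV_DISCARD_SECURE flag when needed. Illustrative calls, where bdev, start and nr_sects are assumed to come from the caller:

	int err;

	/* Ordinary discard; pass BLKDEV_DISCARD_SECURE for a secure erase. */
	err = blkdev_issue_discard(bdev, start, nr_sects, GFP_KERNEL, 0);
	if (err)
		return err;

	/* Zero the same range; there is no flags argument any more. */
	err = blkdev_issue_zeroout(bdev, start, nr_sects, GFP_KERNEL);
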
diff --git a/block/blk-map.c b/block/blk-map.c
index c65d7593f7f1..d4a586d8691e 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -54,7 +54,7 @@ static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
54 * direct dma. else, set up kernel bounce buffers 54 * direct dma. else, set up kernel bounce buffers
55 */ 55 */
56 uaddr = (unsigned long) ubuf; 56 uaddr = (unsigned long) ubuf;
57 if (blk_rq_aligned(q, ubuf, len) && !map_data) 57 if (blk_rq_aligned(q, uaddr, len) && !map_data)
58 bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask); 58 bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask);
59 else 59 else
60 bio = bio_copy_user(q, map_data, uaddr, len, reading, gfp_mask); 60 bio = bio_copy_user(q, map_data, uaddr, len, reading, gfp_mask);
@@ -288,6 +288,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
288 unsigned int len, gfp_t gfp_mask) 288 unsigned int len, gfp_t gfp_mask)
289{ 289{
290 int reading = rq_data_dir(rq) == READ; 290 int reading = rq_data_dir(rq) == READ;
291 unsigned long addr = (unsigned long) kbuf;
291 int do_copy = 0; 292 int do_copy = 0;
292 struct bio *bio; 293 struct bio *bio;
293 int ret; 294 int ret;
@@ -297,7 +298,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
297 if (!len || !kbuf) 298 if (!len || !kbuf)
298 return -EINVAL; 299 return -EINVAL;
299 300
300 do_copy = !blk_rq_aligned(q, kbuf, len) || object_is_on_stack(kbuf); 301 do_copy = !blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf);
301 if (do_copy) 302 if (do_copy)
302 bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading); 303 bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading);
303 else 304 else
@@ -307,7 +308,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
307 return PTR_ERR(bio); 308 return PTR_ERR(bio);
308 309
309 if (rq_data_dir(rq) == WRITE) 310 if (rq_data_dir(rq) == WRITE)
310 bio->bi_rw |= (1 << REQ_WRITE); 311 bio->bi_rw |= REQ_WRITE;
311 312
312 if (do_copy) 313 if (do_copy)
313 rq->cmd_flags |= REQ_COPY_USER; 314 rq->cmd_flags |= REQ_COPY_USER;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 3b0cd4249671..0a2fd8a48a38 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -205,12 +205,11 @@ static inline int ll_new_hw_segment(struct request_queue *q,
205{ 205{
206 int nr_phys_segs = bio_phys_segments(q, bio); 206 int nr_phys_segs = bio_phys_segments(q, bio);
207 207
208 if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) { 208 if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q))
209 req->cmd_flags |= REQ_NOMERGE; 209 goto no_merge;
210 if (req == q->last_merge) 210
211 q->last_merge = NULL; 211 if (bio_integrity(bio) && blk_integrity_merge_bio(q, req, bio))
212 return 0; 212 goto no_merge;
213 }
214 213
215 /* 214 /*
216 * This will form the start of a new hw segment. Bump both 215 * This will form the start of a new hw segment. Bump both
@@ -218,6 +217,12 @@ static inline int ll_new_hw_segment(struct request_queue *q,
218 */ 217 */
219 req->nr_phys_segments += nr_phys_segs; 218 req->nr_phys_segments += nr_phys_segs;
220 return 1; 219 return 1;
220
221no_merge:
222 req->cmd_flags |= REQ_NOMERGE;
223 if (req == q->last_merge)
224 q->last_merge = NULL;
225 return 0;
221} 226}
222 227
223int ll_back_merge_fn(struct request_queue *q, struct request *req, 228int ll_back_merge_fn(struct request_queue *q, struct request *req,
@@ -301,6 +306,9 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
301 if (total_phys_segments > queue_max_segments(q)) 306 if (total_phys_segments > queue_max_segments(q))
302 return 0; 307 return 0;
303 308
309 if (blk_integrity_rq(req) && blk_integrity_merge_rq(q, req, next))
310 return 0;
311
304 /* Merge is OK... */ 312 /* Merge is OK... */
305 req->nr_phys_segments = total_phys_segments; 313 req->nr_phys_segments = total_phys_segments;
306 return 1; 314 return 1;
@@ -343,7 +351,7 @@ static void blk_account_io_merge(struct request *req)
343 int cpu; 351 int cpu;
344 352
345 cpu = part_stat_lock(); 353 cpu = part_stat_lock();
346 part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); 354 part = req->part;
347 355
348 part_round_stats(cpu, part); 356 part_round_stats(cpu, part);
349 part_dec_in_flight(part, rq_data_dir(req)); 357 part_dec_in_flight(part, rq_data_dir(req));
@@ -362,6 +370,18 @@ static int attempt_merge(struct request_queue *q, struct request *req,
362 return 0; 370 return 0;
363 371
364 /* 372 /*
373 * Don't merge file system requests and discard requests
374 */
375 if ((req->cmd_flags & REQ_DISCARD) != (next->cmd_flags & REQ_DISCARD))
376 return 0;
377
378 /*
379 * Don't merge discard requests and secure discard requests
380 */
381 if ((req->cmd_flags & REQ_SECURE) != (next->cmd_flags & REQ_SECURE))
382 return 0;
383
384 /*
365 * not contiguous 385 * not contiguous
366 */ 386 */
367 if (blk_rq_pos(req) + blk_rq_sectors(req) != blk_rq_pos(next)) 387 if (blk_rq_pos(req) + blk_rq_sectors(req) != blk_rq_pos(next))
@@ -372,9 +392,6 @@ static int attempt_merge(struct request_queue *q, struct request *req,
372 || next->special) 392 || next->special)
373 return 0; 393 return 0;
374 394
375 if (blk_integrity_rq(req) != blk_integrity_rq(next))
376 return 0;
377
378 /* 395 /*
379 * If we are allowed to merge, then append bio list 396 * If we are allowed to merge, then append bio list
380 * from next to rq and release next. merge_requests_fn 397 * from next to rq and release next. merge_requests_fn
diff --git a/block/blk-settings.c b/block/blk-settings.c
index a234f4bf1d6f..701859fb9647 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -111,6 +111,7 @@ EXPORT_SYMBOL_GPL(blk_queue_lld_busy);
111void blk_set_default_limits(struct queue_limits *lim) 111void blk_set_default_limits(struct queue_limits *lim)
112{ 112{
113 lim->max_segments = BLK_MAX_SEGMENTS; 113 lim->max_segments = BLK_MAX_SEGMENTS;
114 lim->max_integrity_segments = 0;
114 lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; 115 lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
115 lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; 116 lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
116 lim->max_sectors = BLK_DEF_MAX_SECTORS; 117 lim->max_sectors = BLK_DEF_MAX_SECTORS;
@@ -213,7 +214,7 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask)
213 */ 214 */
214 if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) 215 if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT))
215 dma = 1; 216 dma = 1;
216 q->limits.bounce_pfn = max_low_pfn; 217 q->limits.bounce_pfn = max(max_low_pfn, b_pfn);
217#else 218#else
218 if (b_pfn < blk_max_low_pfn) 219 if (b_pfn < blk_max_low_pfn)
219 dma = 1; 220 dma = 1;
@@ -343,7 +344,7 @@ EXPORT_SYMBOL(blk_queue_logical_block_size);
343 * hardware can operate on without reverting to read-modify-write 344 * hardware can operate on without reverting to read-modify-write
344 * operations. 345 * operations.
345 */ 346 */
346void blk_queue_physical_block_size(struct request_queue *q, unsigned short size) 347void blk_queue_physical_block_size(struct request_queue *q, unsigned int size)
347{ 348{
348 q->limits.physical_block_size = size; 349 q->limits.physical_block_size = size;
349 350
@@ -455,11 +456,6 @@ void blk_queue_io_opt(struct request_queue *q, unsigned int opt)
455} 456}
456EXPORT_SYMBOL(blk_queue_io_opt); 457EXPORT_SYMBOL(blk_queue_io_opt);
457 458
458/*
459 * Returns the minimum that is _not_ zero, unless both are zero.
460 */
461#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
462
463/** 459/**
464 * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers 460 * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers
465 * @t: the stacking driver (top) 461 * @t: the stacking driver (top)
@@ -514,6 +510,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
514 b->seg_boundary_mask); 510 b->seg_boundary_mask);
515 511
516 t->max_segments = min_not_zero(t->max_segments, b->max_segments); 512 t->max_segments = min_not_zero(t->max_segments, b->max_segments);
513 t->max_integrity_segments = min_not_zero(t->max_integrity_segments,
514 b->max_integrity_segments);
517 515
518 t->max_segment_size = min_not_zero(t->max_segment_size, 516 t->max_segment_size = min_not_zero(t->max_segment_size,
519 b->max_segment_size); 517 b->max_segment_size);
@@ -794,6 +792,26 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask)
794} 792}
795EXPORT_SYMBOL(blk_queue_update_dma_alignment); 793EXPORT_SYMBOL(blk_queue_update_dma_alignment);
796 794
795/**
796 * blk_queue_flush - configure queue's cache flush capability
797 * @q: the request queue for the device
798 * @flush: 0, REQ_FLUSH or REQ_FLUSH | REQ_FUA
799 *
800 * Tell the block layer about the cache flush capability of @q. If it
801 * supports flushing, REQ_FLUSH should be set. If it supports bypassing
802 * the write cache for individual writes, REQ_FUA should be set.
803 */
804void blk_queue_flush(struct request_queue *q, unsigned int flush)
805{
806 WARN_ON_ONCE(flush & ~(REQ_FLUSH | REQ_FUA));
807
808 if (WARN_ON_ONCE(!(flush & REQ_FLUSH) && (flush & REQ_FUA)))
809 flush &= ~REQ_FUA;
810
811 q->flush_flags = flush & (REQ_FLUSH | REQ_FUA);
812}
813EXPORT_SYMBOL_GPL(blk_queue_flush);
814
797static int __init blk_settings_init(void) 815static int __init blk_settings_init(void)
798{ 816{
799 blk_max_low_pfn = max_low_pfn - 1; 817 blk_max_low_pfn = max_low_pfn - 1;
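
A driver advertises its cache behaviour once at queue setup time; the three meaningful combinations, as a minimal sketch (q is the driver's request queue):

	/* Write-back cache, and the device honours FUA writes. */
	blk_queue_flush(q, REQ_FLUSH | REQ_FUA);

	/* Write-back cache but no FUA: FUA writes get a post-flush instead. */
	blk_queue_flush(q, REQ_FLUSH);

	/* Write-through device: no flush machinery needed. */
	blk_queue_flush(q, 0);
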
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 001ab18078f5..013457f47fdc 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -112,6 +112,11 @@ static ssize_t queue_max_segments_show(struct request_queue *q, char *page)
112 return queue_var_show(queue_max_segments(q), (page)); 112 return queue_var_show(queue_max_segments(q), (page));
113} 113}
114 114
115static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page)
116{
117 return queue_var_show(q->limits.max_integrity_segments, (page));
118}
119
115static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page) 120static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page)
116{ 121{
117 if (test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags)) 122 if (test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags))
@@ -288,6 +293,11 @@ static struct queue_sysfs_entry queue_max_segments_entry = {
288 .show = queue_max_segments_show, 293 .show = queue_max_segments_show,
289}; 294};
290 295
296static struct queue_sysfs_entry queue_max_integrity_segments_entry = {
297 .attr = {.name = "max_integrity_segments", .mode = S_IRUGO },
298 .show = queue_max_integrity_segments_show,
299};
300
291static struct queue_sysfs_entry queue_max_segment_size_entry = { 301static struct queue_sysfs_entry queue_max_segment_size_entry = {
292 .attr = {.name = "max_segment_size", .mode = S_IRUGO }, 302 .attr = {.name = "max_segment_size", .mode = S_IRUGO },
293 .show = queue_max_segment_size_show, 303 .show = queue_max_segment_size_show,
@@ -375,6 +385,7 @@ static struct attribute *default_attrs[] = {
375 &queue_max_hw_sectors_entry.attr, 385 &queue_max_hw_sectors_entry.attr,
376 &queue_max_sectors_entry.attr, 386 &queue_max_sectors_entry.attr,
377 &queue_max_segments_entry.attr, 387 &queue_max_segments_entry.attr,
388 &queue_max_integrity_segments_entry.attr,
378 &queue_max_segment_size_entry.attr, 389 &queue_max_segment_size_entry.attr,
379 &queue_iosched_entry.attr, 390 &queue_iosched_entry.attr,
380 &queue_hw_sector_size_entry.attr, 391 &queue_hw_sector_size_entry.attr,
@@ -460,6 +471,8 @@ static void blk_release_queue(struct kobject *kobj)
460 471
461 blk_sync_queue(q); 472 blk_sync_queue(q);
462 473
474 blk_throtl_exit(q);
475
463 if (rl->rq_pool) 476 if (rl->rq_pool)
464 mempool_destroy(rl->rq_pool); 477 mempool_destroy(rl->rq_pool);
465 478
@@ -511,6 +524,7 @@ int blk_register_queue(struct gendisk *disk)
511 kobject_uevent(&q->kobj, KOBJ_REMOVE); 524 kobject_uevent(&q->kobj, KOBJ_REMOVE);
512 kobject_del(&q->kobj); 525 kobject_del(&q->kobj);
513 blk_trace_remove_sysfs(disk_to_dev(disk)); 526 blk_trace_remove_sysfs(disk_to_dev(disk));
527 kobject_put(&dev->kobj);
514 return ret; 528 return ret;
515 } 529 }
516 530
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
new file mode 100644
index 000000000000..56ad4531b412
--- /dev/null
+++ b/block/blk-throttle.c
@@ -0,0 +1,1123 @@
1/*
2 * Interface for controlling IO bandwidth on a request queue
3 *
4 * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
5 */
6
7#include <linux/module.h>
8#include <linux/slab.h>
9#include <linux/blkdev.h>
10#include <linux/bio.h>
11#include <linux/blktrace_api.h>
12#include "blk-cgroup.h"
13
14/* Max dispatch from a group in 1 round */
15static int throtl_grp_quantum = 8;
16
17/* Total max dispatch from all groups in one round */
18static int throtl_quantum = 32;
19
20/* Throttling is performed over a 100ms slice, after which the slice is renewed */
21static unsigned long throtl_slice = HZ/10; /* 100 ms */
22
23struct throtl_rb_root {
24 struct rb_root rb;
25 struct rb_node *left;
26 unsigned int count;
27 unsigned long min_disptime;
28};
29
30#define THROTL_RB_ROOT (struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \
31 .count = 0, .min_disptime = 0}
32
33#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
34
35struct throtl_grp {
36 /* List of throtl groups on the request queue */
37 struct hlist_node tg_node;
38
39 /* active throtl group service_tree member */
40 struct rb_node rb_node;
41
42 /*
43 * Dispatch time in jiffies. This is the estimated time when the group
44 * will unthrottle and be ready to dispatch more bios. It is used as
45 * the key to sort active groups in the service tree.
46 */
47 unsigned long disptime;
48
49 struct blkio_group blkg;
50 atomic_t ref;
51 unsigned int flags;
52
53 /* Two lists for READ and WRITE */
54 struct bio_list bio_lists[2];
55
56 /* Number of queued bios on READ and WRITE lists */
57 unsigned int nr_queued[2];
58
59 /* bytes per second rate limits */
60 uint64_t bps[2];
61
62 /* IOPS limits */
63 unsigned int iops[2];
64
65 /* Number of bytes dispatched in the current slice */
66 uint64_t bytes_disp[2];
67 /* Number of bios dispatched in the current slice */
68 unsigned int io_disp[2];
69
70 /* When did we start a new slice */
71 unsigned long slice_start[2];
72 unsigned long slice_end[2];
73
74 /* Some throttle limits got updated for the group */
75 bool limits_changed;
76};
77
78struct throtl_data
79{
80 /* List of throtl groups */
81 struct hlist_head tg_list;
82
83 /* service tree for active throtl groups */
84 struct throtl_rb_root tg_service_tree;
85
86 struct throtl_grp root_tg;
87 struct request_queue *queue;
88
89 /* Total Number of queued bios on READ and WRITE lists */
90 unsigned int nr_queued[2];
91
92 /*
93 * number of total undestroyed groups
94 */
95 unsigned int nr_undestroyed_grps;
96
97 /* Work for dispatching throttled bios */
98 struct delayed_work throtl_work;
99
100 atomic_t limits_changed;
101};
102
103enum tg_state_flags {
104 THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */
105};
106
107#define THROTL_TG_FNS(name) \
108static inline void throtl_mark_tg_##name(struct throtl_grp *tg) \
109{ \
110 (tg)->flags |= (1 << THROTL_TG_FLAG_##name); \
111} \
112static inline void throtl_clear_tg_##name(struct throtl_grp *tg) \
113{ \
114 (tg)->flags &= ~(1 << THROTL_TG_FLAG_##name); \
115} \
116static inline int throtl_tg_##name(const struct throtl_grp *tg) \
117{ \
118 return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0; \
119}
120
121THROTL_TG_FNS(on_rr);
122
123#define throtl_log_tg(td, tg, fmt, args...) \
124 blk_add_trace_msg((td)->queue, "throtl %s " fmt, \
125 blkg_path(&(tg)->blkg), ##args); \
126
127#define throtl_log(td, fmt, args...) \
128 blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
129
130static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg)
131{
132 if (blkg)
133 return container_of(blkg, struct throtl_grp, blkg);
134
135 return NULL;
136}
137
138static inline int total_nr_queued(struct throtl_data *td)
139{
140 return (td->nr_queued[0] + td->nr_queued[1]);
141}
142
143static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg)
144{
145 atomic_inc(&tg->ref);
146 return tg;
147}
148
149static void throtl_put_tg(struct throtl_grp *tg)
150{
151 BUG_ON(atomic_read(&tg->ref) <= 0);
152 if (!atomic_dec_and_test(&tg->ref))
153 return;
154 kfree(tg);
155}
156
157static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td,
158 struct cgroup *cgroup)
159{
160 struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
161 struct throtl_grp *tg = NULL;
162 void *key = td;
163 struct backing_dev_info *bdi = &td->queue->backing_dev_info;
164 unsigned int major, minor;
165
166 /*
167 * TODO: Speed up blkiocg_lookup_group() by maintaining a radix
168 * tree of blkg (instead of traversing through the hash list all
169 * the time).
170 */
171 tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
172
173 /* Fill in device details for root group */
174 if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
175 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
176 tg->blkg.dev = MKDEV(major, minor);
177 goto done;
178 }
179
180 if (tg)
181 goto done;
182
183 tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
184 if (!tg)
185 goto done;
186
187 INIT_HLIST_NODE(&tg->tg_node);
188 RB_CLEAR_NODE(&tg->rb_node);
189 bio_list_init(&tg->bio_lists[0]);
190 bio_list_init(&tg->bio_lists[1]);
191
192 /*
193 * Take the initial reference that will be released on destroy.
194 * This can be thought of as a joint reference held by the cgroup and
195 * the request queue; it will be dropped by either the request queue
196 * exit path or the cgroup deletion path, whichever runs first.
197 */
198 atomic_set(&tg->ref, 1);
199
200 /* Add group onto cgroup list */
201 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
202 blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td,
203 MKDEV(major, minor), BLKIO_POLICY_THROTL);
204
205 tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
206 tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev);
207 tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev);
208 tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev);
209
210 hlist_add_head(&tg->tg_node, &td->tg_list);
211 td->nr_undestroyed_grps++;
212done:
213 return tg;
214}
215
216static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
217{
218 struct cgroup *cgroup;
219 struct throtl_grp *tg = NULL;
220
221 rcu_read_lock();
222 cgroup = task_cgroup(current, blkio_subsys_id);
223 tg = throtl_find_alloc_tg(td, cgroup);
224 if (!tg)
225 tg = &td->root_tg;
226 rcu_read_unlock();
227 return tg;
228}
229
230static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root)
231{
232 /* Service tree is empty */
233 if (!root->count)
234 return NULL;
235
236 if (!root->left)
237 root->left = rb_first(&root->rb);
238
239 if (root->left)
240 return rb_entry_tg(root->left);
241
242 return NULL;
243}
244
245static void rb_erase_init(struct rb_node *n, struct rb_root *root)
246{
247 rb_erase(n, root);
248 RB_CLEAR_NODE(n);
249}
250
251static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root)
252{
253 if (root->left == n)
254 root->left = NULL;
255 rb_erase_init(n, &root->rb);
256 --root->count;
257}
258
259static void update_min_dispatch_time(struct throtl_rb_root *st)
260{
261 struct throtl_grp *tg;
262
263 tg = throtl_rb_first(st);
264 if (!tg)
265 return;
266
267 st->min_disptime = tg->disptime;
268}
269
270static void
271tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
272{
273 struct rb_node **node = &st->rb.rb_node;
274 struct rb_node *parent = NULL;
275 struct throtl_grp *__tg;
276 unsigned long key = tg->disptime;
277 int left = 1;
278
279 while (*node != NULL) {
280 parent = *node;
281 __tg = rb_entry_tg(parent);
282
283 if (time_before(key, __tg->disptime))
284 node = &parent->rb_left;
285 else {
286 node = &parent->rb_right;
287 left = 0;
288 }
289 }
290
291 if (left)
292 st->left = &tg->rb_node;
293
294 rb_link_node(&tg->rb_node, parent, node);
295 rb_insert_color(&tg->rb_node, &st->rb);
296}
297
298static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
299{
300 struct throtl_rb_root *st = &td->tg_service_tree;
301
302 tg_service_tree_add(st, tg);
303 throtl_mark_tg_on_rr(tg);
304 st->count++;
305}
306
307static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
308{
309 if (!throtl_tg_on_rr(tg))
310 __throtl_enqueue_tg(td, tg);
311}
312
313static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
314{
315 throtl_rb_erase(&tg->rb_node, &td->tg_service_tree);
316 throtl_clear_tg_on_rr(tg);
317}
318
319static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
320{
321 if (throtl_tg_on_rr(tg))
322 __throtl_dequeue_tg(td, tg);
323}
324
325static void throtl_schedule_next_dispatch(struct throtl_data *td)
326{
327 struct throtl_rb_root *st = &td->tg_service_tree;
328
329 /*
330 * If there are more bios pending, schedule more work.
331 */
332 if (!total_nr_queued(td))
333 return;
334
335 BUG_ON(!st->count);
336
337 update_min_dispatch_time(st);
338
339 if (time_before_eq(st->min_disptime, jiffies))
340 throtl_schedule_delayed_work(td->queue, 0);
341 else
342 throtl_schedule_delayed_work(td->queue,
343 (st->min_disptime - jiffies));
344}
345
346static inline void
347throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
348{
349 tg->bytes_disp[rw] = 0;
350 tg->io_disp[rw] = 0;
351 tg->slice_start[rw] = jiffies;
352 tg->slice_end[rw] = jiffies + throtl_slice;
353 throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
354 rw == READ ? 'R' : 'W', tg->slice_start[rw],
355 tg->slice_end[rw], jiffies);
356}
357
358static inline void throtl_extend_slice(struct throtl_data *td,
359 struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
360{
361 tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
362 throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
363 rw == READ ? 'R' : 'W', tg->slice_start[rw],
364 tg->slice_end[rw], jiffies);
365}
366
367/* Determine if previously allocated or extended slice is complete or not */
368static bool
369throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
370{
371 if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
372 return 0;
373
374 return 1;
375}
376
377/* Trim the used slices and adjust slice start accordingly */
378static inline void
379throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
380{
381 unsigned long nr_slices, time_elapsed, io_trim;
382 u64 bytes_trim, tmp;
383
384 BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));
385
386 /*
387 * If bps are unlimited (-1), the time slice doesn't get
388 * renewed. Don't try to trim a slice that has already been used up;
389 * a new slice will start when appropriate.
390 */
391 if (throtl_slice_used(td, tg, rw))
392 return;
393
394 time_elapsed = jiffies - tg->slice_start[rw];
395
396 nr_slices = time_elapsed / throtl_slice;
397
398 if (!nr_slices)
399 return;
400 tmp = tg->bps[rw] * throtl_slice * nr_slices;
401 do_div(tmp, HZ);
402 bytes_trim = tmp;
403
404 io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;
405
406 if (!bytes_trim && !io_trim)
407 return;
408
409 if (tg->bytes_disp[rw] >= bytes_trim)
410 tg->bytes_disp[rw] -= bytes_trim;
411 else
412 tg->bytes_disp[rw] = 0;
413
414 if (tg->io_disp[rw] >= io_trim)
415 tg->io_disp[rw] -= io_trim;
416 else
417 tg->io_disp[rw] = 0;
418
419 tg->slice_start[rw] += nr_slices * throtl_slice;
420
421 throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu"
422 " start=%lu end=%lu jiffies=%lu",
423 rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
424 tg->slice_start[rw], tg->slice_end[rw], jiffies);
425}
426
427static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
428 struct bio *bio, unsigned long *wait)
429{
430 bool rw = bio_data_dir(bio);
431 unsigned int io_allowed;
432 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
433 u64 tmp;
434
435 jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
436
437 /* Slice has just started. Consider one slice interval */
438 if (!jiffy_elapsed)
439 jiffy_elapsed_rnd = throtl_slice;
440
441 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
442
443 /*
444 * jiffy_elapsed_rnd should not be a big value: the minimum iops is
445 * 1, so at most the elapsed time is equivalent to 1 second, since we
446 * allow dispatch after 1 second and by then the slice should
447 * have been trimmed.
448 */
449
450 tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd;
451 do_div(tmp, HZ);
452
453 if (tmp > UINT_MAX)
454 io_allowed = UINT_MAX;
455 else
456 io_allowed = tmp;
457
458 if (tg->io_disp[rw] + 1 <= io_allowed) {
459 if (wait)
460 *wait = 0;
461 return 1;
462 }
463
464 /* Calc approx time to dispatch */
465 jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1;
466
467 if (jiffy_wait > jiffy_elapsed)
468 jiffy_wait = jiffy_wait - jiffy_elapsed;
469 else
470 jiffy_wait = 1;
471
472 if (wait)
473 *wait = jiffy_wait;
474 return 0;
475}
476
477static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
478 struct bio *bio, unsigned long *wait)
479{
480 bool rw = bio_data_dir(bio);
481 u64 bytes_allowed, extra_bytes, tmp;
482 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
483
484 jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
485
486 /* Slice has just started. Consider one slice interval */
487 if (!jiffy_elapsed)
488 jiffy_elapsed_rnd = throtl_slice;
489
490 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
491
492 tmp = tg->bps[rw] * jiffy_elapsed_rnd;
493 do_div(tmp, HZ);
494 bytes_allowed = tmp;
495
496 if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) {
497 if (wait)
498 *wait = 0;
499 return 1;
500 }
501
502 /* Calc approx time to dispatch */
503 extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed;
504 jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);
505
506 if (!jiffy_wait)
507 jiffy_wait = 1;
508
509 /*
510 * This wait time is without taking into consideration the rounding
511 * up we did. Add that time also.
512 */
513 jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
514 if (wait)
515 *wait = jiffy_wait;
516 return 0;
517}
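/*
 * Worked example (annotation, not part of the file): with
 * tg->bps[WRITE] = 1048576 (1 MiB/s), HZ = 1000 and throtl_slice =
 * HZ/10, a freshly started slice gets jiffy_elapsed_rnd = 100, so
 * bytes_allowed = 1048576 * 100 / 1000 = 104857.  A queued 1 MiB bio
 * exceeds that, so extra_bytes = 1048576 - 104857 = 943719 and
 * jiffy_wait = 943719 * 1000 / 1048576 = 900; adding back the 100
 * jiffies of round-up gives a total wait of 1000 jiffies, i.e. the
 * bio is released after about one second, exactly when 1 MiB of
 * budget has accrued at the configured rate.
 */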
518
519/*
520 * Returns whether one can dispatch a bio or not. Also returns the approximate
521 * number of jiffies to wait before this bio is within the IO rate and can be dispatched
522 */
523static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
524 struct bio *bio, unsigned long *wait)
525{
526 bool rw = bio_data_dir(bio);
527 unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
528
529 /*
530 * Currently the whole state machine of the group depends on the first
531 * bio queued in the group's bio list, so this function must not be
532 * called with a different bio if there are other bios
533 * queued.
534 */
535 BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw]));
536
537 /* If tg->bps = -1, then BW is unlimited */
538 if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
539 if (wait)
540 *wait = 0;
541 return 1;
542 }
543
544 /*
545 * If the previous slice expired, start a new one; otherwise renew/extend
546 * the existing slice to make sure it is at least throtl_slice
547 * long from now.
548 */
549 if (throtl_slice_used(td, tg, rw))
550 throtl_start_new_slice(td, tg, rw);
551 else {
552 if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
553 throtl_extend_slice(td, tg, rw, jiffies + throtl_slice);
554 }
555
556 if (tg_with_in_bps_limit(td, tg, bio, &bps_wait)
557 && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) {
558 if (wait)
559 *wait = 0;
560 return 1;
561 }
562
563 max_wait = max(bps_wait, iops_wait);
564
565 if (wait)
566 *wait = max_wait;
567
568 if (time_before(tg->slice_end[rw], jiffies + max_wait))
569 throtl_extend_slice(td, tg, rw, jiffies + max_wait);
570
571 return 0;
572}
573
574static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
575{
576 bool rw = bio_data_dir(bio);
577 bool sync = bio->bi_rw & REQ_SYNC;
578
579 /* Charge the bio to the group */
580 tg->bytes_disp[rw] += bio->bi_size;
581 tg->io_disp[rw]++;
582
583 /*
584 * TODO: This will take blkg->stats_lock. Figure out a way
585 * to avoid this cost.
586 */
587 blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync);
588}
589
590static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
591 struct bio *bio)
592{
593 bool rw = bio_data_dir(bio);
594
595 bio_list_add(&tg->bio_lists[rw], bio);
596 /* Take a bio reference on tg */
597 throtl_ref_get_tg(tg);
598 tg->nr_queued[rw]++;
599 td->nr_queued[rw]++;
600 throtl_enqueue_tg(td, tg);
601}
602
603static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg)
604{
605 unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
606 struct bio *bio;
607
608 if ((bio = bio_list_peek(&tg->bio_lists[READ])))
609 tg_may_dispatch(td, tg, bio, &read_wait);
610
611 if ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
612 tg_may_dispatch(td, tg, bio, &write_wait);
613
614 min_wait = min(read_wait, write_wait);
615 disptime = jiffies + min_wait;
616
617 /* Update dispatch time */
618 throtl_dequeue_tg(td, tg);
619 tg->disptime = disptime;
620 throtl_enqueue_tg(td, tg);
621}
622
623static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
624 bool rw, struct bio_list *bl)
625{
626 struct bio *bio;
627
628 bio = bio_list_pop(&tg->bio_lists[rw]);
629 tg->nr_queued[rw]--;
630 /* Drop bio reference on tg */
631 throtl_put_tg(tg);
632
633 BUG_ON(td->nr_queued[rw] <= 0);
634 td->nr_queued[rw]--;
635
636 throtl_charge_bio(tg, bio);
637 bio_list_add(bl, bio);
638 bio->bi_rw |= REQ_THROTTLED;
639
640 throtl_trim_slice(td, tg, rw);
641}
642
643static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
644 struct bio_list *bl)
645{
646 unsigned int nr_reads = 0, nr_writes = 0;
647 unsigned int max_nr_reads = throtl_grp_quantum*3/4;
648 unsigned int max_nr_writes = throtl_grp_quantum - nr_reads;
649 struct bio *bio;
650
651 /* Try to dispatch 75% READS and 25% WRITES */
652
653 while ((bio = bio_list_peek(&tg->bio_lists[READ]))
654 && tg_may_dispatch(td, tg, bio, NULL)) {
655
656 tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
657 nr_reads++;
658
659 if (nr_reads >= max_nr_reads)
660 break;
661 }
662
663 while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))
664 && tg_may_dispatch(td, tg, bio, NULL)) {
665
666 tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
667 nr_writes++;
668
669 if (nr_writes >= max_nr_writes)
670 break;
671 }
672
673 return nr_reads + nr_writes;
674}
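
throtl_dispatch_tg() above caps one dispatch round at throtl_grp_quantum bios per group, reserving roughly 75% of the budget for reads. A simplified sketch of that split, using counters in place of the bio lists and assuming the quantum of 8 used in this patch (the function itself derives the write budget slightly differently; this shows the intended 6/2 split):

    #include <stdio.h>

    #define GRP_QUANTUM 8           /* per-group budget for one dispatch round */

    /* Drain up to budget items from a counter standing in for a bio list. */
    static unsigned int drain(unsigned int *queued, unsigned int budget)
    {
            unsigned int n = (*queued < budget) ? *queued : budget;

            *queued -= n;
            return n;
    }

    int main(void)
    {
            unsigned int reads = 10, writes = 10;
            unsigned int max_reads = GRP_QUANTUM * 3 / 4;           /* 6 */
            unsigned int max_writes = GRP_QUANTUM - max_reads;      /* 2 */

            /* Try to dispatch ~75% reads and ~25% writes per round. */
            unsigned int nr_reads = drain(&reads, max_reads);
            unsigned int nr_writes = drain(&writes, max_writes);

            printf("dispatched %u reads, %u writes (left %u/%u)\n",
                   nr_reads, nr_writes, reads, writes);
            return 0;
    }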
675
676static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
677{
678 unsigned int nr_disp = 0;
679 struct throtl_grp *tg;
680 struct throtl_rb_root *st = &td->tg_service_tree;
681
682 while (1) {
683 tg = throtl_rb_first(st);
684
685 if (!tg)
686 break;
687
688 if (time_before(jiffies, tg->disptime))
689 break;
690
691 throtl_dequeue_tg(td, tg);
692
693 nr_disp += throtl_dispatch_tg(td, tg, bl);
694
695 if (tg->nr_queued[0] || tg->nr_queued[1]) {
696 tg_update_disptime(td, tg);
697 throtl_enqueue_tg(td, tg);
698 }
699
700 if (nr_disp >= throtl_quantum)
701 break;
702 }
703
704 return nr_disp;
705}
706
707static void throtl_process_limit_change(struct throtl_data *td)
708{
709 struct throtl_grp *tg;
710 struct hlist_node *pos, *n;
711
712 /*
713	 * Make sure the atomic_inc() effects from the
714	 * throtl_update_blkio_group_read_bps() family of functions are
715	 * visible.
716	 * Is this required, or would smp_mb__after_atomic_inc() issued
717	 * after the atomic_inc() have been sufficient?
718 */
719 smp_rmb();
720 if (!atomic_read(&td->limits_changed))
721 return;
722
723 throtl_log(td, "limit changed =%d", atomic_read(&td->limits_changed));
724
725 hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
726 /*
727		 * Do I need an smp_rmb() here to make sure the tg->limits_changed
728		 * update is visible? I am relying on the smp_rmb() at the
729		 * beginning of the function and not adding a new one here.
730 */
731
732 if (throtl_tg_on_rr(tg) && tg->limits_changed) {
733 throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
734 " riops=%u wiops=%u", tg->bps[READ],
735 tg->bps[WRITE], tg->iops[READ],
736 tg->iops[WRITE]);
737 tg_update_disptime(td, tg);
738 tg->limits_changed = false;
739 }
740 }
741
742 smp_mb__before_atomic_dec();
743 atomic_dec(&td->limits_changed);
744 smp_mb__after_atomic_dec();
745}
746
747/* Dispatch throttled bios. Should be called without queue lock held. */
748static int throtl_dispatch(struct request_queue *q)
749{
750 struct throtl_data *td = q->td;
751 unsigned int nr_disp = 0;
752 struct bio_list bio_list_on_stack;
753 struct bio *bio;
754
755 spin_lock_irq(q->queue_lock);
756
757 throtl_process_limit_change(td);
758
759 if (!total_nr_queued(td))
760 goto out;
761
762 bio_list_init(&bio_list_on_stack);
763
764 throtl_log(td, "dispatch nr_queued=%lu read=%u write=%u",
765 total_nr_queued(td), td->nr_queued[READ],
766 td->nr_queued[WRITE]);
767
768 nr_disp = throtl_select_dispatch(td, &bio_list_on_stack);
769
770 if (nr_disp)
771 throtl_log(td, "bios disp=%u", nr_disp);
772
773 throtl_schedule_next_dispatch(td);
774out:
775 spin_unlock_irq(q->queue_lock);
776
777 /*
778 * If we dispatched some requests, unplug the queue to make sure
779 * immediate dispatch
780 */
781 if (nr_disp) {
782 while((bio = bio_list_pop(&bio_list_on_stack)))
783 generic_make_request(bio);
784 blk_unplug(q);
785 }
786 return nr_disp;
787}
788
789void blk_throtl_work(struct work_struct *work)
790{
791 struct throtl_data *td = container_of(work, struct throtl_data,
792 throtl_work.work);
793 struct request_queue *q = td->queue;
794
795 throtl_dispatch(q);
796}
797
798/* Call with queue lock held */
799void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay)
800{
801
802 struct throtl_data *td = q->td;
803 struct delayed_work *dwork = &td->throtl_work;
804
805 if (total_nr_queued(td) > 0) {
806 /*
807		 * We might have a work item scheduled to run in the future.
808 * Cancel that and schedule a new one.
809 */
810 __cancel_delayed_work(dwork);
811 kblockd_schedule_delayed_work(q, dwork, delay);
812 throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
813 delay, jiffies);
814 }
815}
816EXPORT_SYMBOL(throtl_schedule_delayed_work);
817
818static void
819throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg)
820{
821 /* Something wrong if we are trying to remove same group twice */
822 BUG_ON(hlist_unhashed(&tg->tg_node));
823
824 hlist_del_init(&tg->tg_node);
825
826 /*
827 * Put the reference taken at the time of creation so that when all
828	 * queues are gone, the group can be destroyed.
829 */
830 throtl_put_tg(tg);
831 td->nr_undestroyed_grps--;
832}
833
834static void throtl_release_tgs(struct throtl_data *td)
835{
836 struct hlist_node *pos, *n;
837 struct throtl_grp *tg;
838
839 hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
840 /*
841		 * If the cgroup removal path got to the blkio_group first and removed
842		 * it from the cgroup list, then it will take care of destroying
843		 * the throtl_grp as well.
844 */
845 if (!blkiocg_del_blkio_group(&tg->blkg))
846 throtl_destroy_tg(td, tg);
847 }
848}
849
850static void throtl_td_free(struct throtl_data *td)
851{
852 kfree(td);
853}
854
855/*
856 * Blk cgroup controller notification saying that the blkio_group object is
857 * being delinked as the associated cgroup object is going away. That also
858 * means no new IO will come into this group. So get rid of this group as
859 * soon as any pending IO in the group is finished.
860 *
861 * This function is called under rcu_read_lock(). key is the rcu protected
862 * pointer. That means "key" is a valid throtl_data pointer as long as we are
863 * under the rcu read lock.
864 *
865 * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
866 * it should not be NULL: even if the queue was going away, the cgroup
867 * deletion path got to it first.
868 */
869void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg)
870{
871 unsigned long flags;
872 struct throtl_data *td = key;
873
874 spin_lock_irqsave(td->queue->queue_lock, flags);
875 throtl_destroy_tg(td, tg_of_blkg(blkg));
876 spin_unlock_irqrestore(td->queue->queue_lock, flags);
877}
878
879/*
880 * For all update functions, key should be a valid pointer because these
881 * update functions are called under blkcg_lock, which means blkg is
882 * valid and in turn key is valid. The queue exit path cannot race because
883 * of blkcg_lock.
884 *
885 * We cannot take the queue lock in update functions, as taking queue_lock
886 * under blkcg_lock is not allowed. Other paths take blkcg_lock under queue_lock.
887 */
888static void throtl_update_blkio_group_read_bps(void *key,
889 struct blkio_group *blkg, u64 read_bps)
890{
891 struct throtl_data *td = key;
892
893 tg_of_blkg(blkg)->bps[READ] = read_bps;
894 /* Make sure read_bps is updated before setting limits_changed */
895 smp_wmb();
896 tg_of_blkg(blkg)->limits_changed = true;
897
898 /* Make sure tg->limits_changed is updated before td->limits_changed */
899 smp_mb__before_atomic_inc();
900 atomic_inc(&td->limits_changed);
901 smp_mb__after_atomic_inc();
902
903 /* Schedule a work now to process the limit change */
904 throtl_schedule_delayed_work(td->queue, 0);
905}
906
907static void throtl_update_blkio_group_write_bps(void *key,
908 struct blkio_group *blkg, u64 write_bps)
909{
910 struct throtl_data *td = key;
911
912 tg_of_blkg(blkg)->bps[WRITE] = write_bps;
913 smp_wmb();
914 tg_of_blkg(blkg)->limits_changed = true;
915 smp_mb__before_atomic_inc();
916 atomic_inc(&td->limits_changed);
917 smp_mb__after_atomic_inc();
918 throtl_schedule_delayed_work(td->queue, 0);
919}
920
921static void throtl_update_blkio_group_read_iops(void *key,
922 struct blkio_group *blkg, unsigned int read_iops)
923{
924 struct throtl_data *td = key;
925
926 tg_of_blkg(blkg)->iops[READ] = read_iops;
927 smp_wmb();
928 tg_of_blkg(blkg)->limits_changed = true;
929 smp_mb__before_atomic_inc();
930 atomic_inc(&td->limits_changed);
931 smp_mb__after_atomic_inc();
932 throtl_schedule_delayed_work(td->queue, 0);
933}
934
935static void throtl_update_blkio_group_write_iops(void *key,
936 struct blkio_group *blkg, unsigned int write_iops)
937{
938 struct throtl_data *td = key;
939
940 tg_of_blkg(blkg)->iops[WRITE] = write_iops;
941 smp_wmb();
942 tg_of_blkg(blkg)->limits_changed = true;
943 smp_mb__before_atomic_inc();
944 atomic_inc(&td->limits_changed);
945 smp_mb__after_atomic_inc();
946 throtl_schedule_delayed_work(td->queue, 0);
947}
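
All four update callbacks above publish a new limit with the same three steps: store the limit, set tg->limits_changed behind a write barrier, then bump td->limits_changed so throtl_process_limit_change() notices. The sketch below is a rough userspace analogue of that handshake using C11 atomics in place of smp_wmb()/atomic_inc(); the structure and names are illustrative only, not the kernel API.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct group {
            uint64_t bps_read;              /* the limit being published */
            atomic_bool limits_changed;     /* per-group "dirty" flag */
    };

    static atomic_int pending_changes;      /* analogue of td->limits_changed */

    /* Writer side: analogue of throtl_update_blkio_group_read_bps(). */
    static void update_read_bps(struct group *grp, uint64_t bps)
    {
            grp->bps_read = bps;
            /* Release ordering plays the role of smp_wmb(): the new limit is
             * visible before the flag that advertises it. */
            atomic_store_explicit(&grp->limits_changed, true, memory_order_release);
            atomic_fetch_add_explicit(&pending_changes, 1, memory_order_seq_cst);
            /* ...then kick the dispatch worker, as the patch does. */
    }

    /* Reader side: analogue of throtl_process_limit_change(). */
    static bool process_limit_change(struct group *grp, uint64_t *bps_out)
    {
            if (!atomic_load_explicit(&pending_changes, memory_order_seq_cst))
                    return false;

            if (atomic_exchange_explicit(&grp->limits_changed, false,
                                         memory_order_acquire))
                    *bps_out = grp->bps_read;  /* sees the value stored above */

            atomic_fetch_sub_explicit(&pending_changes, 1, memory_order_seq_cst);
            return true;
    }

    int main(void)
    {
            struct group g = { .bps_read = 0 };
            uint64_t bps = 0;

            atomic_init(&g.limits_changed, false);
            atomic_init(&pending_changes, 0);

            update_read_bps(&g, 1048576);
            if (process_limit_change(&g, &bps))
                    printf("new read bps: %llu\n", (unsigned long long)bps);
            return 0;
    }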
948
949void throtl_shutdown_timer_wq(struct request_queue *q)
950{
951 struct throtl_data *td = q->td;
952
953 cancel_delayed_work_sync(&td->throtl_work);
954}
955
956static struct blkio_policy_type blkio_policy_throtl = {
957 .ops = {
958 .blkio_unlink_group_fn = throtl_unlink_blkio_group,
959 .blkio_update_group_read_bps_fn =
960 throtl_update_blkio_group_read_bps,
961 .blkio_update_group_write_bps_fn =
962 throtl_update_blkio_group_write_bps,
963 .blkio_update_group_read_iops_fn =
964 throtl_update_blkio_group_read_iops,
965 .blkio_update_group_write_iops_fn =
966 throtl_update_blkio_group_write_iops,
967 },
968 .plid = BLKIO_POLICY_THROTL,
969};
970
971int blk_throtl_bio(struct request_queue *q, struct bio **biop)
972{
973 struct throtl_data *td = q->td;
974 struct throtl_grp *tg;
975 struct bio *bio = *biop;
976 bool rw = bio_data_dir(bio), update_disptime = true;
977
978 if (bio->bi_rw & REQ_THROTTLED) {
979 bio->bi_rw &= ~REQ_THROTTLED;
980 return 0;
981 }
982
983 spin_lock_irq(q->queue_lock);
984 tg = throtl_get_tg(td);
985
986 if (tg->nr_queued[rw]) {
987 /*
988		 * There is already another bio queued in the same direction. No
989 * need to update dispatch time.
990 * Still update the disptime if rate limits on this group
991 * were changed.
992 */
993 if (!tg->limits_changed)
994 update_disptime = false;
995 else
996 tg->limits_changed = false;
997
998 goto queue_bio;
999 }
1000
1001	 /* Bio is within the rate limit of the group */
1002 if (tg_may_dispatch(td, tg, bio, NULL)) {
1003 throtl_charge_bio(tg, bio);
1004 goto out;
1005 }
1006
1007queue_bio:
1008 throtl_log_tg(td, tg, "[%c] bio. bdisp=%u sz=%u bps=%llu"
1009 " iodisp=%u iops=%u queued=%d/%d",
1010 rw == READ ? 'R' : 'W',
1011 tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
1012 tg->io_disp[rw], tg->iops[rw],
1013 tg->nr_queued[READ], tg->nr_queued[WRITE]);
1014
1015 throtl_add_bio_tg(q->td, tg, bio);
1016 *biop = NULL;
1017
1018 if (update_disptime) {
1019 tg_update_disptime(td, tg);
1020 throtl_schedule_next_dispatch(td);
1021 }
1022
1023out:
1024 spin_unlock_irq(q->queue_lock);
1025 return 0;
1026}
1027
1028int blk_throtl_init(struct request_queue *q)
1029{
1030 struct throtl_data *td;
1031 struct throtl_grp *tg;
1032
1033 td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
1034 if (!td)
1035 return -ENOMEM;
1036
1037 INIT_HLIST_HEAD(&td->tg_list);
1038 td->tg_service_tree = THROTL_RB_ROOT;
1039 atomic_set(&td->limits_changed, 0);
1040
1041 /* Init root group */
1042 tg = &td->root_tg;
1043 INIT_HLIST_NODE(&tg->tg_node);
1044 RB_CLEAR_NODE(&tg->rb_node);
1045 bio_list_init(&tg->bio_lists[0]);
1046 bio_list_init(&tg->bio_lists[1]);
1047
1048 /* Practically unlimited BW */
1049 tg->bps[0] = tg->bps[1] = -1;
1050 tg->iops[0] = tg->iops[1] = -1;
1051
1052 /*
1053 * Set root group reference to 2. One reference will be dropped when
1054	 * all groups on tg_list are deleted during queue exit. The other
1055	 * reference will remain because we don't want to delete this group:
1056	 * it is statically allocated and gets destroyed when throtl_data
1057	 * goes away.
1058 */
1059 atomic_set(&tg->ref, 2);
1060 hlist_add_head(&tg->tg_node, &td->tg_list);
1061 td->nr_undestroyed_grps++;
1062
1063 INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
1064
1065 rcu_read_lock();
1066 blkiocg_add_blkio_group(&blkio_root_cgroup, &tg->blkg, (void *)td,
1067 0, BLKIO_POLICY_THROTL);
1068 rcu_read_unlock();
1069
1070 /* Attach throtl data to request queue */
1071 td->queue = q;
1072 q->td = td;
1073 return 0;
1074}
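
blk_throtl_init() marks the root group as unthrottled by writing -1 into bps and iops, and tg_may_dispatch() tests tg->bps[rw] == -1. With bps declared earlier in this file as u64 and iops as unsigned int, the comparison works because -1 converts to the maximum value of the unsigned type on both sides. A quick standalone check of that conversion rule:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t bps = -1;              /* wraps to UINT64_MAX: "unlimited" */
            unsigned int iops = -1;         /* wraps to UINT_MAX */

            /* Both comparisons are true: the int constant -1 is converted to
             * the unsigned maximum before comparing. */
            printf("bps unlimited: %d, iops unlimited: %d\n",
                   bps == (uint64_t)-1, iops == (unsigned int)-1);
            printf("bps sentinel value: %llu\n", (unsigned long long)bps);
            return 0;
    }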
1075
1076void blk_throtl_exit(struct request_queue *q)
1077{
1078 struct throtl_data *td = q->td;
1079 bool wait = false;
1080
1081 BUG_ON(!td);
1082
1083 throtl_shutdown_timer_wq(q);
1084
1085 spin_lock_irq(q->queue_lock);
1086 throtl_release_tgs(td);
1087
1088 /* If there are other groups */
1089 if (td->nr_undestroyed_grps > 0)
1090 wait = true;
1091
1092 spin_unlock_irq(q->queue_lock);
1093
1094 /*
1095 * Wait for tg->blkg->key accessors to exit their grace periods.
1096 * Do this wait only if there are other undestroyed groups out
1097	 * there (other than the root group). This can happen if the cgroup
1098	 * deletion path claimed responsibility for cleaning up a group before
1099	 * the queue cleanup code gets to the group.
1100	 *
1101	 * Do not call synchronize_rcu() unconditionally, as there are drivers
1102	 * which create/delete request queues hundreds of times during scan/boot
1103	 * and synchronize_rcu() can take significant time and slow down boot.
1104 */
1105 if (wait)
1106 synchronize_rcu();
1107
1108 /*
1109	 * Just to be safe: if somebody updated the limits through the cgroup
1110	 * after the previous flush and another work item got queued, cancel
1111	 * it.
1112 */
1113 throtl_shutdown_timer_wq(q);
1114 throtl_td_free(td);
1115}
1116
1117static int __init throtl_init(void)
1118{
1119 blkio_policy_register(&blkio_policy_throtl);
1120 return 0;
1121}
1122
1123module_init(throtl_init);
diff --git a/block/blk.h b/block/blk.h
index 6e7dc87141e4..1e675e5ade02 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -51,6 +51,8 @@ static inline void blk_clear_rq_complete(struct request *rq)
51 */ 51 */
52#define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) 52#define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash))
53 53
54struct request *blk_do_flush(struct request_queue *q, struct request *rq);
55
54static inline struct request *__elv_next_request(struct request_queue *q) 56static inline struct request *__elv_next_request(struct request_queue *q)
55{ 57{
56 struct request *rq; 58 struct request *rq;
@@ -58,7 +60,11 @@ static inline struct request *__elv_next_request(struct request_queue *q)
58 while (1) { 60 while (1) {
59 while (!list_empty(&q->queue_head)) { 61 while (!list_empty(&q->queue_head)) {
60 rq = list_entry_rq(q->queue_head.next); 62 rq = list_entry_rq(q->queue_head.next);
61 if (blk_do_ordered(q, &rq)) 63 if (!(rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) ||
64 rq == &q->flush_rq)
65 return rq;
66 rq = blk_do_flush(q, rq);
67 if (rq)
62 return rq; 68 return rq;
63 } 69 }
64 70
@@ -110,10 +116,6 @@ void blk_queue_congestion_threshold(struct request_queue *q);
110 116
111int blk_dev_init(void); 117int blk_dev_init(void);
112 118
113void elv_quiesce_start(struct request_queue *q);
114void elv_quiesce_end(struct request_queue *q);
115
116
117/* 119/*
118 * Return the threshold (number of used requests) at which the queue is 120 * Return the threshold (number of used requests) at which the queue is
119 * considered to be congested. It includes a little hysteresis to keep the 121
@@ -132,24 +134,20 @@ static inline int queue_congestion_off_threshold(struct request_queue *q)
132 return q->nr_congestion_off; 134 return q->nr_congestion_off;
133} 135}
134 136
135#if defined(CONFIG_BLK_DEV_INTEGRITY)
136
137#define rq_for_each_integrity_segment(bvl, _rq, _iter) \
138 __rq_for_each_bio(_iter.bio, _rq) \
139 bip_for_each_vec(bvl, _iter.bio->bi_integrity, _iter.i)
140
141#endif /* BLK_DEV_INTEGRITY */
142
143static inline int blk_cpu_to_group(int cpu) 137static inline int blk_cpu_to_group(int cpu)
144{ 138{
139 int group = NR_CPUS;
145#ifdef CONFIG_SCHED_MC 140#ifdef CONFIG_SCHED_MC
146 const struct cpumask *mask = cpu_coregroup_mask(cpu); 141 const struct cpumask *mask = cpu_coregroup_mask(cpu);
147 return cpumask_first(mask); 142 group = cpumask_first(mask);
148#elif defined(CONFIG_SCHED_SMT) 143#elif defined(CONFIG_SCHED_SMT)
149 return cpumask_first(topology_thread_cpumask(cpu)); 144 group = cpumask_first(topology_thread_cpumask(cpu));
150#else 145#else
151 return cpu; 146 return cpu;
152#endif 147#endif
148 if (likely(group < NR_CPUS))
149 return group;
150 return cpu;
153} 151}
154 152
155/* 153/*
diff --git a/block/bsg.c b/block/bsg.c
index 82d58829ba59..f20d6a789d48 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -20,7 +20,6 @@
20#include <linux/uio.h> 20#include <linux/uio.h>
21#include <linux/idr.h> 21#include <linux/idr.h>
22#include <linux/bsg.h> 22#include <linux/bsg.h>
23#include <linux/smp_lock.h>
24#include <linux/slab.h> 23#include <linux/slab.h>
25 24
26#include <scsi/scsi.h> 25#include <scsi/scsi.h>
@@ -426,7 +425,7 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
426 /* 425 /*
427 * fill in all the output members 426 * fill in all the output members
428 */ 427 */
429 hdr->device_status = status_byte(rq->errors); 428 hdr->device_status = rq->errors & 0xff;
430 hdr->transport_status = host_byte(rq->errors); 429 hdr->transport_status = host_byte(rq->errors);
431 hdr->driver_status = driver_byte(rq->errors); 430 hdr->driver_status = driver_byte(rq->errors);
432 hdr->info = 0; 431 hdr->info = 0;
@@ -843,9 +842,7 @@ static int bsg_open(struct inode *inode, struct file *file)
843{ 842{
844 struct bsg_device *bd; 843 struct bsg_device *bd;
845 844
846 lock_kernel();
847 bd = bsg_get_device(inode, file); 845 bd = bsg_get_device(inode, file);
848 unlock_kernel();
849 846
850 if (IS_ERR(bd)) 847 if (IS_ERR(bd))
851 return PTR_ERR(bd); 848 return PTR_ERR(bd);
@@ -968,6 +965,7 @@ static const struct file_operations bsg_fops = {
968 .release = bsg_release, 965 .release = bsg_release,
969 .unlocked_ioctl = bsg_ioctl, 966 .unlocked_ioctl = bsg_ioctl,
970 .owner = THIS_MODULE, 967 .owner = THIS_MODULE,
968 .llseek = default_llseek,
971}; 969};
972 970
973void bsg_unregister_queue(struct request_queue *q) 971void bsg_unregister_queue(struct request_queue *q)
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index eb4086f7dfef..4cd59b0d7c15 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -30,6 +30,7 @@ static const int cfq_slice_sync = HZ / 10;
30static int cfq_slice_async = HZ / 25; 30static int cfq_slice_async = HZ / 25;
31static const int cfq_slice_async_rq = 2; 31static const int cfq_slice_async_rq = 2;
32static int cfq_slice_idle = HZ / 125; 32static int cfq_slice_idle = HZ / 125;
33static int cfq_group_idle = HZ / 125;
33static const int cfq_target_latency = HZ * 3/10; /* 300 ms */ 34static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
34static const int cfq_hist_divisor = 4; 35static const int cfq_hist_divisor = 4;
35 36
@@ -147,6 +148,8 @@ struct cfq_queue {
147 struct cfq_queue *new_cfqq; 148 struct cfq_queue *new_cfqq;
148 struct cfq_group *cfqg; 149 struct cfq_group *cfqg;
149 struct cfq_group *orig_cfqg; 150 struct cfq_group *orig_cfqg;
151 /* Number of sectors dispatched from queue in single dispatch round */
152 unsigned long nr_sectors;
150}; 153};
151 154
152/* 155/*
@@ -157,6 +160,7 @@ enum wl_prio_t {
157 BE_WORKLOAD = 0, 160 BE_WORKLOAD = 0,
158 RT_WORKLOAD = 1, 161 RT_WORKLOAD = 1,
159 IDLE_WORKLOAD = 2, 162 IDLE_WORKLOAD = 2,
163 CFQ_PRIO_NR,
160}; 164};
161 165
162/* 166/*
@@ -181,10 +185,19 @@ struct cfq_group {
181 /* number of cfqq currently on this group */ 185 /* number of cfqq currently on this group */
182 int nr_cfqq; 186 int nr_cfqq;
183 187
184 /* Per group busy queus average. Useful for workload slice calc. */
185 unsigned int busy_queues_avg[2];
186 /* 188 /*
187 * rr lists of queues with requests, onle rr for each priority class. 189 * Per group busy queues average. Useful for workload slice calc. We
190 * create the array for each prio class but at run time it is used
191 * only for RT and BE class and slot for IDLE class remains unused.
192 * This is primarily done to avoid confusion and a gcc warning.
193 */
194 unsigned int busy_queues_avg[CFQ_PRIO_NR];
195 /*
196 * rr lists of queues with requests. We maintain service trees for
197 * RT and BE classes. These trees are subdivided into subclasses
198 * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE
199 * class there is no subclassification and all the cfq queues go on
200 * a single tree service_tree_idle.
188 * Counts are embedded in the cfq_rb_root 201 * Counts are embedded in the cfq_rb_root
189 */ 202 */
190 struct cfq_rb_root service_trees[2][3]; 203 struct cfq_rb_root service_trees[2][3];
@@ -198,6 +211,8 @@ struct cfq_group {
198 struct hlist_node cfqd_node; 211 struct hlist_node cfqd_node;
199 atomic_t ref; 212 atomic_t ref;
200#endif 213#endif
214 /* number of requests that are on the dispatch list or inside driver */
215 int dispatched;
201}; 216};
202 217
203/* 218/*
@@ -216,7 +231,6 @@ struct cfq_data {
216 enum wl_type_t serving_type; 231 enum wl_type_t serving_type;
217 unsigned long workload_expires; 232 unsigned long workload_expires;
218 struct cfq_group *serving_group; 233 struct cfq_group *serving_group;
219 bool noidle_tree_requires_idle;
220 234
221 /* 235 /*
222 * Each priority tree is sorted by next_request position. These 236 * Each priority tree is sorted by next_request position. These
@@ -271,6 +285,7 @@ struct cfq_data {
271 unsigned int cfq_slice[2]; 285 unsigned int cfq_slice[2];
272 unsigned int cfq_slice_async_rq; 286 unsigned int cfq_slice_async_rq;
273 unsigned int cfq_slice_idle; 287 unsigned int cfq_slice_idle;
288 unsigned int cfq_group_idle;
274 unsigned int cfq_latency; 289 unsigned int cfq_latency;
275 unsigned int cfq_group_isolation; 290 unsigned int cfq_group_isolation;
276 291
@@ -378,6 +393,21 @@ CFQ_CFQQ_FNS(wait_busy);
378 &cfqg->service_trees[i][j]: NULL) \ 393 &cfqg->service_trees[i][j]: NULL) \
379 394
380 395
396static inline bool iops_mode(struct cfq_data *cfqd)
397{
398 /*
399 * If we are not idling on queues and it is an NCQ drive, requests
400 * execute in parallel and measuring time is not possible in most
401 * cases unless we drive shallow queue depths, and that becomes a
402 * performance bottleneck. In such cases switch to providing
403 * fairness in terms of number of IOs.
404 */
405 if (!cfqd->cfq_slice_idle && cfqd->hw_tag)
406 return true;
407 else
408 return false;
409}
410
381static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq) 411static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
382{ 412{
383 if (cfq_class_idle(cfqq)) 413 if (cfq_class_idle(cfqq))
@@ -906,7 +936,6 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
906 slice_used = cfqq->allocated_slice; 936 slice_used = cfqq->allocated_slice;
907 } 937 }
908 938
909 cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u", slice_used);
910 return slice_used; 939 return slice_used;
911} 940}
912 941
@@ -914,19 +943,21 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
914 struct cfq_queue *cfqq) 943 struct cfq_queue *cfqq)
915{ 944{
916 struct cfq_rb_root *st = &cfqd->grp_service_tree; 945 struct cfq_rb_root *st = &cfqd->grp_service_tree;
917 unsigned int used_sl, charge_sl; 946 unsigned int used_sl, charge;
918 int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) 947 int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
919 - cfqg->service_tree_idle.count; 948 - cfqg->service_tree_idle.count;
920 949
921 BUG_ON(nr_sync < 0); 950 BUG_ON(nr_sync < 0);
922 used_sl = charge_sl = cfq_cfqq_slice_usage(cfqq); 951 used_sl = charge = cfq_cfqq_slice_usage(cfqq);
923 952
924 if (!cfq_cfqq_sync(cfqq) && !nr_sync) 953 if (iops_mode(cfqd))
925 charge_sl = cfqq->allocated_slice; 954 charge = cfqq->slice_dispatch;
955 else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
956 charge = cfqq->allocated_slice;
926 957
927 /* Can't update vdisktime while group is on service tree */ 958 /* Can't update vdisktime while group is on service tree */
928 cfq_rb_erase(&cfqg->rb_node, st); 959 cfq_rb_erase(&cfqg->rb_node, st);
929 cfqg->vdisktime += cfq_scale_slice(charge_sl, cfqg); 960 cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
930 __cfq_group_service_tree_add(st, cfqg); 961 __cfq_group_service_tree_add(st, cfqg);
931 962
932 /* This group is being expired. Save the context */ 963 /* This group is being expired. Save the context */
@@ -940,6 +971,9 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
940 971
941 cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, 972 cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
942 st->min_vdisktime); 973 st->min_vdisktime);
974 cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
975 " sect=%u", used_sl, cfqq->slice_dispatch, charge,
976 iops_mode(cfqd), cfqq->nr_sectors);
943 cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl); 977 cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl);
944 cfq_blkiocg_set_start_empty_time(&cfqg->blkg); 978 cfq_blkiocg_set_start_empty_time(&cfqg->blkg);
945} 979}
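
In cfq_group_served() above, the charge is either the time slice used or, in iops_mode(), the number of requests dispatched, and it is scaled by the group weight before being added to vdisktime. The sketch below models only that scaling step; the default weight and the lack of fixed-point shifting are simplifying assumptions, not details taken from this hunk.

    #include <stdint.h>
    #include <stdio.h>

    #define DEFAULT_WEIGHT 500      /* assumed default blkio weight */

    /* Heavier groups accumulate vdisktime more slowly, so the service tree
     * (sorted by vdisktime) picks them more often. */
    static uint64_t scale_charge(uint64_t charge, unsigned int weight)
    {
            return charge * DEFAULT_WEIGHT / weight;
    }

    int main(void)
    {
            uint64_t vt_heavy = 0, vt_light = 0;
            unsigned int i;

            /* Two groups receiving identical service: weight 1000 vs 100. */
            for (i = 0; i < 10; i++) {
                    vt_heavy += scale_charge(8, 1000);
                    vt_light += scale_charge(8, 100);
            }
            printf("vdisktime after equal service: weight 1000 -> %llu, "
                   "weight 100 -> %llu\n",
                   (unsigned long long)vt_heavy, (unsigned long long)vt_light);
            return 0;
    }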
@@ -952,8 +986,8 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
952 return NULL; 986 return NULL;
953} 987}
954 988
955void 989void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
956cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight) 990 unsigned int weight)
957{ 991{
958 cfqg_of_blkg(blkg)->weight = weight; 992 cfqg_of_blkg(blkg)->weight = weight;
959} 993}
@@ -994,10 +1028,20 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
994 */ 1028 */
995 atomic_set(&cfqg->ref, 1); 1029 atomic_set(&cfqg->ref, 1);
996 1030
997 /* Add group onto cgroup list */ 1031 /*
998 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); 1032 * Add group onto cgroup list. It might happen that bdi->dev is
999 cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, 1033 * not initialized yet. Initialize this new group without major
1034 * and minor info and this info will be filled in once a new thread
1035 * comes for IO. See code above.
1036 */
1037 if (bdi->dev) {
1038 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
1039 cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
1000 MKDEV(major, minor)); 1040 MKDEV(major, minor));
1041 } else
1042 cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
1043 0);
1044
1001 cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); 1045 cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
1002 1046
1003 /* Add group on cfqd list */ 1047 /* Add group on cfqd list */
@@ -1587,6 +1631,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
1587 cfqq->allocated_slice = 0; 1631 cfqq->allocated_slice = 0;
1588 cfqq->slice_end = 0; 1632 cfqq->slice_end = 0;
1589 cfqq->slice_dispatch = 0; 1633 cfqq->slice_dispatch = 0;
1634 cfqq->nr_sectors = 0;
1590 1635
1591 cfq_clear_cfqq_wait_request(cfqq); 1636 cfq_clear_cfqq_wait_request(cfqq);
1592 cfq_clear_cfqq_must_dispatch(cfqq); 1637 cfq_clear_cfqq_must_dispatch(cfqq);
@@ -1839,6 +1884,9 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1839 BUG_ON(!service_tree); 1884 BUG_ON(!service_tree);
1840 BUG_ON(!service_tree->count); 1885 BUG_ON(!service_tree->count);
1841 1886
1887 if (!cfqd->cfq_slice_idle)
1888 return false;
1889
1842 /* We never do for idle class queues. */ 1890 /* We never do for idle class queues. */
1843 if (prio == IDLE_WORKLOAD) 1891 if (prio == IDLE_WORKLOAD)
1844 return false; 1892 return false;
@@ -1863,7 +1911,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
1863{ 1911{
1864 struct cfq_queue *cfqq = cfqd->active_queue; 1912 struct cfq_queue *cfqq = cfqd->active_queue;
1865 struct cfq_io_context *cic; 1913 struct cfq_io_context *cic;
1866 unsigned long sl; 1914 unsigned long sl, group_idle = 0;
1867 1915
1868 /* 1916 /*
1869 * SSD device without seek penalty, disable idling. But only do so 1917 * SSD device without seek penalty, disable idling. But only do so
@@ -1879,8 +1927,13 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
1879 /* 1927 /*
1880 * idle is disabled, either manually or by past process history 1928 * idle is disabled, either manually or by past process history
1881 */ 1929 */
1882 if (!cfqd->cfq_slice_idle || !cfq_should_idle(cfqd, cfqq)) 1930 if (!cfq_should_idle(cfqd, cfqq)) {
1883 return; 1931 /* no queue idling. Check for group idling */
1932 if (cfqd->cfq_group_idle)
1933 group_idle = cfqd->cfq_group_idle;
1934 else
1935 return;
1936 }
1884 1937
1885 /* 1938 /*
1886 * still active requests from this queue, don't idle 1939 * still active requests from this queue, don't idle
@@ -1907,13 +1960,21 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
1907 return; 1960 return;
1908 } 1961 }
1909 1962
1963 /* There are other queues in the group, don't do group idle */
1964 if (group_idle && cfqq->cfqg->nr_cfqq > 1)
1965 return;
1966
1910 cfq_mark_cfqq_wait_request(cfqq); 1967 cfq_mark_cfqq_wait_request(cfqq);
1911 1968
1912 sl = cfqd->cfq_slice_idle; 1969 if (group_idle)
1970 sl = cfqd->cfq_group_idle;
1971 else
1972 sl = cfqd->cfq_slice_idle;
1913 1973
1914 mod_timer(&cfqd->idle_slice_timer, jiffies + sl); 1974 mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
1915 cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg); 1975 cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg);
1916 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); 1976 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl,
1977 group_idle ? 1 : 0);
1917} 1978}
1918 1979
1919/* 1980/*
@@ -1929,9 +1990,11 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
1929 cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq); 1990 cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
1930 cfq_remove_request(rq); 1991 cfq_remove_request(rq);
1931 cfqq->dispatched++; 1992 cfqq->dispatched++;
1993 (RQ_CFQG(rq))->dispatched++;
1932 elv_dispatch_sort(q, rq); 1994 elv_dispatch_sort(q, rq);
1933 1995
1934 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; 1996 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
1997 cfqq->nr_sectors += blk_rq_sectors(rq);
1935 cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq), 1998 cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq),
1936 rq_data_dir(rq), rq_is_sync(rq)); 1999 rq_data_dir(rq), rq_is_sync(rq));
1937} 2000}
@@ -2126,7 +2189,6 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
2126 slice = max_t(unsigned, slice, CFQ_MIN_TT); 2189 slice = max_t(unsigned, slice, CFQ_MIN_TT);
2127 cfq_log(cfqd, "workload slice:%d", slice); 2190 cfq_log(cfqd, "workload slice:%d", slice);
2128 cfqd->workload_expires = jiffies + slice; 2191 cfqd->workload_expires = jiffies + slice;
2129 cfqd->noidle_tree_requires_idle = false;
2130} 2192}
2131 2193
2132static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) 2194static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
@@ -2198,7 +2260,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
2198 cfqq = NULL; 2260 cfqq = NULL;
2199 goto keep_queue; 2261 goto keep_queue;
2200 } else 2262 } else
2201 goto expire; 2263 goto check_group_idle;
2202 } 2264 }
2203 2265
2204 /* 2266 /*
@@ -2226,8 +2288,23 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
2226 * flight or is idling for a new request, allow either of these 2288 * flight or is idling for a new request, allow either of these
2227 * conditions to happen (or time out) before selecting a new queue. 2289 * conditions to happen (or time out) before selecting a new queue.
2228 */ 2290 */
2229 if (timer_pending(&cfqd->idle_slice_timer) || 2291 if (timer_pending(&cfqd->idle_slice_timer)) {
2230 (cfqq->dispatched && cfq_should_idle(cfqd, cfqq))) { 2292 cfqq = NULL;
2293 goto keep_queue;
2294 }
2295
2296 if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
2297 cfqq = NULL;
2298 goto keep_queue;
2299 }
2300
2301 /*
2302 * If group idle is enabled and there are requests dispatched from
2303 * this group, wait for requests to complete.
2304 */
2305check_group_idle:
2306 if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1
2307 && cfqq->cfqg->dispatched) {
2231 cfqq = NULL; 2308 cfqq = NULL;
2232 goto keep_queue; 2309 goto keep_queue;
2233 } 2310 }
@@ -3108,7 +3185,9 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3108 if (cfqq->queued[0] + cfqq->queued[1] >= 4) 3185 if (cfqq->queued[0] + cfqq->queued[1] >= 4)
3109 cfq_mark_cfqq_deep(cfqq); 3186 cfq_mark_cfqq_deep(cfqq);
3110 3187
3111 if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || 3188 if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))
3189 enable_idle = 0;
3190 else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
3112 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) 3191 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
3113 enable_idle = 0; 3192 enable_idle = 0;
3114 else if (sample_valid(cic->ttime_samples)) { 3193 else if (sample_valid(cic->ttime_samples)) {
@@ -3375,6 +3454,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
3375 WARN_ON(!cfqq->dispatched); 3454 WARN_ON(!cfqq->dispatched);
3376 cfqd->rq_in_driver--; 3455 cfqd->rq_in_driver--;
3377 cfqq->dispatched--; 3456 cfqq->dispatched--;
3457 (RQ_CFQG(rq))->dispatched--;
3378 cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg, 3458 cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg,
3379 rq_start_time_ns(rq), rq_io_start_time_ns(rq), 3459 rq_start_time_ns(rq), rq_io_start_time_ns(rq),
3380 rq_data_dir(rq), rq_is_sync(rq)); 3460 rq_data_dir(rq), rq_is_sync(rq));
@@ -3404,7 +3484,10 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
3404 * the queue. 3484 * the queue.
3405 */ 3485 */
3406 if (cfq_should_wait_busy(cfqd, cfqq)) { 3486 if (cfq_should_wait_busy(cfqd, cfqq)) {
3407 cfqq->slice_end = jiffies + cfqd->cfq_slice_idle; 3487 unsigned long extend_sl = cfqd->cfq_slice_idle;
3488 if (!cfqd->cfq_slice_idle)
3489 extend_sl = cfqd->cfq_group_idle;
3490 cfqq->slice_end = jiffies + extend_sl;
3408 cfq_mark_cfqq_wait_busy(cfqq); 3491 cfq_mark_cfqq_wait_busy(cfqq);
3409 cfq_log_cfqq(cfqd, cfqq, "will busy wait"); 3492 cfq_log_cfqq(cfqd, cfqq, "will busy wait");
3410 } 3493 }
@@ -3421,17 +3504,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
3421 cfq_slice_expired(cfqd, 1); 3504 cfq_slice_expired(cfqd, 1);
3422 else if (sync && cfqq_empty && 3505 else if (sync && cfqq_empty &&
3423 !cfq_close_cooperator(cfqd, cfqq)) { 3506 !cfq_close_cooperator(cfqd, cfqq)) {
3424 cfqd->noidle_tree_requires_idle |= 3507 cfq_arm_slice_timer(cfqd);
3425 !(rq->cmd_flags & REQ_NOIDLE);
3426 /*
3427 * Idling is enabled for SYNC_WORKLOAD.
3428 * SYNC_NOIDLE_WORKLOAD idles at the end of the tree
3429 * only if we processed at least one !REQ_NOIDLE request
3430 */
3431 if (cfqd->serving_type == SYNC_WORKLOAD
3432 || cfqd->noidle_tree_requires_idle
3433 || cfqq->cfqg->nr_cfqq == 1)
3434 cfq_arm_slice_timer(cfqd);
3435 } 3508 }
3436 } 3509 }
3437 3510
@@ -3850,6 +3923,7 @@ static void *cfq_init_queue(struct request_queue *q)
3850 cfqd->cfq_slice[1] = cfq_slice_sync; 3923 cfqd->cfq_slice[1] = cfq_slice_sync;
3851 cfqd->cfq_slice_async_rq = cfq_slice_async_rq; 3924 cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
3852 cfqd->cfq_slice_idle = cfq_slice_idle; 3925 cfqd->cfq_slice_idle = cfq_slice_idle;
3926 cfqd->cfq_group_idle = cfq_group_idle;
3853 cfqd->cfq_latency = 1; 3927 cfqd->cfq_latency = 1;
3854 cfqd->cfq_group_isolation = 0; 3928 cfqd->cfq_group_isolation = 0;
3855 cfqd->hw_tag = -1; 3929 cfqd->hw_tag = -1;
@@ -3922,6 +3996,7 @@ SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
3922SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0); 3996SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
3923SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0); 3997SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
3924SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1); 3998SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
3999SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1);
3925SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); 4000SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
3926SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); 4001SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
3927SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); 4002SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
@@ -3954,6 +4029,7 @@ STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
3954STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1, 4029STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
3955 UINT_MAX, 0); 4030 UINT_MAX, 0);
3956STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1); 4031STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
4032STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1);
3957STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1); 4033STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
3958STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); 4034STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
3959STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, 4035STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
@@ -3975,6 +4051,7 @@ static struct elv_fs_entry cfq_attrs[] = {
3975 CFQ_ATTR(slice_async), 4051 CFQ_ATTR(slice_async),
3976 CFQ_ATTR(slice_async_rq), 4052 CFQ_ATTR(slice_async_rq),
3977 CFQ_ATTR(slice_idle), 4053 CFQ_ATTR(slice_idle),
4054 CFQ_ATTR(group_idle),
3978 CFQ_ATTR(low_latency), 4055 CFQ_ATTR(low_latency),
3979 CFQ_ATTR(group_isolation), 4056 CFQ_ATTR(group_isolation),
3980 __ATTR_NULL 4057 __ATTR_NULL
@@ -4013,6 +4090,7 @@ static struct blkio_policy_type blkio_policy_cfq = {
4013 .blkio_unlink_group_fn = cfq_unlink_blkio_group, 4090 .blkio_unlink_group_fn = cfq_unlink_blkio_group,
4014 .blkio_update_group_weight_fn = cfq_update_blkio_group_weight, 4091 .blkio_update_group_weight_fn = cfq_update_blkio_group_weight,
4015 }, 4092 },
4093 .plid = BLKIO_POLICY_PROP,
4016}; 4094};
4017#else 4095#else
4018static struct blkio_policy_type blkio_policy_cfq; 4096static struct blkio_policy_type blkio_policy_cfq;
@@ -4028,6 +4106,12 @@ static int __init cfq_init(void)
4028 if (!cfq_slice_idle) 4106 if (!cfq_slice_idle)
4029 cfq_slice_idle = 1; 4107 cfq_slice_idle = 1;
4030 4108
4109#ifdef CONFIG_CFQ_GROUP_IOSCHED
4110 if (!cfq_group_idle)
4111 cfq_group_idle = 1;
4112#else
4113 cfq_group_idle = 0;
4114#endif
4031 if (cfq_slab_setup()) 4115 if (cfq_slab_setup())
4032 return -ENOMEM; 4116 return -ENOMEM;
4033 4117
diff --git a/block/cfq.h b/block/cfq.h
index 93448e5a2e41..54a6d90f8e8c 100644
--- a/block/cfq.h
+++ b/block/cfq.h
@@ -69,7 +69,7 @@ static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg,
69 69
70static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 70static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
71 struct blkio_group *blkg, void *key, dev_t dev) { 71 struct blkio_group *blkg, void *key, dev_t dev) {
72 blkiocg_add_blkio_group(blkcg, blkg, key, dev); 72 blkiocg_add_blkio_group(blkcg, blkg, key, dev, BLKIO_POLICY_PROP);
73} 73}
74 74
75static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg) 75static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg)
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index d53085637731..119f07b74dc0 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -703,6 +703,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
703 case BLKFLSBUF: 703 case BLKFLSBUF:
704 case BLKROSET: 704 case BLKROSET:
705 case BLKDISCARD: 705 case BLKDISCARD:
706 case BLKSECDISCARD:
706 /* 707 /*
707 * the ones below are implemented in blkdev_locked_ioctl, 708 * the ones below are implemented in blkdev_locked_ioctl,
708 * but we call blkdev_ioctl, which gets the lock for us 709 * but we call blkdev_ioctl, which gets the lock for us
diff --git a/block/elevator.c b/block/elevator.c
index 816a7c8d6394..282e8308f7e2 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -83,6 +83,12 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio)
83 return 0; 83 return 0;
84 84
85 /* 85 /*
86 * Don't merge discard requests and secure discard requests
87 */
88 if ((bio->bi_rw & REQ_SECURE) != (rq->bio->bi_rw & REQ_SECURE))
89 return 0;
90
91 /*
86 * different data direction or already started, don't merge 92 * different data direction or already started, don't merge
87 */ 93 */
88 if (bio_data_dir(bio) != rq_data_dir(rq)) 94 if (bio_data_dir(bio) != rq_data_dir(rq))
@@ -611,8 +617,6 @@ void elv_quiesce_end(struct request_queue *q)
611 617
612void elv_insert(struct request_queue *q, struct request *rq, int where) 618void elv_insert(struct request_queue *q, struct request *rq, int where)
613{ 619{
614 struct list_head *pos;
615 unsigned ordseq;
616 int unplug_it = 1; 620 int unplug_it = 1;
617 621
618 trace_block_rq_insert(q, rq); 622 trace_block_rq_insert(q, rq);
@@ -620,9 +624,16 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
620 rq->q = q; 624 rq->q = q;
621 625
622 switch (where) { 626 switch (where) {
627 case ELEVATOR_INSERT_REQUEUE:
628 /*
629 * Most requeues happen because of a busy condition,
630 * don't force unplug of the queue for that case.
631 * Clear unplug_it and fall through.
632 */
633 unplug_it = 0;
634
623 case ELEVATOR_INSERT_FRONT: 635 case ELEVATOR_INSERT_FRONT:
624 rq->cmd_flags |= REQ_SOFTBARRIER; 636 rq->cmd_flags |= REQ_SOFTBARRIER;
625
626 list_add(&rq->queuelist, &q->queue_head); 637 list_add(&rq->queuelist, &q->queue_head);
627 break; 638 break;
628 639
@@ -662,36 +673,6 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
662 q->elevator->ops->elevator_add_req_fn(q, rq); 673 q->elevator->ops->elevator_add_req_fn(q, rq);
663 break; 674 break;
664 675
665 case ELEVATOR_INSERT_REQUEUE:
666 /*
667 * If ordered flush isn't in progress, we do front
668 * insertion; otherwise, requests should be requeued
669 * in ordseq order.
670 */
671 rq->cmd_flags |= REQ_SOFTBARRIER;
672
673 /*
674 * Most requeues happen because of a busy condition,
675 * don't force unplug of the queue for that case.
676 */
677 unplug_it = 0;
678
679 if (q->ordseq == 0) {
680 list_add(&rq->queuelist, &q->queue_head);
681 break;
682 }
683
684 ordseq = blk_ordered_req_seq(rq);
685
686 list_for_each(pos, &q->queue_head) {
687 struct request *pos_rq = list_entry_rq(pos);
688 if (ordseq <= blk_ordered_req_seq(pos_rq))
689 break;
690 }
691
692 list_add_tail(&rq->queuelist, pos);
693 break;
694
695 default: 676 default:
696 printk(KERN_ERR "%s: bad insertion point %d\n", 677 printk(KERN_ERR "%s: bad insertion point %d\n",
697 __func__, where); 678 __func__, where);
@@ -710,26 +691,8 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
710void __elv_add_request(struct request_queue *q, struct request *rq, int where, 691void __elv_add_request(struct request_queue *q, struct request *rq, int where,
711 int plug) 692 int plug)
712{ 693{
713 if (q->ordcolor)
714 rq->cmd_flags |= REQ_ORDERED_COLOR;
715
716 if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) { 694 if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) {
717 /* 695 /* barriers are scheduling boundary, update end_sector */
718 * toggle ordered color
719 */
720 if (rq->cmd_flags & REQ_HARDBARRIER)
721 q->ordcolor ^= 1;
722
723 /*
724 * barriers implicitly indicate back insertion
725 */
726 if (where == ELEVATOR_INSERT_SORT)
727 where = ELEVATOR_INSERT_BACK;
728
729 /*
730 * this request is scheduling boundary, update
731 * end_sector
732 */
733 if (rq->cmd_type == REQ_TYPE_FS || 696 if (rq->cmd_type == REQ_TYPE_FS ||
734 (rq->cmd_flags & REQ_DISCARD)) { 697 (rq->cmd_flags & REQ_DISCARD)) {
735 q->end_sector = rq_end_sector(rq); 698 q->end_sector = rq_end_sector(rq);
@@ -849,24 +812,6 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
849 e->ops->elevator_completed_req_fn) 812 e->ops->elevator_completed_req_fn)
850 e->ops->elevator_completed_req_fn(q, rq); 813 e->ops->elevator_completed_req_fn(q, rq);
851 } 814 }
852
853 /*
854 * Check if the queue is waiting for fs requests to be
855 * drained for flush sequence.
856 */
857 if (unlikely(q->ordseq)) {
858 struct request *next = NULL;
859
860 if (!list_empty(&q->queue_head))
861 next = list_entry_rq(q->queue_head.next);
862
863 if (!queue_in_flight(q) &&
864 blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN &&
865 (!next || blk_ordered_req_seq(next) > QUEUE_ORDSEQ_DRAIN)) {
866 blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0);
867 __blk_run_queue(q);
868 }
869 }
870} 815}
871 816
872#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr) 817#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)
@@ -932,6 +877,7 @@ int elv_register_queue(struct request_queue *q)
932 } 877 }
933 } 878 }
934 kobject_uevent(&e->kobj, KOBJ_ADD); 879 kobject_uevent(&e->kobj, KOBJ_ADD);
880 e->registered = 1;
935 } 881 }
936 return error; 882 return error;
937} 883}
@@ -941,6 +887,7 @@ static void __elv_unregister_queue(struct elevator_queue *e)
941{ 887{
942 kobject_uevent(&e->kobj, KOBJ_REMOVE); 888 kobject_uevent(&e->kobj, KOBJ_REMOVE);
943 kobject_del(&e->kobj); 889 kobject_del(&e->kobj);
890 e->registered = 0;
944} 891}
945 892
946void elv_unregister_queue(struct request_queue *q) 893void elv_unregister_queue(struct request_queue *q)
@@ -1003,18 +950,19 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
1003{ 950{
1004 struct elevator_queue *old_elevator, *e; 951 struct elevator_queue *old_elevator, *e;
1005 void *data; 952 void *data;
953 int err;
1006 954
1007 /* 955 /*
1008 * Allocate new elevator 956 * Allocate new elevator
1009 */ 957 */
1010 e = elevator_alloc(q, new_e); 958 e = elevator_alloc(q, new_e);
1011 if (!e) 959 if (!e)
1012 return 0; 960 return -ENOMEM;
1013 961
1014 data = elevator_init_queue(q, e); 962 data = elevator_init_queue(q, e);
1015 if (!data) { 963 if (!data) {
1016 kobject_put(&e->kobj); 964 kobject_put(&e->kobj);
1017 return 0; 965 return -ENOMEM;
1018 } 966 }
1019 967
1020 /* 968 /*
@@ -1035,10 +983,13 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
1035 983
1036 spin_unlock_irq(q->queue_lock); 984 spin_unlock_irq(q->queue_lock);
1037 985
1038 __elv_unregister_queue(old_elevator); 986 if (old_elevator->registered) {
987 __elv_unregister_queue(old_elevator);
1039 988
1040 if (elv_register_queue(q)) 989 err = elv_register_queue(q);
1041 goto fail_register; 990 if (err)
991 goto fail_register;
992 }
1042 993
1043 /* 994 /*
1044 * finally exit old elevator and turn off BYPASS. 995 * finally exit old elevator and turn off BYPASS.
@@ -1050,7 +1001,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
1050 1001
1051 blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name); 1002 blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name);
1052 1003
1053 return 1; 1004 return 0;
1054 1005
1055fail_register: 1006fail_register:
1056 /* 1007 /*
@@ -1065,17 +1016,19 @@ fail_register:
1065 queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); 1016 queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q);
1066 spin_unlock_irq(q->queue_lock); 1017 spin_unlock_irq(q->queue_lock);
1067 1018
1068 return 0; 1019 return err;
1069} 1020}
1070 1021
1071ssize_t elv_iosched_store(struct request_queue *q, const char *name, 1022/*
1072 size_t count) 1023 * Switch this queue to the given IO scheduler.
1024 */
1025int elevator_change(struct request_queue *q, const char *name)
1073{ 1026{
1074 char elevator_name[ELV_NAME_MAX]; 1027 char elevator_name[ELV_NAME_MAX];
1075 struct elevator_type *e; 1028 struct elevator_type *e;
1076 1029
1077 if (!q->elevator) 1030 if (!q->elevator)
1078 return count; 1031 return -ENXIO;
1079 1032
1080 strlcpy(elevator_name, name, sizeof(elevator_name)); 1033 strlcpy(elevator_name, name, sizeof(elevator_name));
1081 e = elevator_get(strstrip(elevator_name)); 1034 e = elevator_get(strstrip(elevator_name));
@@ -1086,13 +1039,27 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name,
1086 1039
1087 if (!strcmp(elevator_name, q->elevator->elevator_type->elevator_name)) { 1040 if (!strcmp(elevator_name, q->elevator->elevator_type->elevator_name)) {
1088 elevator_put(e); 1041 elevator_put(e);
1089 return count; 1042 return 0;
1090 } 1043 }
1091 1044
1092 if (!elevator_switch(q, e)) 1045 return elevator_switch(q, e);
1093 printk(KERN_ERR "elevator: switch to %s failed\n", 1046}
1094 elevator_name); 1047EXPORT_SYMBOL(elevator_change);
1095 return count; 1048
1049ssize_t elv_iosched_store(struct request_queue *q, const char *name,
1050 size_t count)
1051{
1052 int ret;
1053
1054 if (!q->elevator)
1055 return count;
1056
1057 ret = elevator_change(q, name);
1058 if (!ret)
1059 return count;
1060
1061 printk(KERN_ERR "elevator: switch to %s failed\n", name);
1062 return ret;
1096} 1063}
1097 1064
1098ssize_t elv_iosched_show(struct request_queue *q, char *name) 1065ssize_t elv_iosched_show(struct request_queue *q, char *name)
diff --git a/block/genhd.c b/block/genhd.c
index 59a2db6fecef..a8adf96a4b41 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -22,9 +22,7 @@
22#include "blk.h" 22#include "blk.h"
23 23
24static DEFINE_MUTEX(block_class_lock); 24static DEFINE_MUTEX(block_class_lock);
25#ifndef CONFIG_SYSFS_DEPRECATED
26struct kobject *block_depr; 25struct kobject *block_depr;
27#endif
28 26
29/* for extended dynamic devt allocation, currently only one major is used */ 27/* for extended dynamic devt allocation, currently only one major is used */
30#define MAX_EXT_DEVT (1 << MINORBITS) 28#define MAX_EXT_DEVT (1 << MINORBITS)
@@ -541,13 +539,15 @@ void add_disk(struct gendisk *disk)
541 disk->major = MAJOR(devt); 539 disk->major = MAJOR(devt);
542 disk->first_minor = MINOR(devt); 540 disk->first_minor = MINOR(devt);
543 541
542 /* Register BDI before referencing it from bdev */
543 bdi = &disk->queue->backing_dev_info;
544 bdi_register_dev(bdi, disk_devt(disk));
545
544 blk_register_region(disk_devt(disk), disk->minors, NULL, 546 blk_register_region(disk_devt(disk), disk->minors, NULL,
545 exact_match, exact_lock, disk); 547 exact_match, exact_lock, disk);
546 register_disk(disk); 548 register_disk(disk);
547 blk_register_queue(disk); 549 blk_register_queue(disk);
548 550
549 bdi = &disk->queue->backing_dev_info;
550 bdi_register_dev(bdi, disk_devt(disk));
551 retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, 551 retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
552 "bdi"); 552 "bdi");
553 WARN_ON(retval); 553 WARN_ON(retval);
@@ -642,6 +642,7 @@ void __init printk_all_partitions(void)
642 struct hd_struct *part; 642 struct hd_struct *part;
643 char name_buf[BDEVNAME_SIZE]; 643 char name_buf[BDEVNAME_SIZE];
644 char devt_buf[BDEVT_SIZE]; 644 char devt_buf[BDEVT_SIZE];
645 u8 uuid[PARTITION_META_INFO_UUIDLTH * 2 + 1];
645 646
646 /* 647 /*
647 * Don't show empty devices or things that have been 648 * Don't show empty devices or things that have been
@@ -660,10 +661,14 @@ void __init printk_all_partitions(void)
660 while ((part = disk_part_iter_next(&piter))) { 661 while ((part = disk_part_iter_next(&piter))) {
661 bool is_part0 = part == &disk->part0; 662 bool is_part0 = part == &disk->part0;
662 663
663 printk("%s%s %10llu %s", is_part0 ? "" : " ", 664 uuid[0] = 0;
665 if (part->info)
666 part_unpack_uuid(part->info->uuid, uuid);
667
668 printk("%s%s %10llu %s %s", is_part0 ? "" : " ",
664 bdevt_str(part_devt(part), devt_buf), 669 bdevt_str(part_devt(part), devt_buf),
665 (unsigned long long)part->nr_sects >> 1, 670 (unsigned long long)part->nr_sects >> 1,
666 disk_name(disk, part->partno, name_buf)); 671 disk_name(disk, part->partno, name_buf), uuid);
667 if (is_part0) { 672 if (is_part0) {
668 if (disk->driverfs_dev != NULL && 673 if (disk->driverfs_dev != NULL &&
669 disk->driverfs_dev->driver != NULL) 674 disk->driverfs_dev->driver != NULL)
@@ -803,10 +808,9 @@ static int __init genhd_device_init(void)
803 808
804 register_blkdev(BLOCK_EXT_MAJOR, "blkext"); 809 register_blkdev(BLOCK_EXT_MAJOR, "blkext");
805 810
806#ifndef CONFIG_SYSFS_DEPRECATED
807 /* create top-level block dir */ 811 /* create top-level block dir */
808 block_depr = kobject_create_and_add("block", NULL); 812 if (!sysfs_deprecated)
809#endif 813 block_depr = kobject_create_and_add("block", NULL);
810 return 0; 814 return 0;
811} 815}
812 816
@@ -925,8 +929,15 @@ static void disk_free_ptbl_rcu_cb(struct rcu_head *head)
925{ 929{
926 struct disk_part_tbl *ptbl = 930 struct disk_part_tbl *ptbl =
927 container_of(head, struct disk_part_tbl, rcu_head); 931 container_of(head, struct disk_part_tbl, rcu_head);
932 struct gendisk *disk = ptbl->disk;
933 struct request_queue *q = disk->queue;
934 unsigned long flags;
928 935
929 kfree(ptbl); 936 kfree(ptbl);
937
938 spin_lock_irqsave(q->queue_lock, flags);
939 elv_quiesce_end(q);
940 spin_unlock_irqrestore(q->queue_lock, flags);
930} 941}
931 942
932/** 943/**
@@ -944,11 +955,17 @@ static void disk_replace_part_tbl(struct gendisk *disk,
944 struct disk_part_tbl *new_ptbl) 955 struct disk_part_tbl *new_ptbl)
945{ 956{
946 struct disk_part_tbl *old_ptbl = disk->part_tbl; 957 struct disk_part_tbl *old_ptbl = disk->part_tbl;
958 struct request_queue *q = disk->queue;
947 959
948 rcu_assign_pointer(disk->part_tbl, new_ptbl); 960 rcu_assign_pointer(disk->part_tbl, new_ptbl);
949 961
950 if (old_ptbl) { 962 if (old_ptbl) {
951 rcu_assign_pointer(old_ptbl->last_lookup, NULL); 963 rcu_assign_pointer(old_ptbl->last_lookup, NULL);
964
965 spin_lock_irq(q->queue_lock);
966 elv_quiesce_start(q);
967 spin_unlock_irq(q->queue_lock);
968
952 call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb); 969 call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb);
953 } 970 }
954} 971}
@@ -989,6 +1006,7 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno)
989 return -ENOMEM; 1006 return -ENOMEM;
990 1007
991 new_ptbl->len = target; 1008 new_ptbl->len = target;
1009 new_ptbl->disk = disk;
992 1010
993 for (i = 0; i < len; i++) 1011 for (i = 0; i < len; i++)
994 rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]); 1012 rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]);
@@ -1004,6 +1022,7 @@ static void disk_release(struct device *dev)
1004 kfree(disk->random); 1022 kfree(disk->random);
1005 disk_replace_part_tbl(disk, NULL); 1023 disk_replace_part_tbl(disk, NULL);
1006 free_part_stats(&disk->part0); 1024 free_part_stats(&disk->part0);
1025 free_part_info(&disk->part0);
1007 kfree(disk); 1026 kfree(disk);
1008} 1027}
1009struct class block_class = { 1028struct class block_class = {
diff --git a/block/ioctl.c b/block/ioctl.c
index 09fd7f1ef23a..d724ceb1d465 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -62,7 +62,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
62 62
63 /* all seems OK */ 63 /* all seems OK */
64 part = add_partition(disk, partno, start, length, 64 part = add_partition(disk, partno, start, length,
65 ADDPART_FLAG_NONE); 65 ADDPART_FLAG_NONE, NULL);
66 mutex_unlock(&bdev->bd_mutex); 66 mutex_unlock(&bdev->bd_mutex);
67 return IS_ERR(part) ? PTR_ERR(part) : 0; 67 return IS_ERR(part) ? PTR_ERR(part) : 0;
68 case BLKPG_DEL_PARTITION: 68 case BLKPG_DEL_PARTITION:
@@ -114,8 +114,10 @@ static int blkdev_reread_part(struct block_device *bdev)
114} 114}
115 115
116static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, 116static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
117 uint64_t len) 117 uint64_t len, int secure)
118{ 118{
119 unsigned long flags = 0;
120
119 if (start & 511) 121 if (start & 511)
120 return -EINVAL; 122 return -EINVAL;
121 if (len & 511) 123 if (len & 511)
@@ -125,8 +127,9 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
125 127
126 if (start + len > (bdev->bd_inode->i_size >> 9)) 128 if (start + len > (bdev->bd_inode->i_size >> 9))
127 return -EINVAL; 129 return -EINVAL;
128 return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, 130 if (secure)
129 BLKDEV_IFL_WAIT); 131 flags |= BLKDEV_DISCARD_SECURE;
132 return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags);
130} 133}
131 134
132static int put_ushort(unsigned long arg, unsigned short val) 135static int put_ushort(unsigned long arg, unsigned short val)
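
blk_ioctl_discard() above now takes a secure flag, and the switch below routes both BLKDISCARD and BLKSECDISCARD through it. From userspace, both ioctls take a {start, length} pair in bytes, each a multiple of 512. A minimal caller might look like the sketch below (it is destructive, so the device argument is only a placeholder):

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/fs.h>        /* BLKDISCARD, BLKSECDISCARD */

    int main(int argc, char **argv)
    {
            /* range[0] = byte offset, range[1] = byte length; both must be
             * multiples of 512 or the kernel returns -EINVAL. */
            uint64_t range[2] = { 0, 1024 * 1024 };
            int fd;

            if (argc < 2) {
                    fprintf(stderr, "usage: %s /dev/XXX\n", argv[0]);
                    return 1;
            }
            fd = open(argv[1], O_WRONLY);
            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            /* BLKSECDISCARD fails with EOPNOTSUPP unless the device
             * advertises secure discard support. */
            if (ioctl(fd, BLKSECDISCARD, &range) < 0)
                    perror("BLKSECDISCARD");
            close(fd);
            return 0;
    }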
@@ -213,7 +216,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
213 set_device_ro(bdev, n); 216 set_device_ro(bdev, n);
214 return 0; 217 return 0;
215 218
216 case BLKDISCARD: { 219 case BLKDISCARD:
220 case BLKSECDISCARD: {
217 uint64_t range[2]; 221 uint64_t range[2];
218 222
219 if (!(mode & FMODE_WRITE)) 223 if (!(mode & FMODE_WRITE))
@@ -222,7 +226,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
222 if (copy_from_user(range, (void __user *)arg, sizeof(range))) 226 if (copy_from_user(range, (void __user *)arg, sizeof(range)))
223 return -EFAULT; 227 return -EFAULT;
224 228
225 return blk_ioctl_discard(bdev, range[0], range[1]); 229 return blk_ioctl_discard(bdev, range[0], range[1],
230 cmd == BLKSECDISCARD);
226 } 231 }
227 232
228 case HDIO_GETGEO: { 233 case HDIO_GETGEO: {