Diffstat (limited to 'block')
-rw-r--r--  block/Kconfig            |   14
-rw-r--r--  block/Makefile           |    3
-rw-r--r--  block/blk-barrier.c      |  350
-rw-r--r--  block/blk-cgroup.c       |  997
-rw-r--r--  block/blk-cgroup.h       |  138
-rw-r--r--  block/blk-core.c         |  829
-rw-r--r--  block/blk-exec.c         |   15
-rw-r--r--  block/blk-flush.c        |  443
-rw-r--r--  block/blk-integrity.c    |  106
-rw-r--r--  block/blk-ioc.c          |   26
-rw-r--r--  block/blk-lib.c          |  134
-rw-r--r--  block/blk-map.c          |    8
-rw-r--r--  block/blk-merge.c        |   38
-rw-r--r--  block/blk-settings.c     |  107
-rw-r--r--  block/blk-sysfs.c        |   27
-rw-r--r--  block/blk-throttle.c     | 1312
-rw-r--r--  block/blk.h              |   45
-rw-r--r--  block/bsg.c              |   12
-rw-r--r--  block/cfq-iosched.c      |  624
-rw-r--r--  block/cfq.h              |    8
-rw-r--r--  block/compat_ioctl.c     |    5
-rw-r--r--  block/deadline-iosched.c |    9
-rw-r--r--  block/elevator.c         |  202
-rw-r--r--  block/genhd.c            |  608
-rw-r--r--  block/ioctl.c            |   23
-rw-r--r--  block/noop-iosched.c     |    8
-rw-r--r--  block/scsi_ioctl.c       |   34
27 files changed, 4553 insertions(+), 1572 deletions(-)
diff --git a/block/Kconfig b/block/Kconfig
index 9be0b56eaee1..60be1e0455da 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -2,7 +2,7 @@
 # Block layer core configuration
 #
 menuconfig BLOCK
-	bool "Enable the block layer" if EMBEDDED
+	bool "Enable the block layer" if EXPERT
 	default y
 	help
 	 Provide block layer support for the kernel.
@@ -77,6 +77,18 @@ config BLK_DEV_INTEGRITY
 	T10/SCSI Data Integrity Field or the T13/ATA External Path
 	Protection. If in doubt, say N.
 
+config BLK_DEV_THROTTLING
+	bool "Block layer bio throttling support"
+	depends on BLK_CGROUP=y && EXPERIMENTAL
+	default n
+	---help---
+	Block layer bio throttling support. It can be used to limit
+	the IO rate to a device. IO rate policies are per cgroup and
+	one needs to mount and use blkio cgroup controller for creating
+	cgroups and specifying per device IO rate policies.
+
+	See Documentation/cgroups/blkio-controller.txt for more information.
+
 endif # BLOCK
 
 config BLOCK_COMPAT
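
The new BLK_DEV_THROTTLING help text describes the interface only in prose: limits are set per cgroup and per device through the blkio controller files added later in this series. A minimal userspace sketch of writing one such rule follows; the mount point, group name "grp" and device numbers 8:16 are assumptions for illustration, while the file name and the "major:minor value" line format come from the throttle files added by the patch.

/*
 * Sketch only: set a 1 MiB/s read limit for device 8:16 in cgroup "grp".
 * Mount point, group name and device numbers are assumed, not taken
 * from the patch.
 */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *path =
		"/sys/fs/cgroup/blkio/grp/blkio.throttle.read_bps_device";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return EXIT_FAILURE;
	}
	/* device 8:16 limited to 1048576 bytes per second on reads */
	fprintf(f, "8:16 %u\n", 1024 * 1024);
	return fclose(f) ? EXIT_FAILURE : EXIT_SUCCESS;
}
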
diff --git a/block/Makefile b/block/Makefile
index 0bb499a739cd..0fec4b3fab51 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -3,12 +3,13 @@
 #
 
 obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
-			blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \
+			blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
 			blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
 			blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o
 
 obj-$(CONFIG_BLK_DEV_BSG)	+= bsg.o
 obj-$(CONFIG_BLK_CGROUP)	+= blk-cgroup.o
+obj-$(CONFIG_BLK_DEV_THROTTLING)	+= blk-throttle.o
 obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
 obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
diff --git a/block/blk-barrier.c b/block/blk-barrier.c
deleted file mode 100644
index f0faefca032f..000000000000
--- a/block/blk-barrier.c
+++ /dev/null
@@ -1,350 +0,0 @@
1/*
2 * Functions related to barrier IO handling
3 */
4#include <linux/kernel.h>
5#include <linux/module.h>
6#include <linux/bio.h>
7#include <linux/blkdev.h>
8#include <linux/gfp.h>
9
10#include "blk.h"
11
12/**
13 * blk_queue_ordered - does this queue support ordered writes
14 * @q: the request queue
15 * @ordered: one of QUEUE_ORDERED_*
16 *
17 * Description:
18 * For journalled file systems, doing ordered writes on a commit
19 * block instead of explicitly doing wait_on_buffer (which is bad
20 * for performance) can be a big win. Block drivers supporting this
21 * feature should call this function and indicate so.
22 *
23 **/
24int blk_queue_ordered(struct request_queue *q, unsigned ordered)
25{
26 if (ordered != QUEUE_ORDERED_NONE &&
27 ordered != QUEUE_ORDERED_DRAIN &&
28 ordered != QUEUE_ORDERED_DRAIN_FLUSH &&
29 ordered != QUEUE_ORDERED_DRAIN_FUA &&
30 ordered != QUEUE_ORDERED_TAG &&
31 ordered != QUEUE_ORDERED_TAG_FLUSH &&
32 ordered != QUEUE_ORDERED_TAG_FUA) {
33 printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered);
34 return -EINVAL;
35 }
36
37 q->ordered = ordered;
38 q->next_ordered = ordered;
39
40 return 0;
41}
42EXPORT_SYMBOL(blk_queue_ordered);
43
44/*
45 * Cache flushing for ordered writes handling
46 */
47unsigned blk_ordered_cur_seq(struct request_queue *q)
48{
49 if (!q->ordseq)
50 return 0;
51 return 1 << ffz(q->ordseq);
52}
53
54unsigned blk_ordered_req_seq(struct request *rq)
55{
56 struct request_queue *q = rq->q;
57
58 BUG_ON(q->ordseq == 0);
59
60 if (rq == &q->pre_flush_rq)
61 return QUEUE_ORDSEQ_PREFLUSH;
62 if (rq == &q->bar_rq)
63 return QUEUE_ORDSEQ_BAR;
64 if (rq == &q->post_flush_rq)
65 return QUEUE_ORDSEQ_POSTFLUSH;
66
67 /*
68 * !fs requests don't need to follow barrier ordering. Always
69 * put them at the front. This fixes the following deadlock.
70 *
71 * http://thread.gmane.org/gmane.linux.kernel/537473
72 */
73 if (rq->cmd_type != REQ_TYPE_FS)
74 return QUEUE_ORDSEQ_DRAIN;
75
76 if ((rq->cmd_flags & REQ_ORDERED_COLOR) ==
77 (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR))
78 return QUEUE_ORDSEQ_DRAIN;
79 else
80 return QUEUE_ORDSEQ_DONE;
81}
82
83bool blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error)
84{
85 struct request *rq;
86
87 if (error && !q->orderr)
88 q->orderr = error;
89
90 BUG_ON(q->ordseq & seq);
91 q->ordseq |= seq;
92
93 if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE)
94 return false;
95
96 /*
97 * Okay, sequence complete.
98 */
99 q->ordseq = 0;
100 rq = q->orig_bar_rq;
101 __blk_end_request_all(rq, q->orderr);
102 return true;
103}
104
105static void pre_flush_end_io(struct request *rq, int error)
106{
107 elv_completed_request(rq->q, rq);
108 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error);
109}
110
111static void bar_end_io(struct request *rq, int error)
112{
113 elv_completed_request(rq->q, rq);
114 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error);
115}
116
117static void post_flush_end_io(struct request *rq, int error)
118{
119 elv_completed_request(rq->q, rq);
120 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error);
121}
122
123static void queue_flush(struct request_queue *q, unsigned which)
124{
125 struct request *rq;
126 rq_end_io_fn *end_io;
127
128 if (which == QUEUE_ORDERED_DO_PREFLUSH) {
129 rq = &q->pre_flush_rq;
130 end_io = pre_flush_end_io;
131 } else {
132 rq = &q->post_flush_rq;
133 end_io = post_flush_end_io;
134 }
135
136 blk_rq_init(q, rq);
137 rq->cmd_type = REQ_TYPE_FS;
138 rq->cmd_flags = REQ_HARDBARRIER | REQ_FLUSH;
139 rq->rq_disk = q->orig_bar_rq->rq_disk;
140 rq->end_io = end_io;
141
142 elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
143}
144
145static inline bool start_ordered(struct request_queue *q, struct request **rqp)
146{
147 struct request *rq = *rqp;
148 unsigned skip = 0;
149
150 q->orderr = 0;
151 q->ordered = q->next_ordered;
152 q->ordseq |= QUEUE_ORDSEQ_STARTED;
153
154 /*
155 * For an empty barrier, there's no actual BAR request, which
156 * in turn makes POSTFLUSH unnecessary. Mask them off.
157 */
158 if (!blk_rq_sectors(rq)) {
159 q->ordered &= ~(QUEUE_ORDERED_DO_BAR |
160 QUEUE_ORDERED_DO_POSTFLUSH);
161 /*
162 * Empty barrier on a write-through device w/ ordered
163 * tag has no command to issue and without any command
164 * to issue, ordering by tag can't be used. Drain
165 * instead.
166 */
167 if ((q->ordered & QUEUE_ORDERED_BY_TAG) &&
168 !(q->ordered & QUEUE_ORDERED_DO_PREFLUSH)) {
169 q->ordered &= ~QUEUE_ORDERED_BY_TAG;
170 q->ordered |= QUEUE_ORDERED_BY_DRAIN;
171 }
172 }
173
174 /* stash away the original request */
175 blk_dequeue_request(rq);
176 q->orig_bar_rq = rq;
177 rq = NULL;
178
179 /*
180 * Queue ordered sequence. As we stack them at the head, we
181 * need to queue in reverse order. Note that we rely on that
182 * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs
183 * request gets inbetween ordered sequence.
184 */
185 if (q->ordered & QUEUE_ORDERED_DO_POSTFLUSH) {
186 queue_flush(q, QUEUE_ORDERED_DO_POSTFLUSH);
187 rq = &q->post_flush_rq;
188 } else
189 skip |= QUEUE_ORDSEQ_POSTFLUSH;
190
191 if (q->ordered & QUEUE_ORDERED_DO_BAR) {
192 rq = &q->bar_rq;
193
194 /* initialize proxy request and queue it */
195 blk_rq_init(q, rq);
196 if (bio_data_dir(q->orig_bar_rq->bio) == WRITE)
197 rq->cmd_flags |= REQ_WRITE;
198 if (q->ordered & QUEUE_ORDERED_DO_FUA)
199 rq->cmd_flags |= REQ_FUA;
200 init_request_from_bio(rq, q->orig_bar_rq->bio);
201 rq->end_io = bar_end_io;
202
203 elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
204 } else
205 skip |= QUEUE_ORDSEQ_BAR;
206
207 if (q->ordered & QUEUE_ORDERED_DO_PREFLUSH) {
208 queue_flush(q, QUEUE_ORDERED_DO_PREFLUSH);
209 rq = &q->pre_flush_rq;
210 } else
211 skip |= QUEUE_ORDSEQ_PREFLUSH;
212
213 if ((q->ordered & QUEUE_ORDERED_BY_DRAIN) && queue_in_flight(q))
214 rq = NULL;
215 else
216 skip |= QUEUE_ORDSEQ_DRAIN;
217
218 *rqp = rq;
219
220 /*
221 * Complete skipped sequences. If whole sequence is complete,
222 * return false to tell elevator that this request is gone.
223 */
224 return !blk_ordered_complete_seq(q, skip, 0);
225}
226
227bool blk_do_ordered(struct request_queue *q, struct request **rqp)
228{
229 struct request *rq = *rqp;
230 const int is_barrier = rq->cmd_type == REQ_TYPE_FS &&
231 (rq->cmd_flags & REQ_HARDBARRIER);
232
233 if (!q->ordseq) {
234 if (!is_barrier)
235 return true;
236
237 if (q->next_ordered != QUEUE_ORDERED_NONE)
238 return start_ordered(q, rqp);
239 else {
240 /*
241 * Queue ordering not supported. Terminate
242 * with prejudice.
243 */
244 blk_dequeue_request(rq);
245 __blk_end_request_all(rq, -EOPNOTSUPP);
246 *rqp = NULL;
247 return false;
248 }
249 }
250
251 /*
252 * Ordered sequence in progress
253 */
254
255 /* Special requests are not subject to ordering rules. */
256 if (rq->cmd_type != REQ_TYPE_FS &&
257 rq != &q->pre_flush_rq && rq != &q->post_flush_rq)
258 return true;
259
260 if (q->ordered & QUEUE_ORDERED_BY_TAG) {
261 /* Ordered by tag. Blocking the next barrier is enough. */
262 if (is_barrier && rq != &q->bar_rq)
263 *rqp = NULL;
264 } else {
265 /* Ordered by draining. Wait for turn. */
266 WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q));
267 if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q))
268 *rqp = NULL;
269 }
270
271 return true;
272}
273
274static void bio_end_empty_barrier(struct bio *bio, int err)
275{
276 if (err) {
277 if (err == -EOPNOTSUPP)
278 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
279 clear_bit(BIO_UPTODATE, &bio->bi_flags);
280 }
281 if (bio->bi_private)
282 complete(bio->bi_private);
283 bio_put(bio);
284}
285
286/**
287 * blkdev_issue_flush - queue a flush
288 * @bdev: blockdev to issue flush for
289 * @gfp_mask: memory allocation flags (for bio_alloc)
290 * @error_sector: error sector
291 * @flags: BLKDEV_IFL_* flags to control behaviour
292 *
293 * Description:
294 * Issue a flush for the block device in question. Caller can supply
295 * room for storing the error offset in case of a flush error, if they
296 * wish to. If WAIT flag is not passed then caller may check only what
297 * request was pushed in some internal queue for later handling.
298 */
299int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
300 sector_t *error_sector, unsigned long flags)
301{
302 DECLARE_COMPLETION_ONSTACK(wait);
303 struct request_queue *q;
304 struct bio *bio;
305 int ret = 0;
306
307 if (bdev->bd_disk == NULL)
308 return -ENXIO;
309
310 q = bdev_get_queue(bdev);
311 if (!q)
312 return -ENXIO;
313
314 /*
315 * some block devices may not have their queue correctly set up here
316 * (e.g. loop device without a backing file) and so issuing a flush
317 * here will panic. Ensure there is a request function before issuing
318 * the barrier.
319 */
320 if (!q->make_request_fn)
321 return -ENXIO;
322
323 bio = bio_alloc(gfp_mask, 0);
324 bio->bi_end_io = bio_end_empty_barrier;
325 bio->bi_bdev = bdev;
326 if (test_bit(BLKDEV_WAIT, &flags))
327 bio->bi_private = &wait;
328
329 bio_get(bio);
330 submit_bio(WRITE_BARRIER, bio);
331 if (test_bit(BLKDEV_WAIT, &flags)) {
332 wait_for_completion(&wait);
333 /*
334 * The driver must store the error location in ->bi_sector, if
335 * it supports it. For non-stacked drivers, this should be
336 * copied from blk_rq_pos(rq).
337 */
338 if (error_sector)
339 *error_sector = bio->bi_sector;
340 }
341
342 if (bio_flagged(bio, BIO_EOPNOTSUPP))
343 ret = -EOPNOTSUPP;
344 else if (!bio_flagged(bio, BIO_UPTODATE))
345 ret = -EIO;
346
347 bio_put(bio);
348 return ret;
349}
350EXPORT_SYMBOL(blkdev_issue_flush);
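
For reference, a caller of the blkdev_issue_flush() variant documented in the removed file might look like the sketch below. The BLKDEV_IFL_WAIT flag name is inferred from the kernel-doc's "BLKDEV_IFL_*" comment together with the BLKDEV_WAIT bit tested in the code, and the wrapper function itself is hypothetical.

/*
 * Hypothetical caller of the interface removed above: issue a cache
 * flush for a whole block device, wait for it, and let the driver
 * report an error sector if it supports that.
 */
#include <linux/blkdev.h>

static int demo_flush_device(struct block_device *bdev)
{
	sector_t error_sector = 0;

	return blkdev_issue_flush(bdev, GFP_KERNEL, &error_sector,
				  BLKDEV_IFL_WAIT);
}
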
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 2fef1ef931a0..bcaf16ee6ad1 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -30,18 +30,22 @@ EXPORT_SYMBOL_GPL(blkio_root_cgroup);
30 30
31static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *, 31static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
32 struct cgroup *); 32 struct cgroup *);
33static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *, 33static int blkiocg_can_attach_task(struct cgroup *, struct task_struct *);
34 struct task_struct *, bool); 34static void blkiocg_attach_task(struct cgroup *, struct task_struct *);
35static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
36 struct cgroup *, struct task_struct *, bool);
37static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *); 35static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
38static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *); 36static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
39 37
38/* for encoding cft->private value on file */
39#define BLKIOFILE_PRIVATE(x, val) (((x) << 16) | (val))
40/* What policy owns the file, proportional or throttle */
41#define BLKIOFILE_POLICY(val) (((val) >> 16) & 0xffff)
42#define BLKIOFILE_ATTR(val) ((val) & 0xffff)
43
40struct cgroup_subsys blkio_subsys = { 44struct cgroup_subsys blkio_subsys = {
41 .name = "blkio", 45 .name = "blkio",
42 .create = blkiocg_create, 46 .create = blkiocg_create,
43 .can_attach = blkiocg_can_attach, 47 .can_attach_task = blkiocg_can_attach_task,
44 .attach = blkiocg_attach, 48 .attach_task = blkiocg_attach_task,
45 .destroy = blkiocg_destroy, 49 .destroy = blkiocg_destroy,
46 .populate = blkiocg_populate, 50 .populate = blkiocg_populate,
47#ifdef CONFIG_BLK_CGROUP 51#ifdef CONFIG_BLK_CGROUP
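
BLKIOFILE_PRIVATE() above packs the owning policy into the upper 16 bits of cft->private and the per-file attribute into the lower 16 bits; BLKIOFILE_POLICY() and BLKIOFILE_ATTR() unpack them again. A small worked example of the round trip follows; the macro and enum names are the patch's own, the helper function is purely illustrative.

/*
 * Worked example of the cft->private encoding defined above: pack a
 * (policy, file id) pair and unpack it again.  This helper is not part
 * of the patch; it exists only to show the arithmetic.
 */
static bool blkiofile_private_roundtrip(void)
{
	int priv = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
				     BLKIO_THROTL_read_bps_device);

	return BLKIOFILE_POLICY(priv) == BLKIO_POLICY_THROTL &&	  /* upper 16 bits */
	       BLKIOFILE_ATTR(priv) == BLKIO_THROTL_read_bps_device; /* lower 16 bits */
}
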
@@ -59,6 +63,27 @@ static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
59 list_add(&pn->node, &blkcg->policy_list); 63 list_add(&pn->node, &blkcg->policy_list);
60} 64}
61 65
66static inline bool cftype_blkg_same_policy(struct cftype *cft,
67 struct blkio_group *blkg)
68{
69 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
70
71 if (blkg->plid == plid)
72 return 1;
73
74 return 0;
75}
76
77/* Determines if policy node matches cgroup file being accessed */
78static inline bool pn_matches_cftype(struct cftype *cft,
79 struct blkio_policy_node *pn)
80{
81 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
82 int fileid = BLKIOFILE_ATTR(cft->private);
83
84 return (plid == pn->plid && fileid == pn->fileid);
85}
86
62/* Must be called with blkcg->lock held */ 87/* Must be called with blkcg->lock held */
63static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) 88static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
64{ 89{
@@ -67,12 +92,13 @@ static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
67 92
68/* Must be called with blkcg->lock held */ 93/* Must be called with blkcg->lock held */
69static struct blkio_policy_node * 94static struct blkio_policy_node *
70blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev) 95blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev,
96 enum blkio_policy_id plid, int fileid)
71{ 97{
72 struct blkio_policy_node *pn; 98 struct blkio_policy_node *pn;
73 99
74 list_for_each_entry(pn, &blkcg->policy_list, node) { 100 list_for_each_entry(pn, &blkcg->policy_list, node) {
75 if (pn->dev == dev) 101 if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid)
76 return pn; 102 return pn;
77 } 103 }
78 104
@@ -86,6 +112,74 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
86} 112}
87EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); 113EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
88 114
115struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
116{
117 return container_of(task_subsys_state(tsk, blkio_subsys_id),
118 struct blkio_cgroup, css);
119}
120EXPORT_SYMBOL_GPL(task_blkio_cgroup);
121
122static inline void
123blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight)
124{
125 struct blkio_policy_type *blkiop;
126
127 list_for_each_entry(blkiop, &blkio_list, list) {
128 /* If this policy does not own the blkg, do not send updates */
129 if (blkiop->plid != blkg->plid)
130 continue;
131 if (blkiop->ops.blkio_update_group_weight_fn)
132 blkiop->ops.blkio_update_group_weight_fn(blkg->key,
133 blkg, weight);
134 }
135}
136
137static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps,
138 int fileid)
139{
140 struct blkio_policy_type *blkiop;
141
142 list_for_each_entry(blkiop, &blkio_list, list) {
143
144 /* If this policy does not own the blkg, do not send updates */
145 if (blkiop->plid != blkg->plid)
146 continue;
147
148 if (fileid == BLKIO_THROTL_read_bps_device
149 && blkiop->ops.blkio_update_group_read_bps_fn)
150 blkiop->ops.blkio_update_group_read_bps_fn(blkg->key,
151 blkg, bps);
152
153 if (fileid == BLKIO_THROTL_write_bps_device
154 && blkiop->ops.blkio_update_group_write_bps_fn)
155 blkiop->ops.blkio_update_group_write_bps_fn(blkg->key,
156 blkg, bps);
157 }
158}
159
160static inline void blkio_update_group_iops(struct blkio_group *blkg,
161 unsigned int iops, int fileid)
162{
163 struct blkio_policy_type *blkiop;
164
165 list_for_each_entry(blkiop, &blkio_list, list) {
166
167 /* If this policy does not own the blkg, do not send updates */
168 if (blkiop->plid != blkg->plid)
169 continue;
170
171 if (fileid == BLKIO_THROTL_read_iops_device
172 && blkiop->ops.blkio_update_group_read_iops_fn)
173 blkiop->ops.blkio_update_group_read_iops_fn(blkg->key,
174 blkg, iops);
175
176 if (fileid == BLKIO_THROTL_write_iops_device
177 && blkiop->ops.blkio_update_group_write_iops_fn)
178 blkiop->ops.blkio_update_group_write_iops_fn(blkg->key,
179 blkg,iops);
180 }
181}
182
89/* 183/*
90 * Add to the appropriate stat variable depending on the request type. 184 * Add to the appropriate stat variable depending on the request type.
91 * This should be called with the blkg->stats_lock held. 185 * This should be called with the blkg->stats_lock held.
@@ -282,30 +376,47 @@ void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
282} 376}
283EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats); 377EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
284 378
285void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time) 379void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time,
380 unsigned long unaccounted_time)
286{ 381{
287 unsigned long flags; 382 unsigned long flags;
288 383
289 spin_lock_irqsave(&blkg->stats_lock, flags); 384 spin_lock_irqsave(&blkg->stats_lock, flags);
290 blkg->stats.time += time; 385 blkg->stats.time += time;
386#ifdef CONFIG_DEBUG_BLK_CGROUP
387 blkg->stats.unaccounted_time += unaccounted_time;
388#endif
291 spin_unlock_irqrestore(&blkg->stats_lock, flags); 389 spin_unlock_irqrestore(&blkg->stats_lock, flags);
292} 390}
293EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); 391EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
294 392
393/*
394 * should be called under rcu read lock or queue lock to make sure blkg pointer
395 * is valid.
396 */
295void blkiocg_update_dispatch_stats(struct blkio_group *blkg, 397void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
296 uint64_t bytes, bool direction, bool sync) 398 uint64_t bytes, bool direction, bool sync)
297{ 399{
298 struct blkio_group_stats *stats; 400 struct blkio_group_stats_cpu *stats_cpu;
299 unsigned long flags; 401 unsigned long flags;
300 402
301 spin_lock_irqsave(&blkg->stats_lock, flags); 403 /*
302 stats = &blkg->stats; 404 * Disabling interrupts to provide mutual exclusion between two
303 stats->sectors += bytes >> 9; 405 * writes on same cpu. It probably is not needed for 64bit. Not
304 blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction, 406 * optimizing that case yet.
305 sync); 407 */
306 blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes, 408 local_irq_save(flags);
307 direction, sync); 409
308 spin_unlock_irqrestore(&blkg->stats_lock, flags); 410 stats_cpu = this_cpu_ptr(blkg->stats_cpu);
411
412 u64_stats_update_begin(&stats_cpu->syncp);
413 stats_cpu->sectors += bytes >> 9;
414 blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED],
415 1, direction, sync);
416 blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES],
417 bytes, direction, sync);
418 u64_stats_update_end(&stats_cpu->syncp);
419 local_irq_restore(flags);
309} 420}
310EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats); 421EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
311 422
@@ -328,20 +439,47 @@ void blkiocg_update_completion_stats(struct blkio_group *blkg,
328} 439}
329EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats); 440EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
330 441
442/* Merged stats are per cpu. */
331void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, 443void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
332 bool sync) 444 bool sync)
333{ 445{
446 struct blkio_group_stats_cpu *stats_cpu;
334 unsigned long flags; 447 unsigned long flags;
335 448
336 spin_lock_irqsave(&blkg->stats_lock, flags); 449 /*
337 blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction, 450 * Disabling interrupts to provide mutual exclusion between two
338 sync); 451 * writes on same cpu. It probably is not needed for 64bit. Not
339 spin_unlock_irqrestore(&blkg->stats_lock, flags); 452 * optimizing that case yet.
453 */
454 local_irq_save(flags);
455
456 stats_cpu = this_cpu_ptr(blkg->stats_cpu);
457
458 u64_stats_update_begin(&stats_cpu->syncp);
459 blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1,
460 direction, sync);
461 u64_stats_update_end(&stats_cpu->syncp);
462 local_irq_restore(flags);
340} 463}
341EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); 464EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
342 465
466/*
467 * This function allocates the per cpu stats for blkio_group. Should be called
468 * from sleepable context as alloc_per_cpu() requires that.
469 */
470int blkio_alloc_blkg_stats(struct blkio_group *blkg)
471{
472 /* Allocate memory for per cpu stats */
473 blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
474 if (!blkg->stats_cpu)
475 return -ENOMEM;
476 return 0;
477}
478EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats);
479
343void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 480void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
344 struct blkio_group *blkg, void *key, dev_t dev) 481 struct blkio_group *blkg, void *key, dev_t dev,
482 enum blkio_policy_id plid)
345{ 483{
346 unsigned long flags; 484 unsigned long flags;
347 485
@@ -350,6 +488,7 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
350 rcu_assign_pointer(blkg->key, key); 488 rcu_assign_pointer(blkg->key, key);
351 blkg->blkcg_id = css_id(&blkcg->css); 489 blkg->blkcg_id = css_id(&blkcg->css);
352 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); 490 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
491 blkg->plid = plid;
353 spin_unlock_irqrestore(&blkcg->lock, flags); 492 spin_unlock_irqrestore(&blkcg->lock, flags);
354 /* Need to take css reference ? */ 493 /* Need to take css reference ? */
355 cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); 494 cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
@@ -408,49 +547,28 @@ struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
408} 547}
409EXPORT_SYMBOL_GPL(blkiocg_lookup_group); 548EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
410 549
411#define SHOW_FUNCTION(__VAR) \ 550static void blkio_reset_stats_cpu(struct blkio_group *blkg)
412static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup, \
413 struct cftype *cftype) \
414{ \
415 struct blkio_cgroup *blkcg; \
416 \
417 blkcg = cgroup_to_blkio_cgroup(cgroup); \
418 return (u64)blkcg->__VAR; \
419}
420
421SHOW_FUNCTION(weight);
422#undef SHOW_FUNCTION
423
424static int
425blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
426{ 551{
427 struct blkio_cgroup *blkcg; 552 struct blkio_group_stats_cpu *stats_cpu;
428 struct blkio_group *blkg; 553 int i, j, k;
429 struct hlist_node *n; 554 /*
430 struct blkio_policy_type *blkiop; 555 * Note: On 64 bit arch this should not be an issue. This has the
431 struct blkio_policy_node *pn; 556 * possibility of returning some inconsistent value on 32bit arch
432 557 * as 64bit update on 32bit is non atomic. Taking care of this
433 if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) 558 * corner case makes code very complicated, like sending IPIs to
434 return -EINVAL; 559 * cpus, taking care of stats of offline cpus etc.
435 560 *
436 blkcg = cgroup_to_blkio_cgroup(cgroup); 561 * reset stats is anyway more of a debug feature and this sounds a
437 spin_lock(&blkio_list_lock); 562 * corner case. So I am not complicating the code yet until and
438 spin_lock_irq(&blkcg->lock); 563 * unless this becomes a real issue.
439 blkcg->weight = (unsigned int)val; 564 */
440 565 for_each_possible_cpu(i) {
441 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { 566 stats_cpu = per_cpu_ptr(blkg->stats_cpu, i);
442 pn = blkio_policy_search_node(blkcg, blkg->dev); 567 stats_cpu->sectors = 0;
443 568 for(j = 0; j < BLKIO_STAT_CPU_NR; j++)
444 if (pn) 569 for (k = 0; k < BLKIO_STAT_TOTAL; k++)
445 continue; 570 stats_cpu->stat_arr_cpu[j][k] = 0;
446
447 list_for_each_entry(blkiop, &blkio_list, list)
448 blkiop->ops.blkio_update_group_weight_fn(blkg,
449 blkcg->weight);
450 } 571 }
451 spin_unlock_irq(&blkcg->lock);
452 spin_unlock(&blkio_list_lock);
453 return 0;
454} 572}
455 573
456static int 574static int
@@ -497,7 +615,11 @@ blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
497 } 615 }
498#endif 616#endif
499 spin_unlock(&blkg->stats_lock); 617 spin_unlock(&blkg->stats_lock);
618
619 /* Reset Per cpu stats which don't take blkg->stats_lock */
620 blkio_reset_stats_cpu(blkg);
500 } 621 }
622
501 spin_unlock_irq(&blkcg->lock); 623 spin_unlock_irq(&blkcg->lock);
502 return 0; 624 return 0;
503} 625}
@@ -543,6 +665,59 @@ static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
543 return val; 665 return val;
544} 666}
545 667
668
669static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg,
670 enum stat_type_cpu type, enum stat_sub_type sub_type)
671{
672 int cpu;
673 struct blkio_group_stats_cpu *stats_cpu;
674 u64 val = 0, tval;
675
676 for_each_possible_cpu(cpu) {
677 unsigned int start;
678 stats_cpu = per_cpu_ptr(blkg->stats_cpu, cpu);
679
680 do {
681 start = u64_stats_fetch_begin(&stats_cpu->syncp);
682 if (type == BLKIO_STAT_CPU_SECTORS)
683 tval = stats_cpu->sectors;
684 else
685 tval = stats_cpu->stat_arr_cpu[type][sub_type];
686 } while(u64_stats_fetch_retry(&stats_cpu->syncp, start));
687
688 val += tval;
689 }
690
691 return val;
692}
693
694static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg,
695 struct cgroup_map_cb *cb, dev_t dev, enum stat_type_cpu type)
696{
697 uint64_t disk_total, val;
698 char key_str[MAX_KEY_LEN];
699 enum stat_sub_type sub_type;
700
701 if (type == BLKIO_STAT_CPU_SECTORS) {
702 val = blkio_read_stat_cpu(blkg, type, 0);
703 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb, dev);
704 }
705
706 for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
707 sub_type++) {
708 blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
709 val = blkio_read_stat_cpu(blkg, type, sub_type);
710 cb->fill(cb, key_str, val);
711 }
712
713 disk_total = blkio_read_stat_cpu(blkg, type, BLKIO_STAT_READ) +
714 blkio_read_stat_cpu(blkg, type, BLKIO_STAT_WRITE);
715
716 blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
717 cb->fill(cb, key_str, disk_total);
718 return disk_total;
719}
720
546/* This should be called with blkg->stats_lock held */ 721/* This should be called with blkg->stats_lock held */
547static uint64_t blkio_get_stat(struct blkio_group *blkg, 722static uint64_t blkio_get_stat(struct blkio_group *blkg,
548 struct cgroup_map_cb *cb, dev_t dev, enum stat_type type) 723 struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
@@ -554,10 +729,10 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg,
554 if (type == BLKIO_STAT_TIME) 729 if (type == BLKIO_STAT_TIME)
555 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, 730 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
556 blkg->stats.time, cb, dev); 731 blkg->stats.time, cb, dev);
557 if (type == BLKIO_STAT_SECTORS)
558 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
559 blkg->stats.sectors, cb, dev);
560#ifdef CONFIG_DEBUG_BLK_CGROUP 732#ifdef CONFIG_DEBUG_BLK_CGROUP
733 if (type == BLKIO_STAT_UNACCOUNTED_TIME)
734 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
735 blkg->stats.unaccounted_time, cb, dev);
561 if (type == BLKIO_STAT_AVG_QUEUE_SIZE) { 736 if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
562 uint64_t sum = blkg->stats.avg_queue_size_sum; 737 uint64_t sum = blkg->stats.avg_queue_size_sum;
563 uint64_t samples = blkg->stats.avg_queue_size_samples; 738 uint64_t samples = blkg->stats.avg_queue_size_samples;
@@ -593,52 +768,6 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg,
593 return disk_total; 768 return disk_total;
594} 769}
595 770
596#define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total) \
597static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \
598 struct cftype *cftype, struct cgroup_map_cb *cb) \
599{ \
600 struct blkio_cgroup *blkcg; \
601 struct blkio_group *blkg; \
602 struct hlist_node *n; \
603 uint64_t cgroup_total = 0; \
604 \
605 if (!cgroup_lock_live_group(cgroup)) \
606 return -ENODEV; \
607 \
608 blkcg = cgroup_to_blkio_cgroup(cgroup); \
609 rcu_read_lock(); \
610 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\
611 if (blkg->dev) { \
612 spin_lock_irq(&blkg->stats_lock); \
613 cgroup_total += blkio_get_stat(blkg, cb, \
614 blkg->dev, type); \
615 spin_unlock_irq(&blkg->stats_lock); \
616 } \
617 } \
618 if (show_total) \
619 cb->fill(cb, "Total", cgroup_total); \
620 rcu_read_unlock(); \
621 cgroup_unlock(); \
622 return 0; \
623}
624
625SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0);
626SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0);
627SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1);
628SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1);
629SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1);
630SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1);
631SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1);
632SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1);
633#ifdef CONFIG_DEBUG_BLK_CGROUP
634SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0);
635SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0);
636SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0);
637SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0);
638SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0);
639#endif
640#undef SHOW_FUNCTION_PER_GROUP
641
642static int blkio_check_dev_num(dev_t dev) 771static int blkio_check_dev_num(dev_t dev)
643{ 772{
644 int part = 0; 773 int part = 0;
@@ -652,13 +781,14 @@ static int blkio_check_dev_num(dev_t dev)
652} 781}
653 782
654static int blkio_policy_parse_and_set(char *buf, 783static int blkio_policy_parse_and_set(char *buf,
655 struct blkio_policy_node *newpn) 784 struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid)
656{ 785{
657 char *s[4], *p, *major_s = NULL, *minor_s = NULL; 786 char *s[4], *p, *major_s = NULL, *minor_s = NULL;
658 int ret; 787 int ret;
659 unsigned long major, minor, temp; 788 unsigned long major, minor, temp;
660 int i = 0; 789 int i = 0;
661 dev_t dev; 790 dev_t dev;
791 u64 bps, iops;
662 792
663 memset(s, 0, sizeof(s)); 793 memset(s, 0, sizeof(s));
664 794
@@ -705,12 +835,47 @@ static int blkio_policy_parse_and_set(char *buf,
705 if (s[1] == NULL) 835 if (s[1] == NULL)
706 return -EINVAL; 836 return -EINVAL;
707 837
708 ret = strict_strtoul(s[1], 10, &temp); 838 switch (plid) {
709 if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) || 839 case BLKIO_POLICY_PROP:
710 temp > BLKIO_WEIGHT_MAX) 840 ret = strict_strtoul(s[1], 10, &temp);
711 return -EINVAL; 841 if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) ||
842 temp > BLKIO_WEIGHT_MAX)
843 return -EINVAL;
712 844
713 newpn->weight = temp; 845 newpn->plid = plid;
846 newpn->fileid = fileid;
847 newpn->val.weight = temp;
848 break;
849 case BLKIO_POLICY_THROTL:
850 switch(fileid) {
851 case BLKIO_THROTL_read_bps_device:
852 case BLKIO_THROTL_write_bps_device:
853 ret = strict_strtoull(s[1], 10, &bps);
854 if (ret)
855 return -EINVAL;
856
857 newpn->plid = plid;
858 newpn->fileid = fileid;
859 newpn->val.bps = bps;
860 break;
861 case BLKIO_THROTL_read_iops_device:
862 case BLKIO_THROTL_write_iops_device:
863 ret = strict_strtoull(s[1], 10, &iops);
864 if (ret)
865 return -EINVAL;
866
867 if (iops > THROTL_IOPS_MAX)
868 return -EINVAL;
869
870 newpn->plid = plid;
871 newpn->fileid = fileid;
872 newpn->val.iops = (unsigned int)iops;
873 break;
874 }
875 break;
876 default:
877 BUG();
878 }
714 879
715 return 0; 880 return 0;
716} 881}
@@ -720,26 +885,180 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
720{ 885{
721 struct blkio_policy_node *pn; 886 struct blkio_policy_node *pn;
722 887
723 pn = blkio_policy_search_node(blkcg, dev); 888 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP,
889 BLKIO_PROP_weight_device);
724 if (pn) 890 if (pn)
725 return pn->weight; 891 return pn->val.weight;
726 else 892 else
727 return blkcg->weight; 893 return blkcg->weight;
728} 894}
729EXPORT_SYMBOL_GPL(blkcg_get_weight); 895EXPORT_SYMBOL_GPL(blkcg_get_weight);
730 896
897uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
898{
899 struct blkio_policy_node *pn;
900
901 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
902 BLKIO_THROTL_read_bps_device);
903 if (pn)
904 return pn->val.bps;
905 else
906 return -1;
907}
908
909uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev)
910{
911 struct blkio_policy_node *pn;
912 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
913 BLKIO_THROTL_write_bps_device);
914 if (pn)
915 return pn->val.bps;
916 else
917 return -1;
918}
919
920unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev)
921{
922 struct blkio_policy_node *pn;
923
924 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
925 BLKIO_THROTL_read_iops_device);
926 if (pn)
927 return pn->val.iops;
928 else
929 return -1;
930}
931
932unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev)
933{
934 struct blkio_policy_node *pn;
935 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
936 BLKIO_THROTL_write_iops_device);
937 if (pn)
938 return pn->val.iops;
939 else
940 return -1;
941}
942
943/* Checks whether user asked for deleting a policy rule */
944static bool blkio_delete_rule_command(struct blkio_policy_node *pn)
945{
946 switch(pn->plid) {
947 case BLKIO_POLICY_PROP:
948 if (pn->val.weight == 0)
949 return 1;
950 break;
951 case BLKIO_POLICY_THROTL:
952 switch(pn->fileid) {
953 case BLKIO_THROTL_read_bps_device:
954 case BLKIO_THROTL_write_bps_device:
955 if (pn->val.bps == 0)
956 return 1;
957 break;
958 case BLKIO_THROTL_read_iops_device:
959 case BLKIO_THROTL_write_iops_device:
960 if (pn->val.iops == 0)
961 return 1;
962 }
963 break;
964 default:
965 BUG();
966 }
967
968 return 0;
969}
970
971static void blkio_update_policy_rule(struct blkio_policy_node *oldpn,
972 struct blkio_policy_node *newpn)
973{
974 switch(oldpn->plid) {
975 case BLKIO_POLICY_PROP:
976 oldpn->val.weight = newpn->val.weight;
977 break;
978 case BLKIO_POLICY_THROTL:
979 switch(newpn->fileid) {
980 case BLKIO_THROTL_read_bps_device:
981 case BLKIO_THROTL_write_bps_device:
982 oldpn->val.bps = newpn->val.bps;
983 break;
984 case BLKIO_THROTL_read_iops_device:
985 case BLKIO_THROTL_write_iops_device:
986 oldpn->val.iops = newpn->val.iops;
987 }
988 break;
989 default:
990 BUG();
991 }
992}
993
994/*
995 * Some rules/values in blkg have changed. Propagate those to respective
996 * policies.
997 */
998static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg,
999 struct blkio_group *blkg, struct blkio_policy_node *pn)
1000{
1001 unsigned int weight, iops;
1002 u64 bps;
1003
1004 switch(pn->plid) {
1005 case BLKIO_POLICY_PROP:
1006 weight = pn->val.weight ? pn->val.weight :
1007 blkcg->weight;
1008 blkio_update_group_weight(blkg, weight);
1009 break;
1010 case BLKIO_POLICY_THROTL:
1011 switch(pn->fileid) {
1012 case BLKIO_THROTL_read_bps_device:
1013 case BLKIO_THROTL_write_bps_device:
1014 bps = pn->val.bps ? pn->val.bps : (-1);
1015 blkio_update_group_bps(blkg, bps, pn->fileid);
1016 break;
1017 case BLKIO_THROTL_read_iops_device:
1018 case BLKIO_THROTL_write_iops_device:
1019 iops = pn->val.iops ? pn->val.iops : (-1);
1020 blkio_update_group_iops(blkg, iops, pn->fileid);
1021 break;
1022 }
1023 break;
1024 default:
1025 BUG();
1026 }
1027}
731 1028
732static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, 1029/*
733 const char *buffer) 1030 * A policy node rule has been updated. Propagate this update to all the
1031 * block groups which might be affected by this update.
1032 */
1033static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg,
1034 struct blkio_policy_node *pn)
1035{
1036 struct blkio_group *blkg;
1037 struct hlist_node *n;
1038
1039 spin_lock(&blkio_list_lock);
1040 spin_lock_irq(&blkcg->lock);
1041
1042 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
1043 if (pn->dev != blkg->dev || pn->plid != blkg->plid)
1044 continue;
1045 blkio_update_blkg_policy(blkcg, blkg, pn);
1046 }
1047
1048 spin_unlock_irq(&blkcg->lock);
1049 spin_unlock(&blkio_list_lock);
1050}
1051
1052static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
1053 const char *buffer)
734{ 1054{
735 int ret = 0; 1055 int ret = 0;
736 char *buf; 1056 char *buf;
737 struct blkio_policy_node *newpn, *pn; 1057 struct blkio_policy_node *newpn, *pn;
738 struct blkio_cgroup *blkcg; 1058 struct blkio_cgroup *blkcg;
739 struct blkio_group *blkg;
740 int keep_newpn = 0; 1059 int keep_newpn = 0;
741 struct hlist_node *n; 1060 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
742 struct blkio_policy_type *blkiop; 1061 int fileid = BLKIOFILE_ATTR(cft->private);
743 1062
744 buf = kstrdup(buffer, GFP_KERNEL); 1063 buf = kstrdup(buffer, GFP_KERNEL);
745 if (!buf) 1064 if (!buf)
@@ -751,7 +1070,7 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
751 goto free_buf; 1070 goto free_buf;
752 } 1071 }
753 1072
754 ret = blkio_policy_parse_and_set(buf, newpn); 1073 ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid);
755 if (ret) 1074 if (ret)
756 goto free_newpn; 1075 goto free_newpn;
757 1076
@@ -759,9 +1078,9 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
759 1078
760 spin_lock_irq(&blkcg->lock); 1079 spin_lock_irq(&blkcg->lock);
761 1080
762 pn = blkio_policy_search_node(blkcg, newpn->dev); 1081 pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid);
763 if (!pn) { 1082 if (!pn) {
764 if (newpn->weight != 0) { 1083 if (!blkio_delete_rule_command(newpn)) {
765 blkio_policy_insert_node(blkcg, newpn); 1084 blkio_policy_insert_node(blkcg, newpn);
766 keep_newpn = 1; 1085 keep_newpn = 1;
767 } 1086 }
@@ -769,33 +1088,17 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
769 goto update_io_group; 1088 goto update_io_group;
770 } 1089 }
771 1090
772 if (newpn->weight == 0) { 1091 if (blkio_delete_rule_command(newpn)) {
773 /* weight == 0 means deleteing a specific weight */
774 blkio_policy_delete_node(pn); 1092 blkio_policy_delete_node(pn);
775 spin_unlock_irq(&blkcg->lock); 1093 spin_unlock_irq(&blkcg->lock);
776 goto update_io_group; 1094 goto update_io_group;
777 } 1095 }
778 spin_unlock_irq(&blkcg->lock); 1096 spin_unlock_irq(&blkcg->lock);
779 1097
780 pn->weight = newpn->weight; 1098 blkio_update_policy_rule(pn, newpn);
781 1099
782update_io_group: 1100update_io_group:
783 /* update weight for each cfqg */ 1101 blkio_update_policy_node_blkg(blkcg, newpn);
784 spin_lock(&blkio_list_lock);
785 spin_lock_irq(&blkcg->lock);
786
787 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
788 if (newpn->dev == blkg->dev) {
789 list_for_each_entry(blkiop, &blkio_list, list)
790 blkiop->ops.blkio_update_group_weight_fn(blkg,
791 newpn->weight ?
792 newpn->weight :
793 blkcg->weight);
794 }
795 }
796
797 spin_unlock_irq(&blkcg->lock);
798 spin_unlock(&blkio_list_lock);
799 1102
800free_newpn: 1103free_newpn:
801 if (!keep_newpn) 1104 if (!keep_newpn)
@@ -805,23 +1108,264 @@ free_buf:
805 return ret; 1108 return ret;
806} 1109}
807 1110
808static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft, 1111static void
809 struct seq_file *m) 1112blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn)
810{ 1113{
811 struct blkio_cgroup *blkcg; 1114 switch(pn->plid) {
812 struct blkio_policy_node *pn; 1115 case BLKIO_POLICY_PROP:
1116 if (pn->fileid == BLKIO_PROP_weight_device)
1117 seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
1118 MINOR(pn->dev), pn->val.weight);
1119 break;
1120 case BLKIO_POLICY_THROTL:
1121 switch(pn->fileid) {
1122 case BLKIO_THROTL_read_bps_device:
1123 case BLKIO_THROTL_write_bps_device:
1124 seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev),
1125 MINOR(pn->dev), pn->val.bps);
1126 break;
1127 case BLKIO_THROTL_read_iops_device:
1128 case BLKIO_THROTL_write_iops_device:
1129 seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
1130 MINOR(pn->dev), pn->val.iops);
1131 break;
1132 }
1133 break;
1134 default:
1135 BUG();
1136 }
1137}
813 1138
814 seq_printf(m, "dev\tweight\n"); 1139/* cgroup files which read their data from policy nodes end up here */
1140static void blkio_read_policy_node_files(struct cftype *cft,
1141 struct blkio_cgroup *blkcg, struct seq_file *m)
1142{
1143 struct blkio_policy_node *pn;
815 1144
816 blkcg = cgroup_to_blkio_cgroup(cgrp);
817 if (!list_empty(&blkcg->policy_list)) { 1145 if (!list_empty(&blkcg->policy_list)) {
818 spin_lock_irq(&blkcg->lock); 1146 spin_lock_irq(&blkcg->lock);
819 list_for_each_entry(pn, &blkcg->policy_list, node) { 1147 list_for_each_entry(pn, &blkcg->policy_list, node) {
820 seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), 1148 if (!pn_matches_cftype(cft, pn))
821 MINOR(pn->dev), pn->weight); 1149 continue;
1150 blkio_print_policy_node(m, pn);
822 } 1151 }
823 spin_unlock_irq(&blkcg->lock); 1152 spin_unlock_irq(&blkcg->lock);
824 } 1153 }
1154}
1155
1156static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
1157 struct seq_file *m)
1158{
1159 struct blkio_cgroup *blkcg;
1160 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1161 int name = BLKIOFILE_ATTR(cft->private);
1162
1163 blkcg = cgroup_to_blkio_cgroup(cgrp);
1164
1165 switch(plid) {
1166 case BLKIO_POLICY_PROP:
1167 switch(name) {
1168 case BLKIO_PROP_weight_device:
1169 blkio_read_policy_node_files(cft, blkcg, m);
1170 return 0;
1171 default:
1172 BUG();
1173 }
1174 break;
1175 case BLKIO_POLICY_THROTL:
1176 switch(name){
1177 case BLKIO_THROTL_read_bps_device:
1178 case BLKIO_THROTL_write_bps_device:
1179 case BLKIO_THROTL_read_iops_device:
1180 case BLKIO_THROTL_write_iops_device:
1181 blkio_read_policy_node_files(cft, blkcg, m);
1182 return 0;
1183 default:
1184 BUG();
1185 }
1186 break;
1187 default:
1188 BUG();
1189 }
1190
1191 return 0;
1192}
1193
1194static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
1195 struct cftype *cft, struct cgroup_map_cb *cb,
1196 enum stat_type type, bool show_total, bool pcpu)
1197{
1198 struct blkio_group *blkg;
1199 struct hlist_node *n;
1200 uint64_t cgroup_total = 0;
1201
1202 rcu_read_lock();
1203 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
1204 if (blkg->dev) {
1205 if (!cftype_blkg_same_policy(cft, blkg))
1206 continue;
1207 if (pcpu)
1208 cgroup_total += blkio_get_stat_cpu(blkg, cb,
1209 blkg->dev, type);
1210 else {
1211 spin_lock_irq(&blkg->stats_lock);
1212 cgroup_total += blkio_get_stat(blkg, cb,
1213 blkg->dev, type);
1214 spin_unlock_irq(&blkg->stats_lock);
1215 }
1216 }
1217 }
1218 if (show_total)
1219 cb->fill(cb, "Total", cgroup_total);
1220 rcu_read_unlock();
1221 return 0;
1222}
1223
1224/* All map kind of cgroup file get serviced by this function */
1225static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
1226 struct cgroup_map_cb *cb)
1227{
1228 struct blkio_cgroup *blkcg;
1229 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1230 int name = BLKIOFILE_ATTR(cft->private);
1231
1232 blkcg = cgroup_to_blkio_cgroup(cgrp);
1233
1234 switch(plid) {
1235 case BLKIO_POLICY_PROP:
1236 switch(name) {
1237 case BLKIO_PROP_time:
1238 return blkio_read_blkg_stats(blkcg, cft, cb,
1239 BLKIO_STAT_TIME, 0, 0);
1240 case BLKIO_PROP_sectors:
1241 return blkio_read_blkg_stats(blkcg, cft, cb,
1242 BLKIO_STAT_CPU_SECTORS, 0, 1);
1243 case BLKIO_PROP_io_service_bytes:
1244 return blkio_read_blkg_stats(blkcg, cft, cb,
1245 BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
1246 case BLKIO_PROP_io_serviced:
1247 return blkio_read_blkg_stats(blkcg, cft, cb,
1248 BLKIO_STAT_CPU_SERVICED, 1, 1);
1249 case BLKIO_PROP_io_service_time:
1250 return blkio_read_blkg_stats(blkcg, cft, cb,
1251 BLKIO_STAT_SERVICE_TIME, 1, 0);
1252 case BLKIO_PROP_io_wait_time:
1253 return blkio_read_blkg_stats(blkcg, cft, cb,
1254 BLKIO_STAT_WAIT_TIME, 1, 0);
1255 case BLKIO_PROP_io_merged:
1256 return blkio_read_blkg_stats(blkcg, cft, cb,
1257 BLKIO_STAT_CPU_MERGED, 1, 1);
1258 case BLKIO_PROP_io_queued:
1259 return blkio_read_blkg_stats(blkcg, cft, cb,
1260 BLKIO_STAT_QUEUED, 1, 0);
1261#ifdef CONFIG_DEBUG_BLK_CGROUP
1262 case BLKIO_PROP_unaccounted_time:
1263 return blkio_read_blkg_stats(blkcg, cft, cb,
1264 BLKIO_STAT_UNACCOUNTED_TIME, 0, 0);
1265 case BLKIO_PROP_dequeue:
1266 return blkio_read_blkg_stats(blkcg, cft, cb,
1267 BLKIO_STAT_DEQUEUE, 0, 0);
1268 case BLKIO_PROP_avg_queue_size:
1269 return blkio_read_blkg_stats(blkcg, cft, cb,
1270 BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0);
1271 case BLKIO_PROP_group_wait_time:
1272 return blkio_read_blkg_stats(blkcg, cft, cb,
1273 BLKIO_STAT_GROUP_WAIT_TIME, 0, 0);
1274 case BLKIO_PROP_idle_time:
1275 return blkio_read_blkg_stats(blkcg, cft, cb,
1276 BLKIO_STAT_IDLE_TIME, 0, 0);
1277 case BLKIO_PROP_empty_time:
1278 return blkio_read_blkg_stats(blkcg, cft, cb,
1279 BLKIO_STAT_EMPTY_TIME, 0, 0);
1280#endif
1281 default:
1282 BUG();
1283 }
1284 break;
1285 case BLKIO_POLICY_THROTL:
1286 switch(name){
1287 case BLKIO_THROTL_io_service_bytes:
1288 return blkio_read_blkg_stats(blkcg, cft, cb,
1289 BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
1290 case BLKIO_THROTL_io_serviced:
1291 return blkio_read_blkg_stats(blkcg, cft, cb,
1292 BLKIO_STAT_CPU_SERVICED, 1, 1);
1293 default:
1294 BUG();
1295 }
1296 break;
1297 default:
1298 BUG();
1299 }
1300
1301 return 0;
1302}
1303
1304static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val)
1305{
1306 struct blkio_group *blkg;
1307 struct hlist_node *n;
1308 struct blkio_policy_node *pn;
1309
1310 if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
1311 return -EINVAL;
1312
1313 spin_lock(&blkio_list_lock);
1314 spin_lock_irq(&blkcg->lock);
1315 blkcg->weight = (unsigned int)val;
1316
1317 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
1318 pn = blkio_policy_search_node(blkcg, blkg->dev,
1319 BLKIO_POLICY_PROP, BLKIO_PROP_weight_device);
1320 if (pn)
1321 continue;
1322
1323 blkio_update_group_weight(blkg, blkcg->weight);
1324 }
1325 spin_unlock_irq(&blkcg->lock);
1326 spin_unlock(&blkio_list_lock);
1327 return 0;
1328}
1329
1330static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
1331 struct blkio_cgroup *blkcg;
1332 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1333 int name = BLKIOFILE_ATTR(cft->private);
1334
1335 blkcg = cgroup_to_blkio_cgroup(cgrp);
1336
1337 switch(plid) {
1338 case BLKIO_POLICY_PROP:
1339 switch(name) {
1340 case BLKIO_PROP_weight:
1341 return (u64)blkcg->weight;
1342 }
1343 break;
1344 default:
1345 BUG();
1346 }
1347 return 0;
1348}
1349
1350static int
1351blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1352{
1353 struct blkio_cgroup *blkcg;
1354 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1355 int name = BLKIOFILE_ATTR(cft->private);
1356
1357 blkcg = cgroup_to_blkio_cgroup(cgrp);
1358
1359 switch(plid) {
1360 case BLKIO_POLICY_PROP:
1361 switch(name) {
1362 case BLKIO_PROP_weight:
1363 return blkio_weight_write(blkcg, val);
1364 }
1365 break;
1366 default:
1367 BUG();
1368 }
825 1369
826 return 0; 1370 return 0;
827} 1371}
@@ -829,71 +1373,157 @@ static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft,
829struct cftype blkio_files[] = { 1373struct cftype blkio_files[] = {
830 { 1374 {
831 .name = "weight_device", 1375 .name = "weight_device",
832 .read_seq_string = blkiocg_weight_device_read, 1376 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
833 .write_string = blkiocg_weight_device_write, 1377 BLKIO_PROP_weight_device),
1378 .read_seq_string = blkiocg_file_read,
1379 .write_string = blkiocg_file_write,
834 .max_write_len = 256, 1380 .max_write_len = 256,
835 }, 1381 },
836 { 1382 {
837 .name = "weight", 1383 .name = "weight",
838 .read_u64 = blkiocg_weight_read, 1384 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
839 .write_u64 = blkiocg_weight_write, 1385 BLKIO_PROP_weight),
1386 .read_u64 = blkiocg_file_read_u64,
1387 .write_u64 = blkiocg_file_write_u64,
840 }, 1388 },
841 { 1389 {
842 .name = "time", 1390 .name = "time",
843 .read_map = blkiocg_time_read, 1391 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1392 BLKIO_PROP_time),
1393 .read_map = blkiocg_file_read_map,
844 }, 1394 },
845 { 1395 {
846 .name = "sectors", 1396 .name = "sectors",
847 .read_map = blkiocg_sectors_read, 1397 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1398 BLKIO_PROP_sectors),
1399 .read_map = blkiocg_file_read_map,
848 }, 1400 },
849 { 1401 {
850 .name = "io_service_bytes", 1402 .name = "io_service_bytes",
851 .read_map = blkiocg_io_service_bytes_read, 1403 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1404 BLKIO_PROP_io_service_bytes),
1405 .read_map = blkiocg_file_read_map,
852 }, 1406 },
853 { 1407 {
854 .name = "io_serviced", 1408 .name = "io_serviced",
855 .read_map = blkiocg_io_serviced_read, 1409 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1410 BLKIO_PROP_io_serviced),
1411 .read_map = blkiocg_file_read_map,
856 }, 1412 },
857 { 1413 {
858 .name = "io_service_time", 1414 .name = "io_service_time",
859 .read_map = blkiocg_io_service_time_read, 1415 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1416 BLKIO_PROP_io_service_time),
1417 .read_map = blkiocg_file_read_map,
860 }, 1418 },
861 { 1419 {
862 .name = "io_wait_time", 1420 .name = "io_wait_time",
863 .read_map = blkiocg_io_wait_time_read, 1421 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1422 BLKIO_PROP_io_wait_time),
1423 .read_map = blkiocg_file_read_map,
864 }, 1424 },
865 { 1425 {
866 .name = "io_merged", 1426 .name = "io_merged",
867 .read_map = blkiocg_io_merged_read, 1427 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1428 BLKIO_PROP_io_merged),
1429 .read_map = blkiocg_file_read_map,
868 }, 1430 },
869 { 1431 {
870 .name = "io_queued", 1432 .name = "io_queued",
871 .read_map = blkiocg_io_queued_read, 1433 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1434 BLKIO_PROP_io_queued),
1435 .read_map = blkiocg_file_read_map,
872 }, 1436 },
873 { 1437 {
874 .name = "reset_stats", 1438 .name = "reset_stats",
875 .write_u64 = blkiocg_reset_stats, 1439 .write_u64 = blkiocg_reset_stats,
876 }, 1440 },
1441#ifdef CONFIG_BLK_DEV_THROTTLING
1442 {
1443 .name = "throttle.read_bps_device",
1444 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1445 BLKIO_THROTL_read_bps_device),
1446 .read_seq_string = blkiocg_file_read,
1447 .write_string = blkiocg_file_write,
1448 .max_write_len = 256,
1449 },
1450
1451 {
1452 .name = "throttle.write_bps_device",
1453 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1454 BLKIO_THROTL_write_bps_device),
1455 .read_seq_string = blkiocg_file_read,
1456 .write_string = blkiocg_file_write,
1457 .max_write_len = 256,
1458 },
1459
1460 {
1461 .name = "throttle.read_iops_device",
1462 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1463 BLKIO_THROTL_read_iops_device),
1464 .read_seq_string = blkiocg_file_read,
1465 .write_string = blkiocg_file_write,
1466 .max_write_len = 256,
1467 },
1468
1469 {
1470 .name = "throttle.write_iops_device",
1471 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1472 BLKIO_THROTL_write_iops_device),
1473 .read_seq_string = blkiocg_file_read,
1474 .write_string = blkiocg_file_write,
1475 .max_write_len = 256,
1476 },
1477 {
1478 .name = "throttle.io_service_bytes",
1479 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1480 BLKIO_THROTL_io_service_bytes),
1481 .read_map = blkiocg_file_read_map,
1482 },
1483 {
1484 .name = "throttle.io_serviced",
1485 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1486 BLKIO_THROTL_io_serviced),
1487 .read_map = blkiocg_file_read_map,
1488 },
1489#endif /* CONFIG_BLK_DEV_THROTTLING */
1490
877#ifdef CONFIG_DEBUG_BLK_CGROUP 1491#ifdef CONFIG_DEBUG_BLK_CGROUP
878 { 1492 {
879 .name = "avg_queue_size", 1493 .name = "avg_queue_size",
880 .read_map = blkiocg_avg_queue_size_read, 1494 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1495 BLKIO_PROP_avg_queue_size),
1496 .read_map = blkiocg_file_read_map,
881 }, 1497 },
882 { 1498 {
883 .name = "group_wait_time", 1499 .name = "group_wait_time",
884 .read_map = blkiocg_group_wait_time_read, 1500 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1501 BLKIO_PROP_group_wait_time),
1502 .read_map = blkiocg_file_read_map,
885 }, 1503 },
886 { 1504 {
887 .name = "idle_time", 1505 .name = "idle_time",
888 .read_map = blkiocg_idle_time_read, 1506 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1507 BLKIO_PROP_idle_time),
1508 .read_map = blkiocg_file_read_map,
889 }, 1509 },
890 { 1510 {
891 .name = "empty_time", 1511 .name = "empty_time",
892 .read_map = blkiocg_empty_time_read, 1512 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1513 BLKIO_PROP_empty_time),
1514 .read_map = blkiocg_file_read_map,
893 }, 1515 },
894 { 1516 {
895 .name = "dequeue", 1517 .name = "dequeue",
896 .read_map = blkiocg_dequeue_read, 1518 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1519 BLKIO_PROP_dequeue),
1520 .read_map = blkiocg_file_read_map,
1521 },
1522 {
1523 .name = "unaccounted_time",
1524 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1525 BLKIO_PROP_unaccounted_time),
1526 .read_map = blkiocg_file_read_map,
897 }, 1527 },
898#endif 1528#endif
899}; 1529};
@@ -932,13 +1562,14 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
932 /* 1562 /*
933 * This blkio_group is being unlinked as associated cgroup is 1563 * This blkio_group is being unlinked as associated cgroup is
934 * going away. Let all the IO controlling policies know about 1564 * going away. Let all the IO controlling policies know about
935 * this event. Currently this is static call to one io 1565 * this event.
936 * controlling policy. Once we have more policies in place, we
937 * need some dynamic registration of callback function.
938 */ 1566 */
939 spin_lock(&blkio_list_lock); 1567 spin_lock(&blkio_list_lock);
940 list_for_each_entry(blkiop, &blkio_list, list) 1568 list_for_each_entry(blkiop, &blkio_list, list) {
1569 if (blkiop->plid != blkg->plid)
1570 continue;
941 blkiop->ops.blkio_unlink_group_fn(key, blkg); 1571 blkiop->ops.blkio_unlink_group_fn(key, blkg);
1572 }
942 spin_unlock(&blkio_list_lock); 1573 spin_unlock(&blkio_list_lock);
943 } while (1); 1574 } while (1);
944 1575
@@ -964,10 +1595,6 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
964 goto done; 1595 goto done;
965 } 1596 }
966 1597
967 /* Currently we do not support hierarchy deeper than two level (0,1) */
968 if (parent != cgroup->top_cgroup)
969 return ERR_PTR(-EPERM);
970
971 blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); 1598 blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
972 if (!blkcg) 1599 if (!blkcg)
973 return ERR_PTR(-ENOMEM); 1600 return ERR_PTR(-ENOMEM);
@@ -987,9 +1614,7 @@ done:
987 * of the main cic data structures. For now we allow a task to change 1614 * of the main cic data structures. For now we allow a task to change
988 * its cgroup only if it's the only owner of its ioc. 1615 * its cgroup only if it's the only owner of its ioc.
989 */ 1616 */
990static int blkiocg_can_attach(struct cgroup_subsys *subsys, 1617static int blkiocg_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
991 struct cgroup *cgroup, struct task_struct *tsk,
992 bool threadgroup)
993{ 1618{
994 struct io_context *ioc; 1619 struct io_context *ioc;
995 int ret = 0; 1620 int ret = 0;
@@ -1004,9 +1629,7 @@ static int blkiocg_can_attach(struct cgroup_subsys *subsys,
1004 return ret; 1629 return ret;
1005} 1630}
1006 1631
1007static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup, 1632static void blkiocg_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1008 struct cgroup *prev, struct task_struct *tsk,
1009 bool threadgroup)
1010{ 1633{
1011 struct io_context *ioc; 1634 struct io_context *ioc;
1012 1635
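
The plid check added to blkiocg_destroy() above relies on each IO controlling policy registering itself with a policy id, so only the owning policy's unlink callback runs for a given blkio_group. A rough, self-contained sketch of such a registration; the example_* names are placeholders for what a policy like blk-throttle would actually provide, and blkio_policy_register() is assumed from the "Blkio controller policy registration" section of blk-cgroup.h:

    #include "blk-cgroup.h"

    static void example_unlink_group(void *key, struct blkio_group *blkg)
    {
            /* tear down this policy's per-group state; details omitted */
    }

    static struct blkio_policy_type example_policy = {
            .ops    = { .blkio_unlink_group_fn = example_unlink_group },
            .plid   = BLKIO_POLICY_THROTL,
    };

    static int __init example_policy_init(void)
    {
            blkio_policy_register(&example_policy);
            return 0;
    }
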
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 2b866ec1dcea..a71d2904ffb9 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -14,6 +14,15 @@
14 */ 14 */
15 15
16#include <linux/cgroup.h> 16#include <linux/cgroup.h>
17#include <linux/u64_stats_sync.h>
18
19enum blkio_policy_id {
20 BLKIO_POLICY_PROP = 0, /* Proportional Bandwidth division */
21 BLKIO_POLICY_THROTL, /* Throttling */
22};
23
24/* Max limits for throttle policy */
25#define THROTL_IOPS_MAX UINT_MAX
17 26
18#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) 27#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
19 28
@@ -28,20 +37,15 @@ enum stat_type {
 28 * request completion for IOs done by this cgroup. This may not be 37
29 * accurate when NCQ is turned on. */ 38 * accurate when NCQ is turned on. */
30 BLKIO_STAT_SERVICE_TIME = 0, 39 BLKIO_STAT_SERVICE_TIME = 0,
31 /* Total bytes transferred */
32 BLKIO_STAT_SERVICE_BYTES,
33 /* Total IOs serviced, post merge */
34 BLKIO_STAT_SERVICED,
35 /* Total time spent waiting in scheduler queue in ns */ 40 /* Total time spent waiting in scheduler queue in ns */
36 BLKIO_STAT_WAIT_TIME, 41 BLKIO_STAT_WAIT_TIME,
37 /* Number of IOs merged */
38 BLKIO_STAT_MERGED,
39 /* Number of IOs queued up */ 42 /* Number of IOs queued up */
40 BLKIO_STAT_QUEUED, 43 BLKIO_STAT_QUEUED,
41 /* All the single valued stats go below this */ 44 /* All the single valued stats go below this */
42 BLKIO_STAT_TIME, 45 BLKIO_STAT_TIME,
43 BLKIO_STAT_SECTORS,
44#ifdef CONFIG_DEBUG_BLK_CGROUP 46#ifdef CONFIG_DEBUG_BLK_CGROUP
47 /* Time not charged to this cgroup */
48 BLKIO_STAT_UNACCOUNTED_TIME,
45 BLKIO_STAT_AVG_QUEUE_SIZE, 49 BLKIO_STAT_AVG_QUEUE_SIZE,
46 BLKIO_STAT_IDLE_TIME, 50 BLKIO_STAT_IDLE_TIME,
47 BLKIO_STAT_EMPTY_TIME, 51 BLKIO_STAT_EMPTY_TIME,
@@ -50,6 +54,18 @@ enum stat_type {
50#endif 54#endif
51}; 55};
52 56
57/* Per cpu stats */
58enum stat_type_cpu {
59 BLKIO_STAT_CPU_SECTORS,
60 /* Total bytes transferred */
61 BLKIO_STAT_CPU_SERVICE_BYTES,
62 /* Total IOs serviced, post merge */
63 BLKIO_STAT_CPU_SERVICED,
64 /* Number of IOs merged */
65 BLKIO_STAT_CPU_MERGED,
66 BLKIO_STAT_CPU_NR
67};
68
53enum stat_sub_type { 69enum stat_sub_type {
54 BLKIO_STAT_READ = 0, 70 BLKIO_STAT_READ = 0,
55 BLKIO_STAT_WRITE, 71 BLKIO_STAT_WRITE,
@@ -65,6 +81,36 @@ enum blkg_state_flags {
65 BLKG_empty, 81 BLKG_empty,
66}; 82};
67 83
84/* cgroup files owned by proportional weight policy */
85enum blkcg_file_name_prop {
86 BLKIO_PROP_weight = 1,
87 BLKIO_PROP_weight_device,
88 BLKIO_PROP_io_service_bytes,
89 BLKIO_PROP_io_serviced,
90 BLKIO_PROP_time,
91 BLKIO_PROP_sectors,
92 BLKIO_PROP_unaccounted_time,
93 BLKIO_PROP_io_service_time,
94 BLKIO_PROP_io_wait_time,
95 BLKIO_PROP_io_merged,
96 BLKIO_PROP_io_queued,
97 BLKIO_PROP_avg_queue_size,
98 BLKIO_PROP_group_wait_time,
99 BLKIO_PROP_idle_time,
100 BLKIO_PROP_empty_time,
101 BLKIO_PROP_dequeue,
102};
103
104/* cgroup files owned by throttle policy */
105enum blkcg_file_name_throtl {
106 BLKIO_THROTL_read_bps_device,
107 BLKIO_THROTL_write_bps_device,
108 BLKIO_THROTL_read_iops_device,
109 BLKIO_THROTL_write_iops_device,
110 BLKIO_THROTL_io_service_bytes,
111 BLKIO_THROTL_io_serviced,
112};
113
68struct blkio_cgroup { 114struct blkio_cgroup {
69 struct cgroup_subsys_state css; 115 struct cgroup_subsys_state css;
70 unsigned int weight; 116 unsigned int weight;
@@ -76,9 +122,11 @@ struct blkio_cgroup {
76struct blkio_group_stats { 122struct blkio_group_stats {
77 /* total disk time and nr sectors dispatched by this group */ 123 /* total disk time and nr sectors dispatched by this group */
78 uint64_t time; 124 uint64_t time;
79 uint64_t sectors;
80 uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL]; 125 uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL];
81#ifdef CONFIG_DEBUG_BLK_CGROUP 126#ifdef CONFIG_DEBUG_BLK_CGROUP
127 /* Time not charged to this cgroup */
128 uint64_t unaccounted_time;
129
82 /* Sum of number of IOs queued across all samples */ 130 /* Sum of number of IOs queued across all samples */
83 uint64_t avg_queue_size_sum; 131 uint64_t avg_queue_size_sum;
84 /* Count of samples taken for average */ 132 /* Count of samples taken for average */
@@ -103,6 +151,13 @@ struct blkio_group_stats {
103#endif 151#endif
104}; 152};
105 153
154/* Per cpu blkio group stats */
155struct blkio_group_stats_cpu {
156 uint64_t sectors;
157 uint64_t stat_arr_cpu[BLKIO_STAT_CPU_NR][BLKIO_STAT_TOTAL];
158 struct u64_stats_sync syncp;
159};
160
106struct blkio_group { 161struct blkio_group {
107 /* An rcu protected unique identifier for the group */ 162 /* An rcu protected unique identifier for the group */
108 void *key; 163 void *key;
@@ -112,33 +167,73 @@ struct blkio_group {
112 char path[128]; 167 char path[128];
113 /* The device MKDEV(major, minor), this group has been created for */ 168 /* The device MKDEV(major, minor), this group has been created for */
114 dev_t dev; 169 dev_t dev;
170 /* policy which owns this blk group */
171 enum blkio_policy_id plid;
115 172
116 /* Need to serialize the stats in the case of reset/update */ 173 /* Need to serialize the stats in the case of reset/update */
117 spinlock_t stats_lock; 174 spinlock_t stats_lock;
118 struct blkio_group_stats stats; 175 struct blkio_group_stats stats;
176 /* Per cpu stats pointer */
177 struct blkio_group_stats_cpu __percpu *stats_cpu;
119}; 178};
120 179
121struct blkio_policy_node { 180struct blkio_policy_node {
122 struct list_head node; 181 struct list_head node;
123 dev_t dev; 182 dev_t dev;
 124 unsigned int weight; 183 /* This node belongs to max bw policy or proportional weight policy */
184 enum blkio_policy_id plid;
185 /* cgroup file to which this rule belongs to */
186 int fileid;
187
188 union {
189 unsigned int weight;
190 /*
 191 * Rate read/write in terms of bytes per second
192 * Whether this rate represents read or write is determined
193 * by file type "fileid".
194 */
195 u64 bps;
196 unsigned int iops;
197 } val;
125}; 198};
126 199
127extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, 200extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
128 dev_t dev); 201 dev_t dev);
202extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg,
203 dev_t dev);
204extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg,
205 dev_t dev);
206extern unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg,
207 dev_t dev);
208extern unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg,
209 dev_t dev);
129 210
130typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); 211typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg);
131typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg, 212
132 unsigned int weight); 213typedef void (blkio_update_group_weight_fn) (void *key,
214 struct blkio_group *blkg, unsigned int weight);
215typedef void (blkio_update_group_read_bps_fn) (void * key,
216 struct blkio_group *blkg, u64 read_bps);
217typedef void (blkio_update_group_write_bps_fn) (void *key,
218 struct blkio_group *blkg, u64 write_bps);
219typedef void (blkio_update_group_read_iops_fn) (void *key,
220 struct blkio_group *blkg, unsigned int read_iops);
221typedef void (blkio_update_group_write_iops_fn) (void *key,
222 struct blkio_group *blkg, unsigned int write_iops);
133 223
134struct blkio_policy_ops { 224struct blkio_policy_ops {
135 blkio_unlink_group_fn *blkio_unlink_group_fn; 225 blkio_unlink_group_fn *blkio_unlink_group_fn;
136 blkio_update_group_weight_fn *blkio_update_group_weight_fn; 226 blkio_update_group_weight_fn *blkio_update_group_weight_fn;
227 blkio_update_group_read_bps_fn *blkio_update_group_read_bps_fn;
228 blkio_update_group_write_bps_fn *blkio_update_group_write_bps_fn;
229 blkio_update_group_read_iops_fn *blkio_update_group_read_iops_fn;
230 blkio_update_group_write_iops_fn *blkio_update_group_write_iops_fn;
137}; 231};
138 232
139struct blkio_policy_type { 233struct blkio_policy_type {
140 struct list_head list; 234 struct list_head list;
141 struct blkio_policy_ops ops; 235 struct blkio_policy_ops ops;
236 enum blkio_policy_id plid;
142}; 237};
143 238
144/* Blkio controller policy registration */ 239/* Blkio controller policy registration */
@@ -165,7 +260,7 @@ static inline char *blkg_path(struct blkio_group *blkg) { return NULL; }
165 260
166#endif 261#endif
167 262
168#define BLKIO_WEIGHT_MIN 100 263#define BLKIO_WEIGHT_MIN 10
169#define BLKIO_WEIGHT_MAX 1000 264#define BLKIO_WEIGHT_MAX 1000
170#define BLKIO_WEIGHT_DEFAULT 500 265#define BLKIO_WEIGHT_DEFAULT 500
171 266
@@ -211,13 +306,17 @@ static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {}
211#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) 306#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
212extern struct blkio_cgroup blkio_root_cgroup; 307extern struct blkio_cgroup blkio_root_cgroup;
213extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); 308extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
309extern struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk);
214extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 310extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
215 struct blkio_group *blkg, void *key, dev_t dev); 311 struct blkio_group *blkg, void *key, dev_t dev,
312 enum blkio_policy_id plid);
313extern int blkio_alloc_blkg_stats(struct blkio_group *blkg);
216extern int blkiocg_del_blkio_group(struct blkio_group *blkg); 314extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
217extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, 315extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
218 void *key); 316 void *key);
219void blkiocg_update_timeslice_used(struct blkio_group *blkg, 317void blkiocg_update_timeslice_used(struct blkio_group *blkg,
220 unsigned long time); 318 unsigned long time,
319 unsigned long unaccounted_time);
221void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes, 320void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes,
222 bool direction, bool sync); 321 bool direction, bool sync);
223void blkiocg_update_completion_stats(struct blkio_group *blkg, 322void blkiocg_update_completion_stats(struct blkio_group *blkg,
@@ -232,9 +331,14 @@ void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
232struct cgroup; 331struct cgroup;
233static inline struct blkio_cgroup * 332static inline struct blkio_cgroup *
234cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } 333cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
334static inline struct blkio_cgroup *
335task_blkio_cgroup(struct task_struct *tsk) { return NULL; }
235 336
236static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 337static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
237 struct blkio_group *blkg, void *key, dev_t dev) {} 338 struct blkio_group *blkg, void *key, dev_t dev,
339 enum blkio_policy_id plid) {}
340
341static inline int blkio_alloc_blkg_stats(struct blkio_group *blkg) { return 0; }
238 342
239static inline int 343static inline int
240blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } 344blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
@@ -242,7 +346,9 @@ blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
242static inline struct blkio_group * 346static inline struct blkio_group *
243blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } 347blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; }
244static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg, 348static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg,
245 unsigned long time) {} 349 unsigned long time,
350 unsigned long unaccounted_time)
351{}
246static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg, 352static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
247 uint64_t bytes, bool direction, bool sync) {} 353 uint64_t bytes, bool direction, bool sync) {}
248static inline void blkiocg_update_completion_stats(struct blkio_group *blkg, 354static inline void blkiocg_update_completion_stats(struct blkio_group *blkg,
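
The new blkio_group_stats_cpu above pairs its counters with a u64_stats_sync, so updates can stay lockless per cpu while 32-bit readers still see consistent 64-bit values. A minimal sketch of the expected update/read pattern, using illustrative helper names rather than the exact ones blk-cgroup.c uses, and assuming the writer runs with preemption disabled (for example under the queue lock):

    /* Update side: runs on the local cpu, no group-wide stats_lock needed. */
    static void blkio_add_sectors_cpu(struct blkio_group *blkg, uint64_t bytes)
    {
            struct blkio_group_stats_cpu *sc = this_cpu_ptr(blkg->stats_cpu);

            u64_stats_update_begin(&sc->syncp);
            sc->sectors += bytes >> 9;
            u64_stats_update_end(&sc->syncp);
    }

    /* Read side: retry if an update raced with us (matters on 32-bit). */
    static uint64_t blkio_read_sectors_cpu(struct blkio_group *blkg, int cpu)
    {
            struct blkio_group_stats_cpu *sc = per_cpu_ptr(blkg->stats_cpu, cpu);
            unsigned int start;
            uint64_t val;

            do {
                    start = u64_stats_fetch_begin(&sc->syncp);
                    val = sc->sectors;
            } while (u64_stats_fetch_retry(&sc->syncp, start));

            return val;
    }
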
diff --git a/block/blk-core.c b/block/blk-core.c
index 32a1c123dfb3..d2f8f4049abd 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -27,13 +27,14 @@
27#include <linux/writeback.h> 27#include <linux/writeback.h>
28#include <linux/task_io_accounting_ops.h> 28#include <linux/task_io_accounting_ops.h>
29#include <linux/fault-inject.h> 29#include <linux/fault-inject.h>
30#include <linux/list_sort.h>
30 31
31#define CREATE_TRACE_POINTS 32#define CREATE_TRACE_POINTS
32#include <trace/events/block.h> 33#include <trace/events/block.h>
33 34
34#include "blk.h" 35#include "blk.h"
35 36
36EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap); 37EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
37EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); 38EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
38EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); 39EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
39 40
@@ -64,13 +65,27 @@ static void drive_stat_acct(struct request *rq, int new_io)
64 return; 65 return;
65 66
66 cpu = part_stat_lock(); 67 cpu = part_stat_lock();
67 part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
68 68
69 if (!new_io) 69 if (!new_io) {
70 part = rq->part;
70 part_stat_inc(cpu, part, merges[rw]); 71 part_stat_inc(cpu, part, merges[rw]);
71 else { 72 } else {
73 part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
74 if (!hd_struct_try_get(part)) {
75 /*
76 * The partition is already being removed,
77 * the request will be accounted on the disk only
78 *
79 * We take a reference on disk->part0 although that
80 * partition will never be deleted, so we can treat
81 * it as any other partition.
82 */
83 part = &rq->rq_disk->part0;
84 hd_struct_get(part);
85 }
72 part_round_stats(cpu, part); 86 part_round_stats(cpu, part);
73 part_inc_in_flight(part, rw); 87 part_inc_in_flight(part, rw);
88 rq->part = part;
74 } 89 }
75 90
76 part_stat_unlock(); 91 part_stat_unlock();
@@ -128,46 +143,36 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
128 rq->ref_count = 1; 143 rq->ref_count = 1;
129 rq->start_time = jiffies; 144 rq->start_time = jiffies;
130 set_start_time_ns(rq); 145 set_start_time_ns(rq);
146 rq->part = NULL;
131} 147}
132EXPORT_SYMBOL(blk_rq_init); 148EXPORT_SYMBOL(blk_rq_init);
133 149
134static void req_bio_endio(struct request *rq, struct bio *bio, 150static void req_bio_endio(struct request *rq, struct bio *bio,
135 unsigned int nbytes, int error) 151 unsigned int nbytes, int error)
136{ 152{
137 struct request_queue *q = rq->q; 153 if (error)
138 154 clear_bit(BIO_UPTODATE, &bio->bi_flags);
139 if (&q->bar_rq != rq) { 155 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
140 if (error) 156 error = -EIO;
141 clear_bit(BIO_UPTODATE, &bio->bi_flags);
142 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
143 error = -EIO;
144
145 if (unlikely(nbytes > bio->bi_size)) {
146 printk(KERN_ERR "%s: want %u bytes done, %u left\n",
147 __func__, nbytes, bio->bi_size);
148 nbytes = bio->bi_size;
149 }
150 157
151 if (unlikely(rq->cmd_flags & REQ_QUIET)) 158 if (unlikely(nbytes > bio->bi_size)) {
152 set_bit(BIO_QUIET, &bio->bi_flags); 159 printk(KERN_ERR "%s: want %u bytes done, %u left\n",
160 __func__, nbytes, bio->bi_size);
161 nbytes = bio->bi_size;
162 }
153 163
154 bio->bi_size -= nbytes; 164 if (unlikely(rq->cmd_flags & REQ_QUIET))
155 bio->bi_sector += (nbytes >> 9); 165 set_bit(BIO_QUIET, &bio->bi_flags);
156 166
157 if (bio_integrity(bio)) 167 bio->bi_size -= nbytes;
158 bio_integrity_advance(bio, nbytes); 168 bio->bi_sector += (nbytes >> 9);
159 169
160 if (bio->bi_size == 0) 170 if (bio_integrity(bio))
161 bio_endio(bio, error); 171 bio_integrity_advance(bio, nbytes);
162 } else {
163 172
164 /* 173 /* don't actually finish bio if it's part of flush sequence */
165 * Okay, this is the barrier request in progress, just 174 if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
166 * record the error; 175 bio_endio(bio, error);
167 */
168 if (error && !q->orderr)
169 q->orderr = error;
170 }
171} 176}
172 177
173void blk_dump_rq_flags(struct request *rq, char *msg) 178void blk_dump_rq_flags(struct request *rq, char *msg)
@@ -193,136 +198,32 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
193} 198}
194EXPORT_SYMBOL(blk_dump_rq_flags); 199EXPORT_SYMBOL(blk_dump_rq_flags);
195 200
196/* 201static void blk_delay_work(struct work_struct *work)
197 * "plug" the device if there are no outstanding requests: this will
198 * force the transfer to start only after we have put all the requests
199 * on the list.
200 *
201 * This is called with interrupts off and no requests on the queue and
202 * with the queue lock held.
203 */
204void blk_plug_device(struct request_queue *q)
205{ 202{
206 WARN_ON(!irqs_disabled()); 203 struct request_queue *q;
207
208 /*
209 * don't plug a stopped queue, it must be paired with blk_start_queue()
210 * which will restart the queueing
211 */
212 if (blk_queue_stopped(q))
213 return;
214 204
215 if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q)) { 205 q = container_of(work, struct request_queue, delay_work.work);
216 mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); 206 spin_lock_irq(q->queue_lock);
217 trace_block_plug(q); 207 __blk_run_queue(q);
218 } 208 spin_unlock_irq(q->queue_lock);
219} 209}
220EXPORT_SYMBOL(blk_plug_device);
221 210
222/** 211/**
223 * blk_plug_device_unlocked - plug a device without queue lock held 212 * blk_delay_queue - restart queueing after defined interval
224 * @q: The &struct request_queue to plug 213 * @q: The &struct request_queue in question
214 * @msecs: Delay in msecs
225 * 215 *
226 * Description: 216 * Description:
227 * Like @blk_plug_device(), but grabs the queue lock and disables 217 * Sometimes queueing needs to be postponed for a little while, to allow
228 * interrupts. 218 * resources to come back. This function will make sure that queueing is
229 **/ 219 * restarted around the specified time.
230void blk_plug_device_unlocked(struct request_queue *q)
231{
232 unsigned long flags;
233
234 spin_lock_irqsave(q->queue_lock, flags);
235 blk_plug_device(q);
236 spin_unlock_irqrestore(q->queue_lock, flags);
237}
238EXPORT_SYMBOL(blk_plug_device_unlocked);
239
240/*
241 * remove the queue from the plugged list, if present. called with
242 * queue lock held and interrupts disabled.
243 */
244int blk_remove_plug(struct request_queue *q)
245{
246 WARN_ON(!irqs_disabled());
247
248 if (!queue_flag_test_and_clear(QUEUE_FLAG_PLUGGED, q))
249 return 0;
250
251 del_timer(&q->unplug_timer);
252 return 1;
253}
254EXPORT_SYMBOL(blk_remove_plug);
255
256/*
257 * remove the plug and let it rip..
258 */ 220 */
259void __generic_unplug_device(struct request_queue *q) 221void blk_delay_queue(struct request_queue *q, unsigned long msecs)
260{ 222{
261 if (unlikely(blk_queue_stopped(q))) 223 queue_delayed_work(kblockd_workqueue, &q->delay_work,
262 return; 224 msecs_to_jiffies(msecs));
263 if (!blk_remove_plug(q) && !blk_queue_nonrot(q))
264 return;
265
266 q->request_fn(q);
267} 225}
268 226EXPORT_SYMBOL(blk_delay_queue);
269/**
270 * generic_unplug_device - fire a request queue
271 * @q: The &struct request_queue in question
272 *
273 * Description:
274 * Linux uses plugging to build bigger requests queues before letting
275 * the device have at them. If a queue is plugged, the I/O scheduler
276 * is still adding and merging requests on the queue. Once the queue
277 * gets unplugged, the request_fn defined for the queue is invoked and
278 * transfers started.
279 **/
280void generic_unplug_device(struct request_queue *q)
281{
282 if (blk_queue_plugged(q)) {
283 spin_lock_irq(q->queue_lock);
284 __generic_unplug_device(q);
285 spin_unlock_irq(q->queue_lock);
286 }
287}
288EXPORT_SYMBOL(generic_unplug_device);
289
290static void blk_backing_dev_unplug(struct backing_dev_info *bdi,
291 struct page *page)
292{
293 struct request_queue *q = bdi->unplug_io_data;
294
295 blk_unplug(q);
296}
297
298void blk_unplug_work(struct work_struct *work)
299{
300 struct request_queue *q =
301 container_of(work, struct request_queue, unplug_work);
302
303 trace_block_unplug_io(q);
304 q->unplug_fn(q);
305}
306
307void blk_unplug_timeout(unsigned long data)
308{
309 struct request_queue *q = (struct request_queue *)data;
310
311 trace_block_unplug_timer(q);
312 kblockd_schedule_work(q, &q->unplug_work);
313}
314
315void blk_unplug(struct request_queue *q)
316{
317 /*
318 * devices don't necessarily have an ->unplug_fn defined
319 */
320 if (q->unplug_fn) {
321 trace_block_unplug_io(q);
322 q->unplug_fn(q);
323 }
324}
325EXPORT_SYMBOL(blk_unplug);
326 227
327/** 228/**
328 * blk_start_queue - restart a previously stopped queue 229 * blk_start_queue - restart a previously stopped queue
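
blk_delay_queue() replaces the old unplug timer as the mechanism a driver uses to back off when it temporarily cannot dispatch. A hedged sketch of how a request_fn might use it when the hardware is out of resources; example_hw_slot_available() is a placeholder for a driver-specific check, and the "dispatch" is faked by completing the request:

    static bool example_hw_slot_available(void)
    {
            return false;   /* placeholder: pretend the hardware queue is full */
    }

    static void example_request_fn(struct request_queue *q)
    {
            struct request *rq;

            while ((rq = blk_fetch_request(q)) != NULL) {
                    if (!example_hw_slot_available()) {
                            /* Put the request back and let kblockd retry shortly. */
                            blk_requeue_request(q, rq);
                            blk_delay_queue(q, 3);          /* msecs */
                            break;
                    }
                    __blk_end_request_all(rq, 0);           /* placeholder "dispatch" */
            }
    }
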
@@ -358,7 +259,7 @@ EXPORT_SYMBOL(blk_start_queue);
358 **/ 259 **/
359void blk_stop_queue(struct request_queue *q) 260void blk_stop_queue(struct request_queue *q)
360{ 261{
361 blk_remove_plug(q); 262 __cancel_delayed_work(&q->delay_work);
362 queue_flag_set(QUEUE_FLAG_STOPPED, q); 263 queue_flag_set(QUEUE_FLAG_STOPPED, q);
363} 264}
364EXPORT_SYMBOL(blk_stop_queue); 265EXPORT_SYMBOL(blk_stop_queue);
@@ -376,12 +277,15 @@ EXPORT_SYMBOL(blk_stop_queue);
376 * that its ->make_request_fn will not re-add plugging prior to calling 277 * that its ->make_request_fn will not re-add plugging prior to calling
377 * this function. 278 * this function.
378 * 279 *
280 * This function does not cancel any asynchronous activity arising
 281 * out of elevator or throttling code. That would require elevator_exit()
282 * and blk_throtl_exit() to be called with queue lock initialized.
283 *
379 */ 284 */
380void blk_sync_queue(struct request_queue *q) 285void blk_sync_queue(struct request_queue *q)
381{ 286{
382 del_timer_sync(&q->unplug_timer);
383 del_timer_sync(&q->timeout); 287 del_timer_sync(&q->timeout);
384 cancel_work_sync(&q->unplug_work); 288 cancel_delayed_work_sync(&q->delay_work);
385} 289}
386EXPORT_SYMBOL(blk_sync_queue); 290EXPORT_SYMBOL(blk_sync_queue);
387 291
@@ -392,31 +296,32 @@ EXPORT_SYMBOL(blk_sync_queue);
392 * Description: 296 * Description:
393 * See @blk_run_queue. This variant must be called with the queue lock 297 * See @blk_run_queue. This variant must be called with the queue lock
394 * held and interrupts disabled. 298 * held and interrupts disabled.
395 *
396 */ 299 */
397void __blk_run_queue(struct request_queue *q) 300void __blk_run_queue(struct request_queue *q)
398{ 301{
399 blk_remove_plug(q);
400
401 if (unlikely(blk_queue_stopped(q))) 302 if (unlikely(blk_queue_stopped(q)))
402 return; 303 return;
403 304
404 if (elv_queue_empty(q)) 305 q->request_fn(q);
405 return; 306}
307EXPORT_SYMBOL(__blk_run_queue);
406 308
407 /* 309/**
408 * Only recurse once to avoid overrunning the stack, let the unplug 310 * blk_run_queue_async - run a single device queue in workqueue context
409 * handling reinvoke the handler shortly if we already got there. 311 * @q: The queue to run
410 */ 312 *
411 if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { 313 * Description:
412 q->request_fn(q); 314 * Tells kblockd to perform the equivalent of @blk_run_queue on behalf
413 queue_flag_clear(QUEUE_FLAG_REENTER, q); 315 * of us.
414 } else { 316 */
415 queue_flag_set(QUEUE_FLAG_PLUGGED, q); 317void blk_run_queue_async(struct request_queue *q)
416 kblockd_schedule_work(q, &q->unplug_work); 318{
319 if (likely(!blk_queue_stopped(q))) {
320 __cancel_delayed_work(&q->delay_work);
321 queue_delayed_work(kblockd_workqueue, &q->delay_work, 0);
417 } 322 }
418} 323}
419EXPORT_SYMBOL(__blk_run_queue); 324EXPORT_SYMBOL(blk_run_queue_async);
420 325
421/** 326/**
422 * blk_run_queue - run a single device queue 327 * blk_run_queue - run a single device queue
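
blk_run_queue_async() above punts the queue run to kblockd through the new delay_work (queued with zero delay), which helps in completion or atomic paths where calling ->request_fn directly could recurse or eat stack. A rough usage sketch; the locking shown follows the usual convention of holding the queue lock, which this hunk itself does not spell out:

    static void example_completion_handler(struct request_queue *q)
    {
            unsigned long flags;

            spin_lock_irqsave(q->queue_lock, flags);
            /* ... finish the completed request, free the hardware slot ... */
            blk_run_queue_async(q);         /* kblockd calls ->request_fn for us */
            spin_unlock_irqrestore(q->queue_lock, flags);
    }
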
@@ -440,7 +345,13 @@ void blk_put_queue(struct request_queue *q)
440{ 345{
441 kobject_put(&q->kobj); 346 kobject_put(&q->kobj);
442} 347}
348EXPORT_SYMBOL(blk_put_queue);
443 349
350/*
351 * Note: If a driver supplied the queue lock, it should not zap that lock
352 * unexpectedly as some queue cleanup components like elevator_exit() and
353 * blk_throtl_exit() need queue lock.
354 */
444void blk_cleanup_queue(struct request_queue *q) 355void blk_cleanup_queue(struct request_queue *q)
445{ 356{
446 /* 357 /*
@@ -459,6 +370,8 @@ void blk_cleanup_queue(struct request_queue *q)
459 if (q->elevator) 370 if (q->elevator)
460 elevator_exit(q->elevator); 371 elevator_exit(q->elevator);
461 372
373 blk_throtl_exit(q);
374
462 blk_put_queue(q); 375 blk_put_queue(q);
463} 376}
464EXPORT_SYMBOL(blk_cleanup_queue); 377EXPORT_SYMBOL(blk_cleanup_queue);
@@ -501,8 +414,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
501 if (!q) 414 if (!q)
502 return NULL; 415 return NULL;
503 416
504 q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
505 q->backing_dev_info.unplug_io_data = q;
506 q->backing_dev_info.ra_pages = 417 q->backing_dev_info.ra_pages =
507 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 418 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
508 q->backing_dev_info.state = 0; 419 q->backing_dev_info.state = 0;
@@ -515,18 +426,31 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
515 return NULL; 426 return NULL;
516 } 427 }
517 428
429 if (blk_throtl_init(q)) {
430 kmem_cache_free(blk_requestq_cachep, q);
431 return NULL;
432 }
433
518 setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, 434 setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
519 laptop_mode_timer_fn, (unsigned long) q); 435 laptop_mode_timer_fn, (unsigned long) q);
520 init_timer(&q->unplug_timer);
521 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); 436 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
522 INIT_LIST_HEAD(&q->timeout_list); 437 INIT_LIST_HEAD(&q->timeout_list);
523 INIT_WORK(&q->unplug_work, blk_unplug_work); 438 INIT_LIST_HEAD(&q->flush_queue[0]);
439 INIT_LIST_HEAD(&q->flush_queue[1]);
440 INIT_LIST_HEAD(&q->flush_data_in_flight);
441 INIT_DELAYED_WORK(&q->delay_work, blk_delay_work);
524 442
525 kobject_init(&q->kobj, &blk_queue_ktype); 443 kobject_init(&q->kobj, &blk_queue_ktype);
526 444
527 mutex_init(&q->sysfs_lock); 445 mutex_init(&q->sysfs_lock);
528 spin_lock_init(&q->__queue_lock); 446 spin_lock_init(&q->__queue_lock);
529 447
448 /*
449 * By default initialize queue_lock to internal lock and driver can
450 * override it later if need be.
451 */
452 q->queue_lock = &q->__queue_lock;
453
530 return q; 454 return q;
531} 455}
532EXPORT_SYMBOL(blk_alloc_queue_node); 456EXPORT_SYMBOL(blk_alloc_queue_node);
@@ -609,9 +533,11 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
609 q->request_fn = rfn; 533 q->request_fn = rfn;
610 q->prep_rq_fn = NULL; 534 q->prep_rq_fn = NULL;
611 q->unprep_rq_fn = NULL; 535 q->unprep_rq_fn = NULL;
612 q->unplug_fn = generic_unplug_device;
613 q->queue_flags = QUEUE_FLAG_DEFAULT; 536 q->queue_flags = QUEUE_FLAG_DEFAULT;
614 q->queue_lock = lock; 537
538 /* Override internal queue lock with supplied lock pointer */
539 if (lock)
540 q->queue_lock = lock;
615 541
616 /* 542 /*
617 * This also sets hw/phys segments, boundary and size 543 * This also sets hw/phys segments, boundary and size
@@ -641,6 +567,7 @@ int blk_get_queue(struct request_queue *q)
641 567
642 return 1; 568 return 1;
643} 569}
570EXPORT_SYMBOL(blk_get_queue);
644 571
645static inline void blk_free_request(struct request_queue *q, struct request *rq) 572static inline void blk_free_request(struct request_queue *q, struct request *rq)
646{ 573{
@@ -740,6 +667,25 @@ static void freed_request(struct request_queue *q, int sync, int priv)
740} 667}
741 668
742/* 669/*
670 * Determine if elevator data should be initialized when allocating the
671 * request associated with @bio.
672 */
673static bool blk_rq_should_init_elevator(struct bio *bio)
674{
675 if (!bio)
676 return true;
677
678 /*
679 * Flush requests do not use the elevator so skip initialization.
680 * This allows a request to share the flush and elevator data.
681 */
682 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA))
683 return false;
684
685 return true;
686}
687
688/*
743 * Get a free request, queue_lock must be held. 689 * Get a free request, queue_lock must be held.
744 * Returns NULL on failure, with queue_lock held. 690 * Returns NULL on failure, with queue_lock held.
745 * Returns !NULL on success, with queue_lock *not held*. 691 * Returns !NULL on success, with queue_lock *not held*.
@@ -751,7 +697,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
751 struct request_list *rl = &q->rq; 697 struct request_list *rl = &q->rq;
752 struct io_context *ioc = NULL; 698 struct io_context *ioc = NULL;
753 const bool is_sync = rw_is_sync(rw_flags) != 0; 699 const bool is_sync = rw_is_sync(rw_flags) != 0;
754 int may_queue, priv; 700 int may_queue, priv = 0;
755 701
756 may_queue = elv_may_queue(q, rw_flags); 702 may_queue = elv_may_queue(q, rw_flags);
757 if (may_queue == ELV_MQUEUE_NO) 703 if (may_queue == ELV_MQUEUE_NO)
@@ -795,9 +741,11 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
795 rl->count[is_sync]++; 741 rl->count[is_sync]++;
796 rl->starved[is_sync] = 0; 742 rl->starved[is_sync] = 0;
797 743
798 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); 744 if (blk_rq_should_init_elevator(bio)) {
799 if (priv) 745 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
800 rl->elvpriv++; 746 if (priv)
747 rl->elvpriv++;
748 }
801 749
802 if (blk_queue_io_stat(q)) 750 if (blk_queue_io_stat(q))
803 rw_flags |= REQ_IO_STAT; 751 rw_flags |= REQ_IO_STAT;
@@ -844,8 +792,8 @@ out:
844} 792}
845 793
846/* 794/*
847 * No available requests for this queue, unplug the device and wait for some 795 * No available requests for this queue, wait for some requests to become
848 * requests to become available. 796 * available.
849 * 797 *
850 * Called with q->queue_lock held, and returns with it unlocked. 798 * Called with q->queue_lock held, and returns with it unlocked.
851 */ 799 */
@@ -866,7 +814,6 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags,
866 814
867 trace_block_sleeprq(q, bio, rw_flags & 1); 815 trace_block_sleeprq(q, bio, rw_flags & 1);
868 816
869 __generic_unplug_device(q);
870 spin_unlock_irq(q->queue_lock); 817 spin_unlock_irq(q->queue_lock);
871 io_schedule(); 818 io_schedule();
872 819
@@ -988,6 +935,13 @@ void blk_requeue_request(struct request_queue *q, struct request *rq)
988} 935}
989EXPORT_SYMBOL(blk_requeue_request); 936EXPORT_SYMBOL(blk_requeue_request);
990 937
938static void add_acct_request(struct request_queue *q, struct request *rq,
939 int where)
940{
941 drive_stat_acct(rq, 1);
942 __elv_add_request(q, rq, where);
943}
944
991/** 945/**
992 * blk_insert_request - insert a special request into a request queue 946 * blk_insert_request - insert a special request into a request queue
993 * @q: request queue where request should be inserted 947 * @q: request queue where request should be inserted
@@ -1030,29 +984,12 @@ void blk_insert_request(struct request_queue *q, struct request *rq,
1030 if (blk_rq_tagged(rq)) 984 if (blk_rq_tagged(rq))
1031 blk_queue_end_tag(q, rq); 985 blk_queue_end_tag(q, rq);
1032 986
1033 drive_stat_acct(rq, 1); 987 add_acct_request(q, rq, where);
1034 __elv_add_request(q, rq, where, 0);
1035 __blk_run_queue(q); 988 __blk_run_queue(q);
1036 spin_unlock_irqrestore(q->queue_lock, flags); 989 spin_unlock_irqrestore(q->queue_lock, flags);
1037} 990}
1038EXPORT_SYMBOL(blk_insert_request); 991EXPORT_SYMBOL(blk_insert_request);
1039 992
1040/*
1041 * add-request adds a request to the linked list.
1042 * queue lock is held and interrupts disabled, as we muck with the
1043 * request queue list.
1044 */
1045static inline void add_request(struct request_queue *q, struct request *req)
1046{
1047 drive_stat_acct(req, 1);
1048
1049 /*
1050 * elevator indicated where it wants this request to be
1051 * inserted at elevator_merge time
1052 */
1053 __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
1054}
1055
1056static void part_round_stats_single(int cpu, struct hd_struct *part, 993static void part_round_stats_single(int cpu, struct hd_struct *part,
1057 unsigned long now) 994 unsigned long now)
1058{ 995{
@@ -1168,6 +1105,96 @@ void blk_add_request_payload(struct request *rq, struct page *page,
1168} 1105}
1169EXPORT_SYMBOL_GPL(blk_add_request_payload); 1106EXPORT_SYMBOL_GPL(blk_add_request_payload);
1170 1107
1108static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
1109 struct bio *bio)
1110{
1111 const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
1112
1113 if (!ll_back_merge_fn(q, req, bio))
1114 return false;
1115
1116 trace_block_bio_backmerge(q, bio);
1117
1118 if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
1119 blk_rq_set_mixed_merge(req);
1120
1121 req->biotail->bi_next = bio;
1122 req->biotail = bio;
1123 req->__data_len += bio->bi_size;
1124 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
1125
1126 drive_stat_acct(req, 0);
1127 elv_bio_merged(q, req, bio);
1128 return true;
1129}
1130
1131static bool bio_attempt_front_merge(struct request_queue *q,
1132 struct request *req, struct bio *bio)
1133{
1134 const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
1135
1136 if (!ll_front_merge_fn(q, req, bio))
1137 return false;
1138
1139 trace_block_bio_frontmerge(q, bio);
1140
1141 if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
1142 blk_rq_set_mixed_merge(req);
1143
1144 bio->bi_next = req->bio;
1145 req->bio = bio;
1146
1147 /*
1148 * may not be valid. if the low level driver said
1149 * it didn't need a bounce buffer then it better
1150 * not touch req->buffer either...
1151 */
1152 req->buffer = bio_data(bio);
1153 req->__sector = bio->bi_sector;
1154 req->__data_len += bio->bi_size;
1155 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
1156
1157 drive_stat_acct(req, 0);
1158 elv_bio_merged(q, req, bio);
1159 return true;
1160}
1161
1162/*
1163 * Attempts to merge with the plugged list in the current process. Returns
1164 * true if merge was successful, otherwise false.
1165 */
1166static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q,
1167 struct bio *bio)
1168{
1169 struct blk_plug *plug;
1170 struct request *rq;
1171 bool ret = false;
1172
1173 plug = tsk->plug;
1174 if (!plug)
1175 goto out;
1176
1177 list_for_each_entry_reverse(rq, &plug->list, queuelist) {
1178 int el_ret;
1179
1180 if (rq->q != q)
1181 continue;
1182
1183 el_ret = elv_try_merge(rq, bio);
1184 if (el_ret == ELEVATOR_BACK_MERGE) {
1185 ret = bio_attempt_back_merge(q, rq, bio);
1186 if (ret)
1187 break;
1188 } else if (el_ret == ELEVATOR_FRONT_MERGE) {
1189 ret = bio_attempt_front_merge(q, rq, bio);
1190 if (ret)
1191 break;
1192 }
1193 }
1194out:
1195 return ret;
1196}
1197
1171void init_request_from_bio(struct request *req, struct bio *bio) 1198void init_request_from_bio(struct request *req, struct bio *bio)
1172{ 1199{
1173 req->cpu = bio->bi_comp_cpu; 1200 req->cpu = bio->bi_comp_cpu;
@@ -1183,31 +1210,13 @@ void init_request_from_bio(struct request *req, struct bio *bio)
1183 blk_rq_bio_prep(req->q, req, bio); 1210 blk_rq_bio_prep(req->q, req, bio);
1184} 1211}
1185 1212
1186/*
1187 * Only disabling plugging for non-rotational devices if it does tagging
1188 * as well, otherwise we do need the proper merging
1189 */
1190static inline bool queue_should_plug(struct request_queue *q)
1191{
1192 return !(blk_queue_nonrot(q) && blk_queue_tagged(q));
1193}
1194
1195static int __make_request(struct request_queue *q, struct bio *bio) 1213static int __make_request(struct request_queue *q, struct bio *bio)
1196{ 1214{
1197 struct request *req;
1198 int el_ret;
1199 unsigned int bytes = bio->bi_size;
1200 const unsigned short prio = bio_prio(bio);
1201 const bool sync = !!(bio->bi_rw & REQ_SYNC); 1215 const bool sync = !!(bio->bi_rw & REQ_SYNC);
1202 const bool unplug = !!(bio->bi_rw & REQ_UNPLUG); 1216 struct blk_plug *plug;
1203 const unsigned long ff = bio->bi_rw & REQ_FAILFAST_MASK; 1217 int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
1204 int rw_flags; 1218 struct request *req;
1205 1219
1206 if ((bio->bi_rw & REQ_HARDBARRIER) &&
1207 (q->next_ordered == QUEUE_ORDERED_NONE)) {
1208 bio_endio(bio, -EOPNOTSUPP);
1209 return 0;
1210 }
1211 /* 1220 /*
1212 * low level driver can indicate that it wants pages above a 1221 * low level driver can indicate that it wants pages above a
1213 * certain limit bounced to low memory (ie for highmem, or even 1222 * certain limit bounced to low memory (ie for highmem, or even
@@ -1215,73 +1224,34 @@ static int __make_request(struct request_queue *q, struct bio *bio)
1215 */ 1224 */
1216 blk_queue_bounce(q, &bio); 1225 blk_queue_bounce(q, &bio);
1217 1226
1218 spin_lock_irq(q->queue_lock); 1227 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
1219 1228 spin_lock_irq(q->queue_lock);
1220 if (unlikely((bio->bi_rw & REQ_HARDBARRIER)) || elv_queue_empty(q)) 1229 where = ELEVATOR_INSERT_FLUSH;
1221 goto get_rq; 1230 goto get_rq;
1231 }
1222 1232
1223 el_ret = elv_merge(q, &req, bio); 1233 /*
1224 switch (el_ret) { 1234 * Check if we can merge with the plugged list before grabbing
1225 case ELEVATOR_BACK_MERGE: 1235 * any locks.
1226 BUG_ON(!rq_mergeable(req)); 1236 */
1227 1237 if (attempt_plug_merge(current, q, bio))
1228 if (!ll_back_merge_fn(q, req, bio))
1229 break;
1230
1231 trace_block_bio_backmerge(q, bio);
1232
1233 if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
1234 blk_rq_set_mixed_merge(req);
1235
1236 req->biotail->bi_next = bio;
1237 req->biotail = bio;
1238 req->__data_len += bytes;
1239 req->ioprio = ioprio_best(req->ioprio, prio);
1240 if (!blk_rq_cpu_valid(req))
1241 req->cpu = bio->bi_comp_cpu;
1242 drive_stat_acct(req, 0);
1243 elv_bio_merged(q, req, bio);
1244 if (!attempt_back_merge(q, req))
1245 elv_merged_request(q, req, el_ret);
1246 goto out; 1238 goto out;
1247 1239
1248 case ELEVATOR_FRONT_MERGE: 1240 spin_lock_irq(q->queue_lock);
1249 BUG_ON(!rq_mergeable(req));
1250
1251 if (!ll_front_merge_fn(q, req, bio))
1252 break;
1253
1254 trace_block_bio_frontmerge(q, bio);
1255 1241
1256 if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) { 1242 el_ret = elv_merge(q, &req, bio);
1257 blk_rq_set_mixed_merge(req); 1243 if (el_ret == ELEVATOR_BACK_MERGE) {
1258 req->cmd_flags &= ~REQ_FAILFAST_MASK; 1244 if (bio_attempt_back_merge(q, req, bio)) {
1259 req->cmd_flags |= ff; 1245 if (!attempt_back_merge(q, req))
1246 elv_merged_request(q, req, el_ret);
1247 goto out_unlock;
1248 }
1249 } else if (el_ret == ELEVATOR_FRONT_MERGE) {
1250 if (bio_attempt_front_merge(q, req, bio)) {
1251 if (!attempt_front_merge(q, req))
1252 elv_merged_request(q, req, el_ret);
1253 goto out_unlock;
1260 } 1254 }
1261
1262 bio->bi_next = req->bio;
1263 req->bio = bio;
1264
1265 /*
1266 * may not be valid. if the low level driver said
1267 * it didn't need a bounce buffer then it better
1268 * not touch req->buffer either...
1269 */
1270 req->buffer = bio_data(bio);
1271 req->__sector = bio->bi_sector;
1272 req->__data_len += bytes;
1273 req->ioprio = ioprio_best(req->ioprio, prio);
1274 if (!blk_rq_cpu_valid(req))
1275 req->cpu = bio->bi_comp_cpu;
1276 drive_stat_acct(req, 0);
1277 elv_bio_merged(q, req, bio);
1278 if (!attempt_front_merge(q, req))
1279 elv_merged_request(q, req, el_ret);
1280 goto out;
1281
1282 /* ELV_NO_MERGE: elevator says don't/can't merge. */
1283 default:
1284 ;
1285 } 1255 }
1286 1256
1287get_rq: 1257get_rq:
@@ -1308,17 +1278,39 @@ get_rq:
1308 */ 1278 */
1309 init_request_from_bio(req, bio); 1279 init_request_from_bio(req, bio);
1310 1280
1311 spin_lock_irq(q->queue_lock);
1312 if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || 1281 if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
1313 bio_flagged(bio, BIO_CPU_AFFINE)) 1282 bio_flagged(bio, BIO_CPU_AFFINE)) {
1314 req->cpu = blk_cpu_to_group(smp_processor_id()); 1283 req->cpu = blk_cpu_to_group(get_cpu());
1315 if (queue_should_plug(q) && elv_queue_empty(q)) 1284 put_cpu();
1316 blk_plug_device(q); 1285 }
1317 add_request(q, req); 1286
1287 plug = current->plug;
1288 if (plug) {
1289 /*
1290 * If this is the first request added after a plug, fire
1291 * of a plug trace. If others have been added before, check
1292 * if we have multiple devices in this plug. If so, make a
1293 * note to sort the list before dispatch.
1294 */
1295 if (list_empty(&plug->list))
1296 trace_block_plug(q);
1297 else if (!plug->should_sort) {
1298 struct request *__rq;
1299
1300 __rq = list_entry_rq(plug->list.prev);
1301 if (__rq->q != q)
1302 plug->should_sort = 1;
1303 }
1304 list_add_tail(&req->queuelist, &plug->list);
1305 drive_stat_acct(req, 1);
1306 } else {
1307 spin_lock_irq(q->queue_lock);
1308 add_acct_request(q, req, where);
1309 __blk_run_queue(q);
1310out_unlock:
1311 spin_unlock_irq(q->queue_lock);
1312 }
1318out: 1313out:
1319 if (unplug || !queue_should_plug(q))
1320 __generic_unplug_device(q);
1321 spin_unlock_irq(q->queue_lock);
1322 return 0; 1314 return 0;
1323} 1315}
1324 1316
@@ -1335,9 +1327,9 @@ static inline void blk_partition_remap(struct bio *bio)
1335 bio->bi_sector += p->start_sect; 1327 bio->bi_sector += p->start_sect;
1336 bio->bi_bdev = bdev->bd_contains; 1328 bio->bi_bdev = bdev->bd_contains;
1337 1329
1338 trace_block_remap(bdev_get_queue(bio->bi_bdev), bio, 1330 trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), bio,
1339 bdev->bd_dev, 1331 bdev->bd_dev,
1340 bio->bi_sector - p->start_sect); 1332 bio->bi_sector - p->start_sect);
1341 } 1333 }
1342} 1334}
1343 1335
@@ -1350,7 +1342,7 @@ static void handle_bad_sector(struct bio *bio)
1350 bdevname(bio->bi_bdev, b), 1342 bdevname(bio->bi_bdev, b),
1351 bio->bi_rw, 1343 bio->bi_rw,
1352 (unsigned long long)bio->bi_sector + bio_sectors(bio), 1344 (unsigned long long)bio->bi_sector + bio_sectors(bio),
1353 (long long)(bio->bi_bdev->bd_inode->i_size >> 9)); 1345 (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9));
1354 1346
1355 set_bit(BIO_EOF, &bio->bi_flags); 1347 set_bit(BIO_EOF, &bio->bi_flags);
1356} 1348}
@@ -1403,7 +1395,7 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
1403 return 0; 1395 return 0;
1404 1396
1405 /* Test device or partition size, when known. */ 1397 /* Test device or partition size, when known. */
1406 maxsector = bio->bi_bdev->bd_inode->i_size >> 9; 1398 maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
1407 if (maxsector) { 1399 if (maxsector) {
1408 sector_t sector = bio->bi_sector; 1400 sector_t sector = bio->bi_sector;
1409 1401
@@ -1506,7 +1498,7 @@ static inline void __generic_make_request(struct bio *bio)
1506 goto end_io; 1498 goto end_io;
1507 1499
1508 if (old_sector != -1) 1500 if (old_sector != -1)
1509 trace_block_remap(q, bio, old_dev, old_sector); 1501 trace_block_bio_remap(q, bio, old_dev, old_sector);
1510 1502
1511 old_sector = bio->bi_sector; 1503 old_sector = bio->bi_sector;
1512 old_dev = bio->bi_bdev->bd_dev; 1504 old_dev = bio->bi_bdev->bd_dev;
@@ -1514,6 +1506,19 @@ static inline void __generic_make_request(struct bio *bio)
1514 if (bio_check_eod(bio, nr_sectors)) 1506 if (bio_check_eod(bio, nr_sectors))
1515 goto end_io; 1507 goto end_io;
1516 1508
1509 /*
1510 * Filter flush bio's early so that make_request based
1511 * drivers without flush support don't have to worry
1512 * about them.
1513 */
1514 if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
1515 bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
1516 if (!nr_sectors) {
1517 err = 0;
1518 goto end_io;
1519 }
1520 }
1521
1517 if ((bio->bi_rw & REQ_DISCARD) && 1522 if ((bio->bi_rw & REQ_DISCARD) &&
1518 (!blk_queue_discard(q) || 1523 (!blk_queue_discard(q) ||
1519 ((bio->bi_rw & REQ_SECURE) && 1524 ((bio->bi_rw & REQ_SECURE) &&
@@ -1522,6 +1527,16 @@ static inline void __generic_make_request(struct bio *bio)
1522 goto end_io; 1527 goto end_io;
1523 } 1528 }
1524 1529
1530 if (blk_throtl_bio(q, &bio))
1531 goto end_io;
1532
1533 /*
1534 * If bio = NULL, bio has been throttled and will be submitted
1535 * later.
1536 */
1537 if (!bio)
1538 break;
1539
1525 trace_block_bio_queue(q, bio); 1540 trace_block_bio_queue(q, bio);
1526 1541
1527 ret = q->make_request_fn(q, bio); 1542 ret = q->make_request_fn(q, bio);
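
The hook above gives blk_throtl_bio() a two-part contract: a non-zero return means the bio was consumed (for example errored) and submission stops, while a zero return with *bio set to NULL means the throttling code has queued the bio for later dispatch. When CONFIG_BLK_DEV_THROTTLING is disabled, the hook presumably collapses to no-op stubs roughly like the following sketch; the real stubs live in block/blk.h, which is not part of this hunk:

    #ifndef CONFIG_BLK_DEV_THROTTLING
    static inline int blk_throtl_bio(struct request_queue *q, struct bio **bio)
    {
            return 0;       /* never throttle, *bio is left untouched */
    }
    static inline int blk_throtl_init(struct request_queue *q) { return 0; }
    static inline void blk_throtl_exit(struct request_queue *q) { }
    #endif
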
@@ -1612,11 +1627,12 @@ void submit_bio(int rw, struct bio *bio)
1612 1627
1613 if (unlikely(block_dump)) { 1628 if (unlikely(block_dump)) {
1614 char b[BDEVNAME_SIZE]; 1629 char b[BDEVNAME_SIZE];
1615 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n", 1630 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
1616 current->comm, task_pid_nr(current), 1631 current->comm, task_pid_nr(current),
1617 (rw & WRITE) ? "WRITE" : "READ", 1632 (rw & WRITE) ? "WRITE" : "READ",
1618 (unsigned long long)bio->bi_sector, 1633 (unsigned long long)bio->bi_sector,
1619 bdevname(bio->bi_bdev, b)); 1634 bdevname(bio->bi_bdev, b),
1635 count);
1620 } 1636 }
1621 } 1637 }
1622 1638
@@ -1637,7 +1653,7 @@ EXPORT_SYMBOL(submit_bio);
1637 * the insertion using this generic function. 1653 * the insertion using this generic function.
1638 * 1654 *
1639 * This function should also be useful for request stacking drivers 1655 * This function should also be useful for request stacking drivers
1640 * in some cases below, so export this fuction. 1656 * in some cases below, so export this function.
1641 * Request stacking drivers like request-based dm may change the queue 1657 * Request stacking drivers like request-based dm may change the queue
1642 * limits while requests are in the queue (e.g. dm's table swapping). 1658 * limits while requests are in the queue (e.g. dm's table swapping).
 1643 * Such request stacking drivers should check those requests against 1659
@@ -1698,9 +1714,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
1698 */ 1714 */
1699 BUG_ON(blk_queued_rq(rq)); 1715 BUG_ON(blk_queued_rq(rq));
1700 1716
1701 drive_stat_acct(rq, 1); 1717 add_acct_request(q, rq, ELEVATOR_INSERT_BACK);
1702 __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0);
1703
1704 spin_unlock_irqrestore(q->queue_lock, flags); 1718 spin_unlock_irqrestore(q->queue_lock, flags);
1705 1719
1706 return 0; 1720 return 0;
@@ -1759,7 +1773,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
1759 int cpu; 1773 int cpu;
1760 1774
1761 cpu = part_stat_lock(); 1775 cpu = part_stat_lock();
1762 part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); 1776 part = req->part;
1763 part_stat_add(cpu, part, sectors[rw], bytes >> 9); 1777 part_stat_add(cpu, part, sectors[rw], bytes >> 9);
1764 part_stat_unlock(); 1778 part_stat_unlock();
1765 } 1779 }
@@ -1768,24 +1782,25 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
1768static void blk_account_io_done(struct request *req) 1782static void blk_account_io_done(struct request *req)
1769{ 1783{
1770 /* 1784 /*
1771 * Account IO completion. bar_rq isn't accounted as a normal 1785 * Account IO completion. flush_rq isn't accounted as a
1772 * IO on queueing nor completion. Accounting the containing 1786 * normal IO on queueing nor completion. Accounting the
1773 * request is enough. 1787 * containing request is enough.
1774 */ 1788 */
1775 if (blk_do_io_stat(req) && req != &req->q->bar_rq) { 1789 if (blk_do_io_stat(req) && !(req->cmd_flags & REQ_FLUSH_SEQ)) {
1776 unsigned long duration = jiffies - req->start_time; 1790 unsigned long duration = jiffies - req->start_time;
1777 const int rw = rq_data_dir(req); 1791 const int rw = rq_data_dir(req);
1778 struct hd_struct *part; 1792 struct hd_struct *part;
1779 int cpu; 1793 int cpu;
1780 1794
1781 cpu = part_stat_lock(); 1795 cpu = part_stat_lock();
1782 part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); 1796 part = req->part;
1783 1797
1784 part_stat_inc(cpu, part, ios[rw]); 1798 part_stat_inc(cpu, part, ios[rw]);
1785 part_stat_add(cpu, part, ticks[rw], duration); 1799 part_stat_add(cpu, part, ticks[rw], duration);
1786 part_round_stats(cpu, part); 1800 part_round_stats(cpu, part);
1787 part_dec_in_flight(part, rw); 1801 part_dec_in_flight(part, rw);
1788 1802
1803 hd_struct_put(part);
1789 part_stat_unlock(); 1804 part_stat_unlock();
1790 } 1805 }
1791} 1806}
@@ -2011,9 +2026,26 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
2011 2026
2012 if (error && req->cmd_type == REQ_TYPE_FS && 2027 if (error && req->cmd_type == REQ_TYPE_FS &&
2013 !(req->cmd_flags & REQ_QUIET)) { 2028 !(req->cmd_flags & REQ_QUIET)) {
2014 printk(KERN_ERR "end_request: I/O error, dev %s, sector %llu\n", 2029 char *error_type;
2015 req->rq_disk ? req->rq_disk->disk_name : "?", 2030
2016 (unsigned long long)blk_rq_pos(req)); 2031 switch (error) {
2032 case -ENOLINK:
2033 error_type = "recoverable transport";
2034 break;
2035 case -EREMOTEIO:
2036 error_type = "critical target";
2037 break;
2038 case -EBADE:
2039 error_type = "critical nexus";
2040 break;
2041 case -EIO:
2042 default:
2043 error_type = "I/O";
2044 break;
2045 }
2046 printk(KERN_ERR "end_request: %s error, dev %s, sector %llu\n",
2047 error_type, req->rq_disk ? req->rq_disk->disk_name : "?",
2048 (unsigned long long)blk_rq_pos(req));
2017 } 2049 }
2018 2050
2019 blk_account_io_completion(req, nr_bytes); 2051 blk_account_io_completion(req, nr_bytes);
@@ -2111,7 +2143,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
2111 * size, something has gone terribly wrong. 2143 * size, something has gone terribly wrong.
2112 */ 2144 */
2113 if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { 2145 if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
2114 printk(KERN_ERR "blk: request botched\n"); 2146 blk_dump_rq_flags(req, "request botched");
2115 req->__data_len = blk_rq_cur_bytes(req); 2147 req->__data_len = blk_rq_cur_bytes(req);
2116 } 2148 }
2117 2149
@@ -2497,9 +2529,7 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
2497static void __blk_rq_prep_clone(struct request *dst, struct request *src) 2529static void __blk_rq_prep_clone(struct request *dst, struct request *src)
2498{ 2530{
2499 dst->cpu = src->cpu; 2531 dst->cpu = src->cpu;
2500 dst->cmd_flags = (rq_data_dir(src) | REQ_NOMERGE); 2532 dst->cmd_flags = (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE;
2501 if (src->cmd_flags & REQ_DISCARD)
2502 dst->cmd_flags |= REQ_DISCARD;
2503 dst->cmd_type = src->cmd_type; 2533 dst->cmd_type = src->cmd_type;
2504 dst->__sector = blk_rq_pos(src); 2534 dst->__sector = blk_rq_pos(src);
2505 dst->__data_len = blk_rq_bytes(src); 2535 dst->__data_len = blk_rq_bytes(src);
@@ -2579,12 +2609,171 @@ int kblockd_schedule_work(struct request_queue *q, struct work_struct *work)
2579} 2609}
2580EXPORT_SYMBOL(kblockd_schedule_work); 2610EXPORT_SYMBOL(kblockd_schedule_work);
2581 2611
2612int kblockd_schedule_delayed_work(struct request_queue *q,
2613 struct delayed_work *dwork, unsigned long delay)
2614{
2615 return queue_delayed_work(kblockd_workqueue, dwork, delay);
2616}
2617EXPORT_SYMBOL(kblockd_schedule_delayed_work);
2618
2619#define PLUG_MAGIC 0x91827364
2620
2621void blk_start_plug(struct blk_plug *plug)
2622{
2623 struct task_struct *tsk = current;
2624
2625 plug->magic = PLUG_MAGIC;
2626 INIT_LIST_HEAD(&plug->list);
2627 INIT_LIST_HEAD(&plug->cb_list);
2628 plug->should_sort = 0;
2629
2630 /*
2631 * If this is a nested plug, don't actually assign it. It will be
2632 * flushed on its own.
2633 */
2634 if (!tsk->plug) {
2635 /*
2636 * Store ordering should not be needed here, since a potential
2637 * preempt will imply a full memory barrier
2638 */
2639 tsk->plug = plug;
2640 }
2641}
2642EXPORT_SYMBOL(blk_start_plug);
2643
2644static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
2645{
2646 struct request *rqa = container_of(a, struct request, queuelist);
2647 struct request *rqb = container_of(b, struct request, queuelist);
2648
2649 return !(rqa->q <= rqb->q);
2650}
2651
2652/*
2653 * If 'from_schedule' is true, then postpone the dispatch of requests
 2654 * until a safe kblockd context. We do this to avoid accidental big
 2655 * additional stack usage in driver dispatch, in places where the original
2656 * plugger did not intend it.
2657 */
2658static void queue_unplugged(struct request_queue *q, unsigned int depth,
2659 bool from_schedule)
2660 __releases(q->queue_lock)
2661{
2662 trace_block_unplug(q, depth, !from_schedule);
2663
2664 /*
2665 * If we are punting this to kblockd, then we can safely drop
2666 * the queue_lock before waking kblockd (which needs to take
2667 * this lock).
2668 */
2669 if (from_schedule) {
2670 spin_unlock(q->queue_lock);
2671 blk_run_queue_async(q);
2672 } else {
2673 __blk_run_queue(q);
2674 spin_unlock(q->queue_lock);
2675 }
2676
2677}
2678
2679static void flush_plug_callbacks(struct blk_plug *plug)
2680{
2681 LIST_HEAD(callbacks);
2682
2683 if (list_empty(&plug->cb_list))
2684 return;
2685
2686 list_splice_init(&plug->cb_list, &callbacks);
2687
2688 while (!list_empty(&callbacks)) {
2689 struct blk_plug_cb *cb = list_first_entry(&callbacks,
2690 struct blk_plug_cb,
2691 list);
2692 list_del(&cb->list);
2693 cb->callback(cb);
2694 }
2695}
2696
2697void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
2698{
2699 struct request_queue *q;
2700 unsigned long flags;
2701 struct request *rq;
2702 LIST_HEAD(list);
2703 unsigned int depth;
2704
2705 BUG_ON(plug->magic != PLUG_MAGIC);
2706
2707 flush_plug_callbacks(plug);
2708 if (list_empty(&plug->list))
2709 return;
2710
2711 list_splice_init(&plug->list, &list);
2712
2713 if (plug->should_sort) {
2714 list_sort(NULL, &list, plug_rq_cmp);
2715 plug->should_sort = 0;
2716 }
2717
2718 q = NULL;
2719 depth = 0;
2720
2721 /*
2722 * Save and disable interrupts here, to avoid doing it for every
2723 * queue lock we have to take.
2724 */
2725 local_irq_save(flags);
2726 while (!list_empty(&list)) {
2727 rq = list_entry_rq(list.next);
2728 list_del_init(&rq->queuelist);
2729 BUG_ON(!rq->q);
2730 if (rq->q != q) {
2731 /*
2732 * This drops the queue lock
2733 */
2734 if (q)
2735 queue_unplugged(q, depth, from_schedule);
2736 q = rq->q;
2737 depth = 0;
2738 spin_lock(q->queue_lock);
2739 }
2740 /*
2741 * rq is already accounted, so use raw insert
2742 */
2743 if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA))
2744 __elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH);
2745 else
2746 __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE);
2747
2748 depth++;
2749 }
2750
2751 /*
2752 * This drops the queue lock
2753 */
2754 if (q)
2755 queue_unplugged(q, depth, from_schedule);
2756
2757 local_irq_restore(flags);
2758}
2759
2760void blk_finish_plug(struct blk_plug *plug)
2761{
2762 blk_flush_plug_list(plug, false);
2763
2764 if (plug == current->plug)
2765 current->plug = NULL;
2766}
2767EXPORT_SYMBOL(blk_finish_plug);
2768
2582int __init blk_dev_init(void) 2769int __init blk_dev_init(void)
2583{ 2770{
2584 BUILD_BUG_ON(__REQ_NR_BITS > 8 * 2771 BUILD_BUG_ON(__REQ_NR_BITS > 8 *
2585 sizeof(((struct request *)0)->cmd_flags)); 2772 sizeof(((struct request *)0)->cmd_flags));
2586 2773
2587 kblockd_workqueue = create_workqueue("kblockd"); 2774 /* used for unplugging and affects IO latency/throughput - HIGHPRI */
2775 kblockd_workqueue = alloc_workqueue("kblockd",
2776 WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
2588 if (!kblockd_workqueue) 2777 if (!kblockd_workqueue)
2589 panic("Failed to create kblockd\n"); 2778 panic("Failed to create kblockd\n");
2590 2779
diff --git a/block/blk-exec.c b/block/blk-exec.c
index e1672f14840e..8a0e7ec056e7 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -54,9 +54,9 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
54 rq->end_io = done; 54 rq->end_io = done;
55 WARN_ON(irqs_disabled()); 55 WARN_ON(irqs_disabled());
56 spin_lock_irq(q->queue_lock); 56 spin_lock_irq(q->queue_lock);
57 __elv_add_request(q, rq, where, 1); 57 __elv_add_request(q, rq, where);
58 __generic_unplug_device(q); 58 __blk_run_queue(q);
59 /* the queue is stopped so it won't be plugged+unplugged */ 59 /* the queue is stopped so it won't be run */
60 if (rq->cmd_type == REQ_TYPE_PM_RESUME) 60 if (rq->cmd_type == REQ_TYPE_PM_RESUME)
61 q->request_fn(q); 61 q->request_fn(q);
62 spin_unlock_irq(q->queue_lock); 62 spin_unlock_irq(q->queue_lock);
@@ -80,6 +80,7 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
80 DECLARE_COMPLETION_ONSTACK(wait); 80 DECLARE_COMPLETION_ONSTACK(wait);
81 char sense[SCSI_SENSE_BUFFERSIZE]; 81 char sense[SCSI_SENSE_BUFFERSIZE];
82 int err = 0; 82 int err = 0;
83 unsigned long hang_check;
83 84
84 /* 85 /*
85 * we need an extra reference to the request, so we can look at 86 * we need an extra reference to the request, so we can look at
@@ -95,7 +96,13 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
95 96
96 rq->end_io_data = &wait; 97 rq->end_io_data = &wait;
97 blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); 98 blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq);
98 wait_for_completion(&wait); 99
100 /* Prevent hang_check timer from firing at us during very long I/O */
101 hang_check = sysctl_hung_task_timeout_secs;
102 if (hang_check)
103 while (!wait_for_completion_timeout(&wait, hang_check * (HZ/2)));
104 else
105 wait_for_completion(&wait);
99 106
100 if (rq->errors) 107 if (rq->errors)
101 err = -EIO; 108 err = -EIO;
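The hang_check handling above matters for synchronous callers along the lines of the sketch below; it is illustrative only, and fill_pc_request() stands in for driver-specific command setup.

static int issue_pc_command(struct request_queue *q, struct gendisk *disk)
{
	struct request *rq;
	int err;

	rq = blk_get_request(q, WRITE, GFP_KERNEL);
	if (!rq)
		return -ENOMEM;

	rq->cmd_type = REQ_TYPE_BLOCK_PC;
	fill_pc_request(rq);			/* hypothetical: command bytes, buffers, timeout */

	err = blk_execute_rq(q, disk, rq, 0);	/* waits; the timeout loop keeps the hung-task watchdog quiet */
	blk_put_request(rq);
	return err;
}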
diff --git a/block/blk-flush.c b/block/blk-flush.c
new file mode 100644
index 000000000000..bb21e4c36f70
--- /dev/null
+++ b/block/blk-flush.c
@@ -0,0 +1,443 @@
1/*
2 * Functions to sequence FLUSH and FUA writes.
3 *
4 * Copyright (C) 2011 Max Planck Institute for Gravitational Physics
5 * Copyright (C) 2011 Tejun Heo <tj@kernel.org>
6 *
7 * This file is released under the GPLv2.
8 *
9 * REQ_{FLUSH|FUA} requests are decomposed into sequences consisting of three
10 * optional steps - PREFLUSH, DATA and POSTFLUSH - according to the request
11 * properties and hardware capability.
12 *
13 * If a request doesn't have data, only REQ_FLUSH makes sense, which
14 * indicates a simple flush request. If there is data, REQ_FLUSH indicates
15 * that the device cache should be flushed before the data is executed, and
16 * REQ_FUA means that the data must be on non-volatile media on request
17 * completion.
18 *
19 * If the device doesn't have writeback cache, FLUSH and FUA don't make any
20 * difference. The requests are either completed immediately if there's no
21 * data or executed as normal requests otherwise.
22 *
23 * If the device has writeback cache and supports FUA, REQ_FLUSH is
24 * translated to PREFLUSH but REQ_FUA is passed down directly with DATA.
25 *
26 * If the device has writeback cache and doesn't support FUA, REQ_FLUSH is
27 * translated to PREFLUSH and REQ_FUA to POSTFLUSH.
28 *
29 * The actual execution of flush is double buffered. Whenever a request
30 * needs to execute PRE or POSTFLUSH, it queues at
31 * q->flush_queue[q->flush_pending_idx]. Once certain criteria are met, a
32 * flush is issued and the pending_idx is toggled. When the flush
33 * completes, all the requests which were pending proceed to the next
34 * step. This allows arbitrary merging of different types of FLUSH/FUA
35 * requests.
36 *
37 * Currently, the following conditions are used to determine when to issue
38 * flush.
39 *
40 * C1. At any given time, only one flush shall be in progress. This makes
41 * double buffering sufficient.
42 *
43 * C2. Flush is deferred if any request is executing DATA of its sequence.
44 * This avoids issuing separate POSTFLUSHes for requests which shared
45 * PREFLUSH.
46 *
47 * C3. The second condition is ignored if there is a request which has
48 * waited longer than FLUSH_PENDING_TIMEOUT. This is to avoid
49 * starvation in the unlikely case where there is a continuous stream of
50 * FUA (without FLUSH) requests.
51 *
52 * For devices which support FUA, it isn't clear whether C2 (and thus C3)
53 * is beneficial.
54 *
55 * Note that a sequenced FLUSH/FUA request with DATA is completed twice.
56 * Once while executing DATA and again after the whole sequence is
57 * complete. The first completion updates the contained bio but doesn't
58 * finish it so that the bio submitter is notified only after the whole
59 * sequence is complete. This is implemented by testing REQ_FLUSH_SEQ in
60 * req_bio_endio().
61 *
62 * The above peculiarity requires that each FLUSH/FUA request has only one
63 * bio attached to it, which is guaranteed as they aren't allowed to be
64 * merged in the usual way.
65 */
66
67#include <linux/kernel.h>
68#include <linux/module.h>
69#include <linux/bio.h>
70#include <linux/blkdev.h>
71#include <linux/gfp.h>
72
73#include "blk.h"
74
75/* FLUSH/FUA sequences */
76enum {
77 REQ_FSEQ_PREFLUSH = (1 << 0), /* pre-flushing in progress */
78 REQ_FSEQ_DATA = (1 << 1), /* data write in progress */
79 REQ_FSEQ_POSTFLUSH = (1 << 2), /* post-flushing in progress */
80 REQ_FSEQ_DONE = (1 << 3),
81
82 REQ_FSEQ_ACTIONS = REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA |
83 REQ_FSEQ_POSTFLUSH,
84
85 /*
86 * If flush has been pending longer than the following timeout,
87 * it's issued even if flush_data requests are still in flight.
88 */
89 FLUSH_PENDING_TIMEOUT = 5 * HZ,
90};
91
92static bool blk_kick_flush(struct request_queue *q);
93
94static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq)
95{
96 unsigned int policy = 0;
97
98 if (fflags & REQ_FLUSH) {
99 if (rq->cmd_flags & REQ_FLUSH)
100 policy |= REQ_FSEQ_PREFLUSH;
101 if (blk_rq_sectors(rq))
102 policy |= REQ_FSEQ_DATA;
103 if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA))
104 policy |= REQ_FSEQ_POSTFLUSH;
105 }
106 return policy;
107}
108
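A worked illustration of the policy for a REQ_FLUSH|REQ_FUA write carrying data, for the two writeback-cache cases described in the header comment (it simply restates what blk_flush_policy() returns):

/*
 * Illustration only:
 *
 *   q->flush_flags          resulting policy
 *   ---------------------   -------------------------------------------------
 *   REQ_FLUSH               REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA | REQ_FSEQ_POSTFLUSH
 *   REQ_FLUSH | REQ_FUA     REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA   (REQ_FUA left on the request)
 */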
109static unsigned int blk_flush_cur_seq(struct request *rq)
110{
111 return 1 << ffz(rq->flush.seq);
112}
113
114static void blk_flush_restore_request(struct request *rq)
115{
116 /*
117 * After flush data completion, @rq->bio is %NULL but we need to
118 * complete the bio again. @rq->biotail is guaranteed to equal the
119 * original @rq->bio. Restore it.
120 */
121 rq->bio = rq->biotail;
122
123 /* make @rq a normal request */
124 rq->cmd_flags &= ~REQ_FLUSH_SEQ;
125 rq->end_io = NULL;
126}
127
128/**
129 * blk_flush_complete_seq - complete flush sequence
130 * @rq: FLUSH/FUA request being sequenced
131 * @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero)
132 * @error: whether an error occurred
133 *
134 * @rq just completed @seq part of its flush sequence, record the
135 * completion and trigger the next step.
136 *
137 * CONTEXT:
138 * spin_lock_irq(q->queue_lock)
139 *
140 * RETURNS:
141 * %true if requests were added to the dispatch queue, %false otherwise.
142 */
143static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
144 int error)
145{
146 struct request_queue *q = rq->q;
147 struct list_head *pending = &q->flush_queue[q->flush_pending_idx];
148 bool queued = false;
149
150 BUG_ON(rq->flush.seq & seq);
151 rq->flush.seq |= seq;
152
153 if (likely(!error))
154 seq = blk_flush_cur_seq(rq);
155 else
156 seq = REQ_FSEQ_DONE;
157
158 switch (seq) {
159 case REQ_FSEQ_PREFLUSH:
160 case REQ_FSEQ_POSTFLUSH:
161 /* queue for flush */
162 if (list_empty(pending))
163 q->flush_pending_since = jiffies;
164 list_move_tail(&rq->flush.list, pending);
165 break;
166
167 case REQ_FSEQ_DATA:
168 list_move_tail(&rq->flush.list, &q->flush_data_in_flight);
169 list_add(&rq->queuelist, &q->queue_head);
170 queued = true;
171 break;
172
173 case REQ_FSEQ_DONE:
174 /*
175 * @rq was previously adjusted by blk_flush_issue() for
176 * flush sequencing and may already have gone through the
177 * flush data request completion path. Restore @rq for
178 * normal completion and end it.
179 */
180 BUG_ON(!list_empty(&rq->queuelist));
181 list_del_init(&rq->flush.list);
182 blk_flush_restore_request(rq);
183 __blk_end_request_all(rq, error);
184 break;
185
186 default:
187 BUG();
188 }
189
190 return blk_kick_flush(q) | queued;
191}
192
193static void flush_end_io(struct request *flush_rq, int error)
194{
195 struct request_queue *q = flush_rq->q;
196 struct list_head *running = &q->flush_queue[q->flush_running_idx];
197 bool queued = false;
198 struct request *rq, *n;
199
200 BUG_ON(q->flush_pending_idx == q->flush_running_idx);
201
202 /* account completion of the flush request */
203 q->flush_running_idx ^= 1;
204 elv_completed_request(q, flush_rq);
205
206 /* and push the waiting requests to the next stage */
207 list_for_each_entry_safe(rq, n, running, flush.list) {
208 unsigned int seq = blk_flush_cur_seq(rq);
209
210 BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH);
211 queued |= blk_flush_complete_seq(rq, seq, error);
212 }
213
214 /*
215 * Kick the queue to avoid a stall in two cases:
216 * 1. Moving a request silently to an empty queue_head may stall the
217 * queue.
218 * 2. When a flush request is running in a non-queueable queue, the
219 * queue is held. Restart the queue after the flush request finishes
220 * to avoid a stall.
221 * This function is called from the request completion path and calling
222 * directly into request_fn may confuse the driver. Always use
223 * kblockd.
224 */
225 if (queued || q->flush_queue_delayed)
226 blk_run_queue_async(q);
227 q->flush_queue_delayed = 0;
228}
229
230/**
231 * blk_kick_flush - consider issuing flush request
232 * @q: request_queue being kicked
233 *
234 * Flush related states of @q have changed, consider issuing flush request.
235 * Please read the comment at the top of this file for more info.
236 *
237 * CONTEXT:
238 * spin_lock_irq(q->queue_lock)
239 *
240 * RETURNS:
241 * %true if flush was issued, %false otherwise.
242 */
243static bool blk_kick_flush(struct request_queue *q)
244{
245 struct list_head *pending = &q->flush_queue[q->flush_pending_idx];
246 struct request *first_rq =
247 list_first_entry(pending, struct request, flush.list);
248
249 /* C1 described at the top of this file */
250 if (q->flush_pending_idx != q->flush_running_idx || list_empty(pending))
251 return false;
252
253 /* C2 and C3 */
254 if (!list_empty(&q->flush_data_in_flight) &&
255 time_before(jiffies,
256 q->flush_pending_since + FLUSH_PENDING_TIMEOUT))
257 return false;
258
259 /*
260 * Issue flush and toggle pending_idx. This makes pending_idx
261 * different from running_idx, which means flush is in flight.
262 */
263 blk_rq_init(q, &q->flush_rq);
264 q->flush_rq.cmd_type = REQ_TYPE_FS;
265 q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
266 q->flush_rq.rq_disk = first_rq->rq_disk;
267 q->flush_rq.end_io = flush_end_io;
268
269 q->flush_pending_idx ^= 1;
270 list_add_tail(&q->flush_rq.queuelist, &q->queue_head);
271 return true;
272}
273
274static void flush_data_end_io(struct request *rq, int error)
275{
276 struct request_queue *q = rq->q;
277
278 /*
279 * After populating an empty queue, kick it to avoid stall. Read
280 * the comment in flush_end_io().
281 */
282 if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error))
283 blk_run_queue_async(q);
284}
285
286/**
287 * blk_insert_flush - insert a new FLUSH/FUA request
288 * @rq: request to insert
289 *
290 * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions.
291 * @rq is being submitted. Analyze what needs to be done and put it on the
292 * right queue.
293 *
294 * CONTEXT:
295 * spin_lock_irq(q->queue_lock)
296 */
297void blk_insert_flush(struct request *rq)
298{
299 struct request_queue *q = rq->q;
300 unsigned int fflags = q->flush_flags; /* may change, cache */
301 unsigned int policy = blk_flush_policy(fflags, rq);
302
303 BUG_ON(rq->end_io);
304 BUG_ON(!rq->bio || rq->bio != rq->biotail);
305
306 /*
307 * @policy now records what operations need to be done. Adjust
308 * REQ_FLUSH and FUA for the driver.
309 */
310 rq->cmd_flags &= ~REQ_FLUSH;
311 if (!(fflags & REQ_FUA))
312 rq->cmd_flags &= ~REQ_FUA;
313
314 /*
315 * If there's data but flush is not necessary, the request can be
316 * processed directly without going through flush machinery. Queue
317 * for normal execution.
318 */
319 if ((policy & REQ_FSEQ_DATA) &&
320 !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
321 list_add_tail(&rq->queuelist, &q->queue_head);
322 return;
323 }
324
325 /*
326 * @rq should go through flush machinery. Mark it part of flush
327 * sequence and submit for further processing.
328 */
329 memset(&rq->flush, 0, sizeof(rq->flush));
330 INIT_LIST_HEAD(&rq->flush.list);
331 rq->cmd_flags |= REQ_FLUSH_SEQ;
332 rq->end_io = flush_data_end_io;
333
334 blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);
335}
336
337/**
338 * blk_abort_flushes - @q is being aborted, abort flush requests
339 * @q: request_queue being aborted
340 *
341 * To be called from elv_abort_queue(). @q is being aborted. Prepare all
342 * FLUSH/FUA requests for abortion.
343 *
344 * CONTEXT:
345 * spin_lock_irq(q->queue_lock)
346 */
347void blk_abort_flushes(struct request_queue *q)
348{
349 struct request *rq, *n;
350 int i;
351
352 /*
353 * Requests in flight for data are already owned by the dispatch
354 * queue or the device driver. Just restore for normal completion.
355 */
356 list_for_each_entry_safe(rq, n, &q->flush_data_in_flight, flush.list) {
357 list_del_init(&rq->flush.list);
358 blk_flush_restore_request(rq);
359 }
360
361 /*
362 * We need to give away requests on flush queues. Restore for
363 * normal completion and put them on the dispatch queue.
364 */
365 for (i = 0; i < ARRAY_SIZE(q->flush_queue); i++) {
366 list_for_each_entry_safe(rq, n, &q->flush_queue[i],
367 flush.list) {
368 list_del_init(&rq->flush.list);
369 blk_flush_restore_request(rq);
370 list_add_tail(&rq->queuelist, &q->queue_head);
371 }
372 }
373}
374
375static void bio_end_flush(struct bio *bio, int err)
376{
377 if (err)
378 clear_bit(BIO_UPTODATE, &bio->bi_flags);
379 if (bio->bi_private)
380 complete(bio->bi_private);
381 bio_put(bio);
382}
383
384/**
385 * blkdev_issue_flush - queue a flush
386 * @bdev: blockdev to issue flush for
387 * @gfp_mask: memory allocation flags (for bio_alloc)
388 * @error_sector: error sector
389 *
390 * Description:
391 * Issue a flush for the block device in question. Caller can supply
392 * room for storing the error offset in case of a flush error, if they
393 * wish to. The flush is issued and waited for synchronously, so the
394 * flush request has completed by the time this returns.
395 */
396int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
397 sector_t *error_sector)
398{
399 DECLARE_COMPLETION_ONSTACK(wait);
400 struct request_queue *q;
401 struct bio *bio;
402 int ret = 0;
403
404 if (bdev->bd_disk == NULL)
405 return -ENXIO;
406
407 q = bdev_get_queue(bdev);
408 if (!q)
409 return -ENXIO;
410
411 /*
412 * some block devices may not have their queue correctly set up here
413 * (e.g. loop device without a backing file) and so issuing a flush
414 * here will panic. Ensure there is a request function before issuing
415 * the flush.
416 */
417 if (!q->make_request_fn)
418 return -ENXIO;
419
420 bio = bio_alloc(gfp_mask, 0);
421 bio->bi_end_io = bio_end_flush;
422 bio->bi_bdev = bdev;
423 bio->bi_private = &wait;
424
425 bio_get(bio);
426 submit_bio(WRITE_FLUSH, bio);
427 wait_for_completion(&wait);
428
429 /*
430 * The driver must store the error location in ->bi_sector, if
431 * it supports it. For non-stacked drivers, this should be
432 * copied from blk_rq_pos(rq).
433 */
434 if (error_sector)
435 *error_sector = bio->bi_sector;
436
437 if (!bio_flagged(bio, BIO_UPTODATE))
438 ret = -EIO;
439
440 bio_put(bio);
441 return ret;
442}
443EXPORT_SYMBOL(blkdev_issue_flush);
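A minimal usage sketch (flush_volume() is a hypothetical helper; callers that care about the failing offset would pass a real error_sector pointer):

static int flush_volume(struct block_device *bdev)
{
	/* waits for the device cache flush to complete; no error sector needed */
	return blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
}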
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index edce1ef7933d..129b9e209a3b 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -30,26 +30,41 @@
30 30
31static struct kmem_cache *integrity_cachep; 31static struct kmem_cache *integrity_cachep;
32 32
33static const char *bi_unsupported_name = "unsupported";
34
33/** 35/**
34 * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements 36 * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements
35 * @rq: request with integrity metadata attached 37 * @q: request queue
38 * @bio: bio with integrity metadata attached
36 * 39 *
37 * Description: Returns the number of elements required in a 40 * Description: Returns the number of elements required in a
38 * scatterlist corresponding to the integrity metadata in a request. 41 * scatterlist corresponding to the integrity metadata in a bio.
39 */ 42 */
40int blk_rq_count_integrity_sg(struct request *rq) 43int blk_rq_count_integrity_sg(struct request_queue *q, struct bio *bio)
41{ 44{
42 struct bio_vec *iv, *ivprv; 45 struct bio_vec *iv, *ivprv = NULL;
43 struct req_iterator iter; 46 unsigned int segments = 0;
44 unsigned int segments; 47 unsigned int seg_size = 0;
48 unsigned int i = 0;
49
50 bio_for_each_integrity_vec(iv, bio, i) {
51
52 if (ivprv) {
53 if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv))
54 goto new_segment;
45 55
46 ivprv = NULL; 56 if (!BIOVEC_SEG_BOUNDARY(q, ivprv, iv))
47 segments = 0; 57 goto new_segment;
48 58
49 rq_for_each_integrity_segment(iv, rq, iter) { 59 if (seg_size + iv->bv_len > queue_max_segment_size(q))
60 goto new_segment;
50 61
51 if (!ivprv || !BIOVEC_PHYS_MERGEABLE(ivprv, iv)) 62 seg_size += iv->bv_len;
63 } else {
64new_segment:
52 segments++; 65 segments++;
66 seg_size = iv->bv_len;
67 }
53 68
54 ivprv = iv; 69 ivprv = iv;
55 } 70 }
@@ -60,30 +75,34 @@ EXPORT_SYMBOL(blk_rq_count_integrity_sg);
60 75
61/** 76/**
62 * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist 77 * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist
63 * @rq: request with integrity metadata attached 78 * @q: request queue
79 * @bio: bio with integrity metadata attached
64 * @sglist: target scatterlist 80 * @sglist: target scatterlist
65 * 81 *
66 * Description: Map the integrity vectors in request into a 82 * Description: Map the integrity vectors in request into a
67 * scatterlist. The scatterlist must be big enough to hold all 83 * scatterlist. The scatterlist must be big enough to hold all
68 * elements. I.e. sized using blk_rq_count_integrity_sg(). 84 * elements. I.e. sized using blk_rq_count_integrity_sg().
69 */ 85 */
70int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist) 86int blk_rq_map_integrity_sg(struct request_queue *q, struct bio *bio,
87 struct scatterlist *sglist)
71{ 88{
72 struct bio_vec *iv, *ivprv; 89 struct bio_vec *iv, *ivprv = NULL;
73 struct req_iterator iter; 90 struct scatterlist *sg = NULL;
74 struct scatterlist *sg; 91 unsigned int segments = 0;
75 unsigned int segments; 92 unsigned int i = 0;
76
77 ivprv = NULL;
78 sg = NULL;
79 segments = 0;
80 93
81 rq_for_each_integrity_segment(iv, rq, iter) { 94 bio_for_each_integrity_vec(iv, bio, i) {
82 95
83 if (ivprv) { 96 if (ivprv) {
84 if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv)) 97 if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv))
85 goto new_segment; 98 goto new_segment;
86 99
100 if (!BIOVEC_SEG_BOUNDARY(q, ivprv, iv))
101 goto new_segment;
102
103 if (sg->length + iv->bv_len > queue_max_segment_size(q))
104 goto new_segment;
105
87 sg->length += iv->bv_len; 106 sg->length += iv->bv_len;
88 } else { 107 } else {
89new_segment: 108new_segment:
@@ -162,6 +181,40 @@ int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2)
162} 181}
163EXPORT_SYMBOL(blk_integrity_compare); 182EXPORT_SYMBOL(blk_integrity_compare);
164 183
184int blk_integrity_merge_rq(struct request_queue *q, struct request *req,
185 struct request *next)
186{
187 if (blk_integrity_rq(req) != blk_integrity_rq(next))
188 return -1;
189
190 if (req->nr_integrity_segments + next->nr_integrity_segments >
191 q->limits.max_integrity_segments)
192 return -1;
193
194 return 0;
195}
196EXPORT_SYMBOL(blk_integrity_merge_rq);
197
198int blk_integrity_merge_bio(struct request_queue *q, struct request *req,
199 struct bio *bio)
200{
201 int nr_integrity_segs;
202 struct bio *next = bio->bi_next;
203
204 bio->bi_next = NULL;
205 nr_integrity_segs = blk_rq_count_integrity_sg(q, bio);
206 bio->bi_next = next;
207
208 if (req->nr_integrity_segments + nr_integrity_segs >
209 q->limits.max_integrity_segments)
210 return -1;
211
212 req->nr_integrity_segments += nr_integrity_segs;
213
214 return 0;
215}
216EXPORT_SYMBOL(blk_integrity_merge_bio);
217
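A sketch of how a low-level driver might use the reworked per-bio helpers, assuming it has already allocated a scatterlist large enough for q->limits.max_integrity_segments entries; map_rq_integrity() is a hypothetical name.

static int map_rq_integrity(struct request *rq, struct scatterlist *sgl)
{
	struct request_queue *q = rq->q;
	int count;

	count = blk_rq_count_integrity_sg(q, rq->bio);
	if (count > q->limits.max_integrity_segments)
		return -EIO;

	/* returns the number of scatterlist entries actually used */
	return blk_rq_map_integrity_sg(q, rq->bio, sgl);
}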
165struct integrity_sysfs_entry { 218struct integrity_sysfs_entry {
166 struct attribute attr; 219 struct attribute attr;
167 ssize_t (*show)(struct blk_integrity *, char *); 220 ssize_t (*show)(struct blk_integrity *, char *);
@@ -307,6 +360,14 @@ static struct kobj_type integrity_ktype = {
307 .release = blk_integrity_release, 360 .release = blk_integrity_release,
308}; 361};
309 362
363bool blk_integrity_is_initialized(struct gendisk *disk)
364{
365 struct blk_integrity *bi = blk_get_integrity(disk);
366
367 return (bi && bi->name && strcmp(bi->name, bi_unsupported_name) != 0);
368}
369EXPORT_SYMBOL(blk_integrity_is_initialized);
370
310/** 371/**
311 * blk_integrity_register - Register a gendisk as being integrity-capable 372 * blk_integrity_register - Register a gendisk as being integrity-capable
312 * @disk: struct gendisk pointer to make integrity-aware 373 * @disk: struct gendisk pointer to make integrity-aware
@@ -356,7 +417,7 @@ int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template)
356 bi->get_tag_fn = template->get_tag_fn; 417 bi->get_tag_fn = template->get_tag_fn;
357 bi->tag_size = template->tag_size; 418 bi->tag_size = template->tag_size;
358 } else 419 } else
359 bi->name = "unsupported"; 420 bi->name = bi_unsupported_name;
360 421
361 return 0; 422 return 0;
362} 423}
@@ -381,7 +442,6 @@ void blk_integrity_unregister(struct gendisk *disk)
381 kobject_uevent(&bi->kobj, KOBJ_REMOVE); 442 kobject_uevent(&bi->kobj, KOBJ_REMOVE);
382 kobject_del(&bi->kobj); 443 kobject_del(&bi->kobj);
383 kobject_put(&bi->kobj); 444 kobject_put(&bi->kobj);
384 kmem_cache_free(integrity_cachep, bi);
385 disk->integrity = NULL; 445 disk->integrity = NULL;
386} 446}
387EXPORT_SYMBOL(blk_integrity_unregister); 447EXPORT_SYMBOL(blk_integrity_unregister);
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index d22c4c55c406..342eae9b0d3c 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -21,7 +21,7 @@ static void cfq_dtor(struct io_context *ioc)
21 if (!hlist_empty(&ioc->cic_list)) { 21 if (!hlist_empty(&ioc->cic_list)) {
22 struct cfq_io_context *cic; 22 struct cfq_io_context *cic;
23 23
24 cic = list_entry(ioc->cic_list.first, struct cfq_io_context, 24 cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context,
25 cic_list); 25 cic_list);
26 cic->dtor(ioc); 26 cic->dtor(ioc);
27 } 27 }
@@ -57,14 +57,14 @@ static void cfq_exit(struct io_context *ioc)
57 if (!hlist_empty(&ioc->cic_list)) { 57 if (!hlist_empty(&ioc->cic_list)) {
58 struct cfq_io_context *cic; 58 struct cfq_io_context *cic;
59 59
60 cic = list_entry(ioc->cic_list.first, struct cfq_io_context, 60 cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context,
61 cic_list); 61 cic_list);
62 cic->exit(ioc); 62 cic->exit(ioc);
63 } 63 }
64 rcu_read_unlock(); 64 rcu_read_unlock();
65} 65}
66 66
67/* Called by the exitting task */ 67/* Called by the exiting task */
68void exit_io_context(struct task_struct *task) 68void exit_io_context(struct task_struct *task)
69{ 69{
70 struct io_context *ioc; 70 struct io_context *ioc;
@@ -74,10 +74,9 @@ void exit_io_context(struct task_struct *task)
74 task->io_context = NULL; 74 task->io_context = NULL;
75 task_unlock(task); 75 task_unlock(task);
76 76
77 if (atomic_dec_and_test(&ioc->nr_tasks)) { 77 if (atomic_dec_and_test(&ioc->nr_tasks))
78 cfq_exit(ioc); 78 cfq_exit(ioc);
79 79
80 }
81 put_io_context(ioc); 80 put_io_context(ioc);
82} 81}
83 82
@@ -97,6 +96,9 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
97 INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); 96 INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH);
98 INIT_HLIST_HEAD(&ret->cic_list); 97 INIT_HLIST_HEAD(&ret->cic_list);
99 ret->ioc_data = NULL; 98 ret->ioc_data = NULL;
99#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
100 ret->cgroup_changed = 0;
101#endif
100 } 102 }
101 103
102 return ret; 104 return ret;
@@ -153,20 +155,6 @@ struct io_context *get_io_context(gfp_t gfp_flags, int node)
153} 155}
154EXPORT_SYMBOL(get_io_context); 156EXPORT_SYMBOL(get_io_context);
155 157
156void copy_io_context(struct io_context **pdst, struct io_context **psrc)
157{
158 struct io_context *src = *psrc;
159 struct io_context *dst = *pdst;
160
161 if (src) {
162 BUG_ON(atomic_long_read(&src->refcount) == 0);
163 atomic_long_inc(&src->refcount);
164 put_io_context(dst);
165 *pdst = src;
166 }
167}
168EXPORT_SYMBOL(copy_io_context);
169
170static int __init blk_ioc_init(void) 158static int __init blk_ioc_init(void)
171{ 159{
172 iocontext_cachep = kmem_cache_create("blkdev_ioc", 160 iocontext_cachep = kmem_cache_create("blkdev_ioc",
diff --git a/block/blk-lib.c b/block/blk-lib.c
index c392029a104e..78e627e2581d 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -9,17 +9,20 @@
9 9
10#include "blk.h" 10#include "blk.h"
11 11
12static void blkdev_discard_end_io(struct bio *bio, int err) 12struct bio_batch {
13{ 13 atomic_t done;
14 if (err) { 14 unsigned long flags;
15 if (err == -EOPNOTSUPP) 15 struct completion *wait;
16 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); 16};
17 clear_bit(BIO_UPTODATE, &bio->bi_flags);
18 }
19 17
20 if (bio->bi_private) 18static void bio_batch_end_io(struct bio *bio, int err)
21 complete(bio->bi_private); 19{
20 struct bio_batch *bb = bio->bi_private;
22 21
22 if (err && (err != -EOPNOTSUPP))
23 clear_bit(BIO_UPTODATE, &bb->flags);
24 if (atomic_dec_and_test(&bb->done))
25 complete(bb->wait);
23 bio_put(bio); 26 bio_put(bio);
24} 27}
25 28
@@ -39,9 +42,9 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
39{ 42{
40 DECLARE_COMPLETION_ONSTACK(wait); 43 DECLARE_COMPLETION_ONSTACK(wait);
41 struct request_queue *q = bdev_get_queue(bdev); 44 struct request_queue *q = bdev_get_queue(bdev);
42 int type = flags & BLKDEV_IFL_BARRIER ? 45 int type = REQ_WRITE | REQ_DISCARD;
43 DISCARD_BARRIER : DISCARD_NOBARRIER;
44 unsigned int max_discard_sectors; 46 unsigned int max_discard_sectors;
47 struct bio_batch bb;
45 struct bio *bio; 48 struct bio *bio;
46 int ret = 0; 49 int ret = 0;
47 50
@@ -62,13 +65,17 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
62 max_discard_sectors &= ~(disc_sects - 1); 65 max_discard_sectors &= ~(disc_sects - 1);
63 } 66 }
64 67
65 if (flags & BLKDEV_IFL_SECURE) { 68 if (flags & BLKDEV_DISCARD_SECURE) {
66 if (!blk_queue_secdiscard(q)) 69 if (!blk_queue_secdiscard(q))
67 return -EOPNOTSUPP; 70 return -EOPNOTSUPP;
68 type |= DISCARD_SECURE; 71 type |= REQ_SECURE;
69 } 72 }
70 73
71 while (nr_sects && !ret) { 74 atomic_set(&bb.done, 1);
75 bb.flags = 1 << BIO_UPTODATE;
76 bb.wait = &wait;
77
78 while (nr_sects) {
72 bio = bio_alloc(gfp_mask, 1); 79 bio = bio_alloc(gfp_mask, 1);
73 if (!bio) { 80 if (!bio) {
74 ret = -ENOMEM; 81 ret = -ENOMEM;
@@ -76,10 +83,9 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
76 } 83 }
77 84
78 bio->bi_sector = sector; 85 bio->bi_sector = sector;
79 bio->bi_end_io = blkdev_discard_end_io; 86 bio->bi_end_io = bio_batch_end_io;
80 bio->bi_bdev = bdev; 87 bio->bi_bdev = bdev;
81 if (flags & BLKDEV_IFL_WAIT) 88 bio->bi_private = &bb;
82 bio->bi_private = &wait;
83 89
84 if (nr_sects > max_discard_sectors) { 90 if (nr_sects > max_discard_sectors) {
85 bio->bi_size = max_discard_sectors << 9; 91 bio->bi_size = max_discard_sectors << 9;
@@ -90,85 +96,45 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
90 nr_sects = 0; 96 nr_sects = 0;
91 } 97 }
92 98
93 bio_get(bio); 99 atomic_inc(&bb.done);
94 submit_bio(type, bio); 100 submit_bio(type, bio);
101 }
95 102
96 if (flags & BLKDEV_IFL_WAIT) 103 /* Wait for bios in-flight */
97 wait_for_completion(&wait); 104 if (!atomic_dec_and_test(&bb.done))
105 wait_for_completion(&wait);
98 106
99 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 107 if (!test_bit(BIO_UPTODATE, &bb.flags))
100 ret = -EOPNOTSUPP; 108 ret = -EIO;
101 else if (!bio_flagged(bio, BIO_UPTODATE))
102 ret = -EIO;
103 bio_put(bio);
104 }
105 109
106 return ret; 110 return ret;
107} 111}
108EXPORT_SYMBOL(blkdev_issue_discard); 112EXPORT_SYMBOL(blkdev_issue_discard);
109 113
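Usage sketch for the reworked interface (trim_range() is a hypothetical caller that treats a missing discard capability as a no-op):

static int trim_range(struct block_device *bdev, sector_t start,
		      sector_t nr_sects, bool secure)
{
	unsigned long flags = secure ? BLKDEV_DISCARD_SECURE : 0;
	int err;

	err = blkdev_issue_discard(bdev, start, nr_sects, GFP_KERNEL, flags);
	if (err == -EOPNOTSUPP)
		err = 0;	/* device cannot discard: nothing to do */
	return err;
}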
110struct bio_batch
111{
112 atomic_t done;
113 unsigned long flags;
114 struct completion *wait;
115 bio_end_io_t *end_io;
116};
117
118static void bio_batch_end_io(struct bio *bio, int err)
119{
120 struct bio_batch *bb = bio->bi_private;
121
122 if (err) {
123 if (err == -EOPNOTSUPP)
124 set_bit(BIO_EOPNOTSUPP, &bb->flags);
125 else
126 clear_bit(BIO_UPTODATE, &bb->flags);
127 }
128 if (bb) {
129 if (bb->end_io)
130 bb->end_io(bio, err);
131 atomic_inc(&bb->done);
132 complete(bb->wait);
133 }
134 bio_put(bio);
135}
136
137/** 114/**
138 * blkdev_issue_zeroout generate number of zero filed write bios 115 * blkdev_issue_zeroout - generate a number of zero-filled write bios
139 * @bdev: blockdev to issue 116 * @bdev: blockdev to issue
140 * @sector: start sector 117 * @sector: start sector
141 * @nr_sects: number of sectors to write 118 * @nr_sects: number of sectors to write
142 * @gfp_mask: memory allocation flags (for bio_alloc) 119 * @gfp_mask: memory allocation flags (for bio_alloc)
143 * @flags: BLKDEV_IFL_* flags to control behaviour
144 * 120 *
145 * Description: 121 * Description:
146 * Generate and issue a number of bios with zero-filled pages. 122 * Generate and issue a number of bios with zero-filled pages.
147 * Send barrier at the beginning and at the end if requested. This guarantie
148 * correct request ordering. Empty barrier allow us to avoid post queue flush.
149 */ 123 */
150 124
151int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, 125int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
152 sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) 126 sector_t nr_sects, gfp_t gfp_mask)
153{ 127{
154 int ret; 128 int ret;
155 struct bio *bio; 129 struct bio *bio;
156 struct bio_batch bb; 130 struct bio_batch bb;
157 unsigned int sz, issued = 0; 131 unsigned int sz;
158 DECLARE_COMPLETION_ONSTACK(wait); 132 DECLARE_COMPLETION_ONSTACK(wait);
159 133
160 atomic_set(&bb.done, 0); 134 atomic_set(&bb.done, 1);
161 bb.flags = 1 << BIO_UPTODATE; 135 bb.flags = 1 << BIO_UPTODATE;
162 bb.wait = &wait; 136 bb.wait = &wait;
163 bb.end_io = NULL;
164 137
165 if (flags & BLKDEV_IFL_BARRIER) {
166 /* issue async barrier before the data */
167 ret = blkdev_issue_flush(bdev, gfp_mask, NULL, 0);
168 if (ret)
169 return ret;
170 }
171submit:
172 ret = 0; 138 ret = 0;
173 while (nr_sects != 0) { 139 while (nr_sects != 0) {
174 bio = bio_alloc(gfp_mask, 140 bio = bio_alloc(gfp_mask,
@@ -181,14 +147,10 @@ submit:
181 bio->bi_sector = sector; 147 bio->bi_sector = sector;
182 bio->bi_bdev = bdev; 148 bio->bi_bdev = bdev;
183 bio->bi_end_io = bio_batch_end_io; 149 bio->bi_end_io = bio_batch_end_io;
184 if (flags & BLKDEV_IFL_WAIT) 150 bio->bi_private = &bb;
185 bio->bi_private = &bb;
186 151
187 while (nr_sects != 0) { 152 while (nr_sects != 0) {
188 sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects); 153 sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects);
189 if (sz == 0)
190 /* bio has maximum size possible */
191 break;
192 ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0); 154 ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0);
193 nr_sects -= ret >> 9; 155 nr_sects -= ret >> 9;
194 sector += ret >> 9; 156 sector += ret >> 9;
@@ -196,36 +158,18 @@ submit:
196 break; 158 break;
197 } 159 }
198 ret = 0; 160 ret = 0;
199 issued++; 161 atomic_inc(&bb.done);
200 submit_bio(WRITE, bio); 162 submit_bio(WRITE, bio);
201 } 163 }
202 /*
203 * When all data bios are in flight. Send final barrier if requeted.
204 */
205 if (nr_sects == 0 && flags & BLKDEV_IFL_BARRIER)
206 ret = blkdev_issue_flush(bdev, gfp_mask, NULL,
207 flags & BLKDEV_IFL_WAIT);
208
209 164
210 if (flags & BLKDEV_IFL_WAIT) 165 /* Wait for bios in-flight */
211 /* Wait for bios in-flight */ 166 if (!atomic_dec_and_test(&bb.done))
212 while ( issued != atomic_read(&bb.done)) 167 wait_for_completion(&wait);
213 wait_for_completion(&wait);
214 168
215 if (!test_bit(BIO_UPTODATE, &bb.flags)) 169 if (!test_bit(BIO_UPTODATE, &bb.flags))
216 /* One of the bios in the batch completed with an error. */ 170 /* One of the bios in the batch completed with an error. */
217 ret = -EIO; 171 ret = -EIO;
218 172
219 if (ret)
220 goto out;
221
222 if (test_bit(BIO_EOPNOTSUPP, &bb.flags)) {
223 ret = -EOPNOTSUPP;
224 goto out;
225 }
226 if (nr_sects != 0)
227 goto submit;
228out:
229 return ret; 173 return ret;
230} 174}
231EXPORT_SYMBOL(blkdev_issue_zeroout); 175EXPORT_SYMBOL(blkdev_issue_zeroout);
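With the BLKDEV_IFL_* flags gone the call is always synchronous; a minimal sketch (zero_range() is a hypothetical wrapper):

static int zero_range(struct block_device *bdev, sector_t start, sector_t nr_sects)
{
	return blkdev_issue_zeroout(bdev, start, nr_sects, GFP_NOFS);
}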
diff --git a/block/blk-map.c b/block/blk-map.c
index ade0a08c9099..e663ac2d8e68 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -54,7 +54,7 @@ static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
54 * direct dma. else, set up kernel bounce buffers 54 * direct dma. else, set up kernel bounce buffers
55 */ 55 */
56 uaddr = (unsigned long) ubuf; 56 uaddr = (unsigned long) ubuf;
57 if (blk_rq_aligned(q, ubuf, len) && !map_data) 57 if (blk_rq_aligned(q, uaddr, len) && !map_data)
58 bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask); 58 bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask);
59 else 59 else
60 bio = bio_copy_user(q, map_data, uaddr, len, reading, gfp_mask); 60 bio = bio_copy_user(q, map_data, uaddr, len, reading, gfp_mask);
@@ -201,6 +201,9 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
201 for (i = 0; i < iov_count; i++) { 201 for (i = 0; i < iov_count; i++) {
202 unsigned long uaddr = (unsigned long)iov[i].iov_base; 202 unsigned long uaddr = (unsigned long)iov[i].iov_base;
203 203
204 if (!iov[i].iov_len)
205 return -EINVAL;
206
204 if (uaddr & queue_dma_alignment(q)) { 207 if (uaddr & queue_dma_alignment(q)) {
205 unaligned = 1; 208 unaligned = 1;
206 break; 209 break;
@@ -288,6 +291,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
288 unsigned int len, gfp_t gfp_mask) 291 unsigned int len, gfp_t gfp_mask)
289{ 292{
290 int reading = rq_data_dir(rq) == READ; 293 int reading = rq_data_dir(rq) == READ;
294 unsigned long addr = (unsigned long) kbuf;
291 int do_copy = 0; 295 int do_copy = 0;
292 struct bio *bio; 296 struct bio *bio;
293 int ret; 297 int ret;
@@ -297,7 +301,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
297 if (!len || !kbuf) 301 if (!len || !kbuf)
298 return -EINVAL; 302 return -EINVAL;
299 303
300 do_copy = !blk_rq_aligned(q, kbuf, len) || object_is_on_stack(kbuf); 304 do_copy = !blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf);
301 if (do_copy) 305 if (do_copy)
302 bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading); 306 bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading);
303 else 307 else
diff --git a/block/blk-merge.c b/block/blk-merge.c
index eafc94f68d79..cfcc37cb222b 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -21,7 +21,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
21 return 0; 21 return 0;
22 22
23 fbio = bio; 23 fbio = bio;
24 cluster = test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); 24 cluster = blk_queue_cluster(q);
25 seg_size = 0; 25 seg_size = 0;
26 nr_phys_segs = 0; 26 nr_phys_segs = 0;
27 for_each_bio(bio) { 27 for_each_bio(bio) {
@@ -87,7 +87,7 @@ EXPORT_SYMBOL(blk_recount_segments);
87static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, 87static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
88 struct bio *nxt) 88 struct bio *nxt)
89{ 89{
90 if (!test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags)) 90 if (!blk_queue_cluster(q))
91 return 0; 91 return 0;
92 92
93 if (bio->bi_seg_back_size + nxt->bi_seg_front_size > 93 if (bio->bi_seg_back_size + nxt->bi_seg_front_size >
@@ -123,7 +123,7 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
123 int nsegs, cluster; 123 int nsegs, cluster;
124 124
125 nsegs = 0; 125 nsegs = 0;
126 cluster = test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); 126 cluster = blk_queue_cluster(q);
127 127
128 /* 128 /*
129 * for each bio in rq 129 * for each bio in rq
@@ -205,12 +205,11 @@ static inline int ll_new_hw_segment(struct request_queue *q,
205{ 205{
206 int nr_phys_segs = bio_phys_segments(q, bio); 206 int nr_phys_segs = bio_phys_segments(q, bio);
207 207
208 if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) { 208 if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q))
209 req->cmd_flags |= REQ_NOMERGE; 209 goto no_merge;
210 if (req == q->last_merge) 210
211 q->last_merge = NULL; 211 if (bio_integrity(bio) && blk_integrity_merge_bio(q, req, bio))
212 return 0; 212 goto no_merge;
213 }
214 213
215 /* 214 /*
216 * This will form the start of a new hw segment. Bump both 215 * This will form the start of a new hw segment. Bump both
@@ -218,6 +217,12 @@ static inline int ll_new_hw_segment(struct request_queue *q,
218 */ 217 */
219 req->nr_phys_segments += nr_phys_segs; 218 req->nr_phys_segments += nr_phys_segs;
220 return 1; 219 return 1;
220
221no_merge:
222 req->cmd_flags |= REQ_NOMERGE;
223 if (req == q->last_merge)
224 q->last_merge = NULL;
225 return 0;
221} 226}
222 227
223int ll_back_merge_fn(struct request_queue *q, struct request *req, 228int ll_back_merge_fn(struct request_queue *q, struct request *req,
@@ -301,6 +306,9 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
301 if (total_phys_segments > queue_max_segments(q)) 306 if (total_phys_segments > queue_max_segments(q))
302 return 0; 307 return 0;
303 308
309 if (blk_integrity_rq(req) && blk_integrity_merge_rq(q, req, next))
310 return 0;
311
304 /* Merge is OK... */ 312 /* Merge is OK... */
305 req->nr_phys_segments = total_phys_segments; 313 req->nr_phys_segments = total_phys_segments;
306 return 1; 314 return 1;
@@ -343,11 +351,12 @@ static void blk_account_io_merge(struct request *req)
343 int cpu; 351 int cpu;
344 352
345 cpu = part_stat_lock(); 353 cpu = part_stat_lock();
346 part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); 354 part = req->part;
347 355
348 part_round_stats(cpu, part); 356 part_round_stats(cpu, part);
349 part_dec_in_flight(part, rq_data_dir(req)); 357 part_dec_in_flight(part, rq_data_dir(req));
350 358
359 hd_struct_put(part);
351 part_stat_unlock(); 360 part_stat_unlock();
352 } 361 }
353} 362}
@@ -384,9 +393,6 @@ static int attempt_merge(struct request_queue *q, struct request *req,
384 || next->special) 393 || next->special)
385 return 0; 394 return 0;
386 395
387 if (blk_integrity_rq(req) != blk_integrity_rq(next))
388 return 0;
389
390 /* 396 /*
391 * If we are allowed to merge, then append bio list 397 * If we are allowed to merge, then append bio list
392 * from next to rq and release next. merge_requests_fn 398 * from next to rq and release next. merge_requests_fn
@@ -459,3 +465,9 @@ int attempt_front_merge(struct request_queue *q, struct request *rq)
459 465
460 return 0; 466 return 0;
461} 467}
468
469int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
470 struct request *next)
471{
472 return attempt_merge(q, rq, next);
473}
diff --git a/block/blk-settings.c b/block/blk-settings.c
index a234f4bf1d6f..fa1eb0449a05 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -111,6 +111,7 @@ EXPORT_SYMBOL_GPL(blk_queue_lld_busy);
111void blk_set_default_limits(struct queue_limits *lim) 111void blk_set_default_limits(struct queue_limits *lim)
112{ 112{
113 lim->max_segments = BLK_MAX_SEGMENTS; 113 lim->max_segments = BLK_MAX_SEGMENTS;
114 lim->max_integrity_segments = 0;
114 lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; 115 lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
115 lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; 116 lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
116 lim->max_sectors = BLK_DEF_MAX_SECTORS; 117 lim->max_sectors = BLK_DEF_MAX_SECTORS;
@@ -119,13 +120,13 @@ void blk_set_default_limits(struct queue_limits *lim)
119 lim->discard_granularity = 0; 120 lim->discard_granularity = 0;
120 lim->discard_alignment = 0; 121 lim->discard_alignment = 0;
121 lim->discard_misaligned = 0; 122 lim->discard_misaligned = 0;
122 lim->discard_zeroes_data = -1; 123 lim->discard_zeroes_data = 1;
123 lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; 124 lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
124 lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); 125 lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
125 lim->alignment_offset = 0; 126 lim->alignment_offset = 0;
126 lim->io_opt = 0; 127 lim->io_opt = 0;
127 lim->misaligned = 0; 128 lim->misaligned = 0;
128 lim->no_cluster = 0; 129 lim->cluster = 1;
129} 130}
130EXPORT_SYMBOL(blk_set_default_limits); 131EXPORT_SYMBOL(blk_set_default_limits);
131 132
@@ -163,23 +164,9 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
163 blk_queue_congestion_threshold(q); 164 blk_queue_congestion_threshold(q);
164 q->nr_batching = BLK_BATCH_REQ; 165 q->nr_batching = BLK_BATCH_REQ;
165 166
166 q->unplug_thresh = 4; /* hmm */
167 q->unplug_delay = msecs_to_jiffies(3); /* 3 milliseconds */
168 if (q->unplug_delay == 0)
169 q->unplug_delay = 1;
170
171 q->unplug_timer.function = blk_unplug_timeout;
172 q->unplug_timer.data = (unsigned long)q;
173
174 blk_set_default_limits(&q->limits); 167 blk_set_default_limits(&q->limits);
175 blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS); 168 blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS);
176 169 q->limits.discard_zeroes_data = 0;
177 /*
178 * If the caller didn't supply a lock, fall back to our embedded
179 * per-queue locks
180 */
181 if (!q->queue_lock)
182 q->queue_lock = &q->__queue_lock;
183 170
184 /* 171 /*
185 * by default assume old behaviour and bounce for any highmem page 172 * by default assume old behaviour and bounce for any highmem page
@@ -213,7 +200,7 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask)
213 */ 200 */
214 if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) 201 if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT))
215 dma = 1; 202 dma = 1;
216 q->limits.bounce_pfn = max_low_pfn; 203 q->limits.bounce_pfn = max(max_low_pfn, b_pfn);
217#else 204#else
218 if (b_pfn < blk_max_low_pfn) 205 if (b_pfn < blk_max_low_pfn)
219 dma = 1; 206 dma = 1;
@@ -228,8 +215,8 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask)
228EXPORT_SYMBOL(blk_queue_bounce_limit); 215EXPORT_SYMBOL(blk_queue_bounce_limit);
229 216
230/** 217/**
231 * blk_queue_max_hw_sectors - set max sectors for a request for this queue 218 * blk_limits_max_hw_sectors - set hard and soft limit of max sectors for request
232 * @q: the request queue for the device 219 * @limits: the queue limits
233 * @max_hw_sectors: max hardware sectors in the usual 512b unit 220 * @max_hw_sectors: max hardware sectors in the usual 512b unit
234 * 221 *
235 * Description: 222 * Description:
@@ -243,7 +230,7 @@ EXPORT_SYMBOL(blk_queue_bounce_limit);
243 * per-device basis in /sys/block/<device>/queue/max_sectors_kb. 230 * per-device basis in /sys/block/<device>/queue/max_sectors_kb.
244 * The soft limit can not exceed max_hw_sectors. 231 * The soft limit can not exceed max_hw_sectors.
245 **/ 232 **/
246void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) 233void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_sectors)
247{ 234{
248 if ((max_hw_sectors << 9) < PAGE_CACHE_SIZE) { 235 if ((max_hw_sectors << 9) < PAGE_CACHE_SIZE) {
249 max_hw_sectors = 1 << (PAGE_CACHE_SHIFT - 9); 236 max_hw_sectors = 1 << (PAGE_CACHE_SHIFT - 9);
@@ -251,9 +238,23 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto
251 __func__, max_hw_sectors); 238 __func__, max_hw_sectors);
252 } 239 }
253 240
254 q->limits.max_hw_sectors = max_hw_sectors; 241 limits->max_hw_sectors = max_hw_sectors;
255 q->limits.max_sectors = min_t(unsigned int, max_hw_sectors, 242 limits->max_sectors = min_t(unsigned int, max_hw_sectors,
256 BLK_DEF_MAX_SECTORS); 243 BLK_DEF_MAX_SECTORS);
244}
245EXPORT_SYMBOL(blk_limits_max_hw_sectors);
246
247/**
248 * blk_queue_max_hw_sectors - set max sectors for a request for this queue
249 * @q: the request queue for the device
250 * @max_hw_sectors: max hardware sectors in the usual 512b unit
251 *
252 * Description:
253 * See description for blk_limits_max_hw_sectors().
254 **/
255void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors)
256{
257 blk_limits_max_hw_sectors(&q->limits, max_hw_sectors);
257} 258}
258EXPORT_SYMBOL(blk_queue_max_hw_sectors); 259EXPORT_SYMBOL(blk_queue_max_hw_sectors);
259 260
@@ -343,7 +344,7 @@ EXPORT_SYMBOL(blk_queue_logical_block_size);
343 * hardware can operate on without reverting to read-modify-write 344 * hardware can operate on without reverting to read-modify-write
344 * operations. 345 * operations.
345 */ 346 */
346void blk_queue_physical_block_size(struct request_queue *q, unsigned short size) 347void blk_queue_physical_block_size(struct request_queue *q, unsigned int size)
347{ 348{
348 q->limits.physical_block_size = size; 349 q->limits.physical_block_size = size;
349 350
@@ -455,11 +456,6 @@ void blk_queue_io_opt(struct request_queue *q, unsigned int opt)
455} 456}
456EXPORT_SYMBOL(blk_queue_io_opt); 457EXPORT_SYMBOL(blk_queue_io_opt);
457 458
458/*
459 * Returns the minimum that is _not_ zero, unless both are zero.
460 */
461#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
462
463/** 459/**
464 * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers 460 * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers
465 * @t: the stacking driver (top) 461 * @t: the stacking driver (top)
@@ -468,15 +464,6 @@ EXPORT_SYMBOL(blk_queue_io_opt);
468void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) 464void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b)
469{ 465{
470 blk_stack_limits(&t->limits, &b->limits, 0); 466 blk_stack_limits(&t->limits, &b->limits, 0);
471
472 if (!t->queue_lock)
473 WARN_ON_ONCE(1);
474 else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) {
475 unsigned long flags;
476 spin_lock_irqsave(t->queue_lock, flags);
477 queue_flag_clear(QUEUE_FLAG_CLUSTER, t);
478 spin_unlock_irqrestore(t->queue_lock, flags);
479 }
480} 467}
481EXPORT_SYMBOL(blk_queue_stack_limits); 468EXPORT_SYMBOL(blk_queue_stack_limits);
482 469
@@ -514,6 +501,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
514 b->seg_boundary_mask); 501 b->seg_boundary_mask);
515 502
516 t->max_segments = min_not_zero(t->max_segments, b->max_segments); 503 t->max_segments = min_not_zero(t->max_segments, b->max_segments);
504 t->max_integrity_segments = min_not_zero(t->max_integrity_segments,
505 b->max_integrity_segments);
517 506
518 t->max_segment_size = min_not_zero(t->max_segment_size, 507 t->max_segment_size = min_not_zero(t->max_segment_size,
519 b->max_segment_size); 508 b->max_segment_size);
@@ -547,7 +536,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
547 t->io_min = max(t->io_min, b->io_min); 536 t->io_min = max(t->io_min, b->io_min);
548 t->io_opt = lcm(t->io_opt, b->io_opt); 537 t->io_opt = lcm(t->io_opt, b->io_opt);
549 538
550 t->no_cluster |= b->no_cluster; 539 t->cluster &= b->cluster;
551 t->discard_zeroes_data &= b->discard_zeroes_data; 540 t->discard_zeroes_data &= b->discard_zeroes_data;
552 541
553 /* Physical block size a multiple of the logical block size? */ 542 /* Physical block size a multiple of the logical block size? */
@@ -643,7 +632,6 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
643 sector_t offset) 632 sector_t offset)
644{ 633{
645 struct request_queue *t = disk->queue; 634 struct request_queue *t = disk->queue;
646 struct request_queue *b = bdev_get_queue(bdev);
647 635
648 if (bdev_stack_limits(&t->limits, bdev, offset >> 9) < 0) { 636 if (bdev_stack_limits(&t->limits, bdev, offset >> 9) < 0) {
649 char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE]; 637 char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE];
@@ -654,17 +642,6 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
654 printk(KERN_NOTICE "%s: Warning: Device %s is misaligned\n", 642 printk(KERN_NOTICE "%s: Warning: Device %s is misaligned\n",
655 top, bottom); 643 top, bottom);
656 } 644 }
657
658 if (!t->queue_lock)
659 WARN_ON_ONCE(1);
660 else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) {
661 unsigned long flags;
662
663 spin_lock_irqsave(t->queue_lock, flags);
664 if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags))
665 queue_flag_clear(QUEUE_FLAG_CLUSTER, t);
666 spin_unlock_irqrestore(t->queue_lock, flags);
667 }
668} 645}
669EXPORT_SYMBOL(disk_stack_limits); 646EXPORT_SYMBOL(disk_stack_limits);
670 647
@@ -794,6 +771,32 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask)
794} 771}
795EXPORT_SYMBOL(blk_queue_update_dma_alignment); 772EXPORT_SYMBOL(blk_queue_update_dma_alignment);
796 773
774/**
775 * blk_queue_flush - configure queue's cache flush capability
776 * @q: the request queue for the device
777 * @flush: 0, REQ_FLUSH or REQ_FLUSH | REQ_FUA
778 *
779 * Tell the block layer the cache flush capability of @q. If it supports
780 * flushing, REQ_FLUSH should be set. If it supports bypassing
781 * write cache for individual writes, REQ_FUA should be set.
782 */
783void blk_queue_flush(struct request_queue *q, unsigned int flush)
784{
785 WARN_ON_ONCE(flush & ~(REQ_FLUSH | REQ_FUA));
786
787 if (WARN_ON_ONCE(!(flush & REQ_FLUSH) && (flush & REQ_FUA)))
788 flush &= ~REQ_FUA;
789
790 q->flush_flags = flush & (REQ_FLUSH | REQ_FUA);
791}
792EXPORT_SYMBOL_GPL(blk_queue_flush);
793
794void blk_queue_flush_queueable(struct request_queue *q, bool queueable)
795{
796 q->flush_not_queueable = !queueable;
797}
798EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
799
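A sketch of how a driver might advertise its cache behaviour at queue-setup time; my_configure_cache(), wb_cache and fua are placeholders for state the driver probes from the hardware.

static void my_configure_cache(struct request_queue *q, bool wb_cache, bool fua)
{
	unsigned int flush = 0;

	if (wb_cache) {
		flush |= REQ_FLUSH;		/* completed writes may still sit in a volatile cache */
		if (fua)
			flush |= REQ_FUA;	/* device can force an individual write to media */
	}
	blk_queue_flush(q, flush);
}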
797static int __init blk_settings_init(void) 800static int __init blk_settings_init(void)
798{ 801{
799 blk_max_low_pfn = max_low_pfn - 1; 802 blk_max_low_pfn = max_low_pfn - 1;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 0749b89c6885..d935bd859c87 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -66,14 +66,14 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
66 66
67 if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { 67 if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
68 blk_set_queue_full(q, BLK_RW_SYNC); 68 blk_set_queue_full(q, BLK_RW_SYNC);
69 } else if (rl->count[BLK_RW_SYNC]+1 <= q->nr_requests) { 69 } else {
70 blk_clear_queue_full(q, BLK_RW_SYNC); 70 blk_clear_queue_full(q, BLK_RW_SYNC);
71 wake_up(&rl->wait[BLK_RW_SYNC]); 71 wake_up(&rl->wait[BLK_RW_SYNC]);
72 } 72 }
73 73
74 if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { 74 if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
75 blk_set_queue_full(q, BLK_RW_ASYNC); 75 blk_set_queue_full(q, BLK_RW_ASYNC);
76 } else if (rl->count[BLK_RW_ASYNC]+1 <= q->nr_requests) { 76 } else {
77 blk_clear_queue_full(q, BLK_RW_ASYNC); 77 blk_clear_queue_full(q, BLK_RW_ASYNC);
78 wake_up(&rl->wait[BLK_RW_ASYNC]); 78 wake_up(&rl->wait[BLK_RW_ASYNC]);
79 } 79 }
@@ -112,9 +112,14 @@ static ssize_t queue_max_segments_show(struct request_queue *q, char *page)
112 return queue_var_show(queue_max_segments(q), (page)); 112 return queue_var_show(queue_max_segments(q), (page));
113} 113}
114 114
115static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page)
116{
117 return queue_var_show(q->limits.max_integrity_segments, (page));
118}
119
115static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page) 120static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page)
116{ 121{
117 if (test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags)) 122 if (blk_queue_cluster(q))
118 return queue_var_show(queue_max_segment_size(q), (page)); 123 return queue_var_show(queue_max_segment_size(q), (page));
119 124
120 return queue_var_show(PAGE_CACHE_SIZE, (page)); 125 return queue_var_show(PAGE_CACHE_SIZE, (page));
@@ -147,7 +152,8 @@ static ssize_t queue_discard_granularity_show(struct request_queue *q, char *pag
147 152
148static ssize_t queue_discard_max_show(struct request_queue *q, char *page) 153static ssize_t queue_discard_max_show(struct request_queue *q, char *page)
149{ 154{
150 return queue_var_show(q->limits.max_discard_sectors << 9, page); 155 return sprintf(page, "%llu\n",
156 (unsigned long long)q->limits.max_discard_sectors << 9);
151} 157}
152 158
153static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page) 159static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page)
@@ -288,6 +294,11 @@ static struct queue_sysfs_entry queue_max_segments_entry = {
288 .show = queue_max_segments_show, 294 .show = queue_max_segments_show,
289}; 295};
290 296
297static struct queue_sysfs_entry queue_max_integrity_segments_entry = {
298 .attr = {.name = "max_integrity_segments", .mode = S_IRUGO },
299 .show = queue_max_integrity_segments_show,
300};
301
291static struct queue_sysfs_entry queue_max_segment_size_entry = { 302static struct queue_sysfs_entry queue_max_segment_size_entry = {
292 .attr = {.name = "max_segment_size", .mode = S_IRUGO }, 303 .attr = {.name = "max_segment_size", .mode = S_IRUGO },
293 .show = queue_max_segment_size_show, 304 .show = queue_max_segment_size_show,
@@ -375,6 +386,7 @@ static struct attribute *default_attrs[] = {
375 &queue_max_hw_sectors_entry.attr, 386 &queue_max_hw_sectors_entry.attr,
376 &queue_max_sectors_entry.attr, 387 &queue_max_sectors_entry.attr,
377 &queue_max_segments_entry.attr, 388 &queue_max_segments_entry.attr,
389 &queue_max_integrity_segments_entry.attr,
378 &queue_max_segment_size_entry.attr, 390 &queue_max_segment_size_entry.attr,
379 &queue_iosched_entry.attr, 391 &queue_iosched_entry.attr,
380 &queue_hw_sector_size_entry.attr, 392 &queue_hw_sector_size_entry.attr,
@@ -487,7 +499,6 @@ int blk_register_queue(struct gendisk *disk)
487{ 499{
488 int ret; 500 int ret;
489 struct device *dev = disk_to_dev(disk); 501 struct device *dev = disk_to_dev(disk);
490
491 struct request_queue *q = disk->queue; 502 struct request_queue *q = disk->queue;
492 503
493 if (WARN_ON(!q)) 504 if (WARN_ON(!q))
@@ -498,8 +509,10 @@ int blk_register_queue(struct gendisk *disk)
498 return ret; 509 return ret;
499 510
500 ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue"); 511 ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue");
501 if (ret < 0) 512 if (ret < 0) {
513 blk_trace_remove_sysfs(dev);
502 return ret; 514 return ret;
515 }
503 516
504 kobject_uevent(&q->kobj, KOBJ_ADD); 517 kobject_uevent(&q->kobj, KOBJ_ADD);
505 518
@@ -510,7 +523,7 @@ int blk_register_queue(struct gendisk *disk)
510 if (ret) { 523 if (ret) {
511 kobject_uevent(&q->kobj, KOBJ_REMOVE); 524 kobject_uevent(&q->kobj, KOBJ_REMOVE);
512 kobject_del(&q->kobj); 525 kobject_del(&q->kobj);
513 blk_trace_remove_sysfs(disk_to_dev(disk)); 526 blk_trace_remove_sysfs(dev);
514 kobject_put(&dev->kobj); 527 kobject_put(&dev->kobj);
515 return ret; 528 return ret;
516 } 529 }
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
new file mode 100644
index 000000000000..3689f833afdc
--- /dev/null
+++ b/block/blk-throttle.c
@@ -0,0 +1,1312 @@
1/*
2 * Interface for controlling IO bandwidth on a request queue
3 *
4 * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
5 */
6
7#include <linux/module.h>
8#include <linux/slab.h>
9#include <linux/blkdev.h>
10#include <linux/bio.h>
11#include <linux/blktrace_api.h>
12#include "blk-cgroup.h"
13
14/* Max dispatch from a group in 1 round */
15static int throtl_grp_quantum = 8;
16
17/* Total max dispatch from all groups in one round */
18static int throtl_quantum = 32;
19
20/* Throttling is performed over 100ms slice and after that slice is renewed */
21static unsigned long throtl_slice = HZ/10; /* 100 ms */
22
23/* A workqueue to queue throttle related work */
24static struct workqueue_struct *kthrotld_workqueue;
25static void throtl_schedule_delayed_work(struct throtl_data *td,
26 unsigned long delay);
27
28struct throtl_rb_root {
29 struct rb_root rb;
30 struct rb_node *left;
31 unsigned int count;
32 unsigned long min_disptime;
33};
34
35#define THROTL_RB_ROOT (struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \
36 .count = 0, .min_disptime = 0}
37
38#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
39
40struct throtl_grp {
41 /* List of throtl groups on the request queue*/
42 struct hlist_node tg_node;
43
44 /* active throtl group service_tree member */
45 struct rb_node rb_node;
46
47 /*
48 * Dispatch time in jiffies. This is the estimated time when the group
49 * will unthrottle and be ready to dispatch more bios. It is used as the
50 * key to sort active groups in the service tree.
51 */
52 unsigned long disptime;
53
54 struct blkio_group blkg;
55 atomic_t ref;
56 unsigned int flags;
57
58 /* Two lists for READ and WRITE */
59 struct bio_list bio_lists[2];
60
61 /* Number of queued bios on READ and WRITE lists */
62 unsigned int nr_queued[2];
63
64 /* bytes per second rate limits */
65 uint64_t bps[2];
66
67 /* IOPS limits */
68 unsigned int iops[2];
69
70 /* Number of bytes dispatched in current slice */
71 uint64_t bytes_disp[2];
72 /* Number of bio's dispatched in current slice */
73 unsigned int io_disp[2];
74
75 /* When did we start a new slice */
76 unsigned long slice_start[2];
77 unsigned long slice_end[2];
78
79 /* Some throttle limits got updated for the group */
80 int limits_changed;
81
82 struct rcu_head rcu_head;
83};
84
85struct throtl_data
86{
87 /* List of throtl groups */
88 struct hlist_head tg_list;
89
90 /* service tree for active throtl groups */
91 struct throtl_rb_root tg_service_tree;
92
93 struct throtl_grp *root_tg;
94 struct request_queue *queue;
95
96 /* Total Number of queued bios on READ and WRITE lists */
97 unsigned int nr_queued[2];
98
99 /*
100 * number of total undestroyed groups
101 */
102 unsigned int nr_undestroyed_grps;
103
104 /* Work for dispatching throttled bios */
105 struct delayed_work throtl_work;
106
107 int limits_changed;
108};
109
110enum tg_state_flags {
111 THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */
112};
113
114#define THROTL_TG_FNS(name) \
115static inline void throtl_mark_tg_##name(struct throtl_grp *tg) \
116{ \
117 (tg)->flags |= (1 << THROTL_TG_FLAG_##name); \
118} \
119static inline void throtl_clear_tg_##name(struct throtl_grp *tg) \
120{ \
121 (tg)->flags &= ~(1 << THROTL_TG_FLAG_##name); \
122} \
123static inline int throtl_tg_##name(const struct throtl_grp *tg) \
124{ \
125 return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0; \
126}
127
128THROTL_TG_FNS(on_rr);
129
130#define throtl_log_tg(td, tg, fmt, args...) \
131 blk_add_trace_msg((td)->queue, "throtl %s " fmt, \
132 blkg_path(&(tg)->blkg), ##args); \
133
134#define throtl_log(td, fmt, args...) \
135 blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
136
137static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg)
138{
139 if (blkg)
140 return container_of(blkg, struct throtl_grp, blkg);
141
142 return NULL;
143}
144
145static inline int total_nr_queued(struct throtl_data *td)
146{
147 return (td->nr_queued[0] + td->nr_queued[1]);
148}
149
150static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg)
151{
152 atomic_inc(&tg->ref);
153 return tg;
154}
155
156static void throtl_free_tg(struct rcu_head *head)
157{
158 struct throtl_grp *tg;
159
160 tg = container_of(head, struct throtl_grp, rcu_head);
161 free_percpu(tg->blkg.stats_cpu);
162 kfree(tg);
163}
164
165static void throtl_put_tg(struct throtl_grp *tg)
166{
167 BUG_ON(atomic_read(&tg->ref) <= 0);
168 if (!atomic_dec_and_test(&tg->ref))
169 return;
170
171 /*
172 * A group is freed in rcu manner. But having an rcu lock does not
173 * mean that one can access all the fields of blkg and assume these
174 * are valid. For example, don't try to follow throtl_data and
175 * request queue links.
176 *
177 * Having a reference to blkg under rcu allows access only to
178 * values local to the group, like group stats and group rate limits.
179 */
180 call_rcu(&tg->rcu_head, throtl_free_tg);
181}
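throtl_free_tg() above is handed only the address of the embedded rcu_head and relies on container_of() to walk back to the enclosing throtl_grp. A minimal userspace illustration of that pointer arithmetic, with made-up struct names and no actual RCU machinery:

#include <stdio.h>
#include <stddef.h>

/* Userspace re-implementation of the kernel's container_of() */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct fake_rcu_head { void *next; };	/* stand-in for struct rcu_head */

struct grp_sketch {
	int id;
	struct fake_rcu_head rcu_head;	/* embedded, like in throtl_grp */
};

/* The callback only receives a pointer to the embedded head ... */
static void free_cb(struct fake_rcu_head *head)
{
	/* ... and container_of() recovers the enclosing group */
	struct grp_sketch *g = container_of(head, struct grp_sketch, rcu_head);

	printf("freeing group %d\n", g->id);
}

int main(void)
{
	struct grp_sketch g = { .id = 7 };

	free_cb(&g.rcu_head);
	return 0;
}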
182
183static void throtl_init_group(struct throtl_grp *tg)
184{
185 INIT_HLIST_NODE(&tg->tg_node);
186 RB_CLEAR_NODE(&tg->rb_node);
187 bio_list_init(&tg->bio_lists[0]);
188 bio_list_init(&tg->bio_lists[1]);
189 tg->limits_changed = false;
190
191 /* Practically unlimited BW */
192 tg->bps[0] = tg->bps[1] = -1;
193 tg->iops[0] = tg->iops[1] = -1;
194
195 /*
196 * Take the initial reference that will be released on destroy.
197 * This can be thought of as a joint reference by the cgroup and
198 * request queue which will be dropped by either request queue
199 * exit or cgroup deletion path depending on who is exiting first.
200 */
201 atomic_set(&tg->ref, 1);
202}
203
204/* Should be called with rcu read lock held (needed for blkcg) */
205static void
206throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg)
207{
208 hlist_add_head(&tg->tg_node, &td->tg_list);
209 td->nr_undestroyed_grps++;
210}
211
212static void
213__throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
214{
215 struct backing_dev_info *bdi = &td->queue->backing_dev_info;
216 unsigned int major, minor;
217
218 if (!tg || tg->blkg.dev)
219 return;
220
221 /*
222 * Fill in device details for a group which might not have been
223 * filled at group creation time as queue was being instantiated
224 * and driver had not attached a device yet
225 */
226 if (bdi->dev && dev_name(bdi->dev)) {
227 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
228 tg->blkg.dev = MKDEV(major, minor);
229 }
230}
231
232/*
233 * Should be called without the queue lock held. Here the queue lock will be
234 * taken rarely: at most once during the lifetime of a group,
235 * if need be.
236 */
237static void
238throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
239{
240 if (!tg || tg->blkg.dev)
241 return;
242
243 spin_lock_irq(td->queue->queue_lock);
244 __throtl_tg_fill_dev_details(td, tg);
245 spin_unlock_irq(td->queue->queue_lock);
246}
247
248static void throtl_init_add_tg_lists(struct throtl_data *td,
249 struct throtl_grp *tg, struct blkio_cgroup *blkcg)
250{
251 __throtl_tg_fill_dev_details(td, tg);
252
253 /* Add group onto cgroup list */
254 blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td,
255 tg->blkg.dev, BLKIO_POLICY_THROTL);
256
257 tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
258 tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev);
259 tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev);
260 tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev);
261
262 throtl_add_group_to_td_list(td, tg);
263}
264
265/* Should be called without queue lock and outside of rcu period */
266static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td)
267{
268 struct throtl_grp *tg = NULL;
269 int ret;
270
271 tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
272 if (!tg)
273 return NULL;
274
275 ret = blkio_alloc_blkg_stats(&tg->blkg);
276
277 if (ret) {
278 kfree(tg);
279 return NULL;
280 }
281
282 throtl_init_group(tg);
283 return tg;
284}
285
286static struct
287throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg)
288{
289 struct throtl_grp *tg = NULL;
290 void *key = td;
291
292 /*
293 * This is the common case when there are no blkio cgroups.
294 * Avoid lookup in this case
295 */
296 if (blkcg == &blkio_root_cgroup)
297 tg = td->root_tg;
298 else
299 tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
300
301 __throtl_tg_fill_dev_details(td, tg);
302 return tg;
303}
304
305/*
306 * This function returns with the queue lock unlocked in case of error,
307 * e.g. when the request queue is gone.
308 */
309static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
310{
311 struct throtl_grp *tg = NULL, *__tg = NULL;
312 struct blkio_cgroup *blkcg;
313 struct request_queue *q = td->queue;
314
315 rcu_read_lock();
316 blkcg = task_blkio_cgroup(current);
317 tg = throtl_find_tg(td, blkcg);
318 if (tg) {
319 rcu_read_unlock();
320 return tg;
321 }
322
323 /*
324 * Need to allocate a group. Allocation of group also needs allocation
325 * of per cpu stats, which in turn takes a mutex() and can block. Hence
326 * we need to drop the rcu lock and queue_lock before we call alloc.
327 *
328 * Take the request queue reference to make sure queue does not
329 * go away once we return from allocation.
330 */
331 blk_get_queue(q);
332 rcu_read_unlock();
333 spin_unlock_irq(q->queue_lock);
334
335 tg = throtl_alloc_tg(td);
336 /*
337 * We might have slept in group allocation. Make sure queue is not
338 * dead
339 */
340 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
341 blk_put_queue(q);
342 if (tg)
343 kfree(tg);
344
345 return ERR_PTR(-ENODEV);
346 }
347 blk_put_queue(q);
348
349 /* Group allocated and queue is still alive. Take the lock */
350 spin_lock_irq(q->queue_lock);
351
352 /*
353 * Initialize the new group. After sleeping, read the blkcg again.
354 */
355 rcu_read_lock();
356 blkcg = task_blkio_cgroup(current);
357
358 /*
359 * If some other thread already allocated the group while we were
360 * not holding queue lock, free up the group
361 */
362 __tg = throtl_find_tg(td, blkcg);
363
364 if (__tg) {
365 kfree(tg);
366 rcu_read_unlock();
367 return __tg;
368 }
369
370 /* Group allocation failed. Account the IO to root group */
371 if (!tg) {
372 tg = td->root_tg;
373 return tg;
374 }
375
376 throtl_init_add_tg_lists(td, tg, blkcg);
377 rcu_read_unlock();
378 return tg;
379}
380
381static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root)
382{
383 /* Service tree is empty */
384 if (!root->count)
385 return NULL;
386
387 if (!root->left)
388 root->left = rb_first(&root->rb);
389
390 if (root->left)
391 return rb_entry_tg(root->left);
392
393 return NULL;
394}
395
396static void rb_erase_init(struct rb_node *n, struct rb_root *root)
397{
398 rb_erase(n, root);
399 RB_CLEAR_NODE(n);
400}
401
402static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root)
403{
404 if (root->left == n)
405 root->left = NULL;
406 rb_erase_init(n, &root->rb);
407 --root->count;
408}
409
410static void update_min_dispatch_time(struct throtl_rb_root *st)
411{
412 struct throtl_grp *tg;
413
414 tg = throtl_rb_first(st);
415 if (!tg)
416 return;
417
418 st->min_disptime = tg->disptime;
419}
420
421static void
422tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
423{
424 struct rb_node **node = &st->rb.rb_node;
425 struct rb_node *parent = NULL;
426 struct throtl_grp *__tg;
427 unsigned long key = tg->disptime;
428 int left = 1;
429
430 while (*node != NULL) {
431 parent = *node;
432 __tg = rb_entry_tg(parent);
433
434 if (time_before(key, __tg->disptime))
435 node = &parent->rb_left;
436 else {
437 node = &parent->rb_right;
438 left = 0;
439 }
440 }
441
442 if (left)
443 st->left = &tg->rb_node;
444
445 rb_link_node(&tg->rb_node, parent, node);
446 rb_insert_color(&tg->rb_node, &st->rb);
447}
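tg_service_tree_add() keeps a cached pointer to the leftmost node: the "left" flag stays set only if the descent never went right, i.e. the new node has the smallest key, so throtl_rb_first() stays O(1). A simplified sketch of that caching on a plain unbalanced binary search tree (no rebalancing, unlike the kernel rbtree):

#include <stdio.h>

struct node {
	unsigned long disptime;
	struct node *left, *right;
};

struct tree {
	struct node *root;
	struct node *leftmost;	/* cached minimum, like throtl_rb_root->left */
};

static void tree_insert(struct tree *t, struct node *n)
{
	struct node **link = &t->root;
	int only_left = 1;

	n->left = n->right = NULL;
	while (*link) {
		if (n->disptime < (*link)->disptime) {
			link = &(*link)->left;
		} else {
			link = &(*link)->right;
			only_left = 0;	/* went right at least once */
		}
	}
	*link = n;
	if (only_left)		/* new node holds the smallest key */
		t->leftmost = n;
}

int main(void)
{
	struct tree t = { NULL, NULL };
	struct node a = { 300 }, b = { 100 }, c = { 200 };	/* invented dispatch times */

	tree_insert(&t, &a);
	tree_insert(&t, &b);
	tree_insert(&t, &c);
	printf("next group to dispatch at t=%lu\n", t.leftmost->disptime);
	return 0;
}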
448
449static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
450{
451 struct throtl_rb_root *st = &td->tg_service_tree;
452
453 tg_service_tree_add(st, tg);
454 throtl_mark_tg_on_rr(tg);
455 st->count++;
456}
457
458static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
459{
460 if (!throtl_tg_on_rr(tg))
461 __throtl_enqueue_tg(td, tg);
462}
463
464static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
465{
466 throtl_rb_erase(&tg->rb_node, &td->tg_service_tree);
467 throtl_clear_tg_on_rr(tg);
468}
469
470static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
471{
472 if (throtl_tg_on_rr(tg))
473 __throtl_dequeue_tg(td, tg);
474}
475
476static void throtl_schedule_next_dispatch(struct throtl_data *td)
477{
478 struct throtl_rb_root *st = &td->tg_service_tree;
479
480 /*
481 * If there are more bios pending, schedule more work.
482 */
483 if (!total_nr_queued(td))
484 return;
485
486 BUG_ON(!st->count);
487
488 update_min_dispatch_time(st);
489
490 if (time_before_eq(st->min_disptime, jiffies))
491 throtl_schedule_delayed_work(td, 0);
492 else
493 throtl_schedule_delayed_work(td, (st->min_disptime - jiffies));
494}
495
496static inline void
497throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
498{
499 tg->bytes_disp[rw] = 0;
500 tg->io_disp[rw] = 0;
501 tg->slice_start[rw] = jiffies;
502 tg->slice_end[rw] = jiffies + throtl_slice;
503 throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
504 rw == READ ? 'R' : 'W', tg->slice_start[rw],
505 tg->slice_end[rw], jiffies);
506}
507
508static inline void throtl_set_slice_end(struct throtl_data *td,
509 struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
510{
511 tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
512}
513
514static inline void throtl_extend_slice(struct throtl_data *td,
515 struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
516{
517 tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
518 throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
519 rw == READ ? 'R' : 'W', tg->slice_start[rw],
520 tg->slice_end[rw], jiffies);
521}
522
523/* Determine if previously allocated or extended slice is complete or not */
524static bool
525throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
526{
527 if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
528 return 0;
529
530 return 1;
531}
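throtl_slice_used() relies on time_in_range(), which, like the time_before()/time_after() helpers used throughout this file, compares jiffies with a signed-difference trick so the result stays correct across the jiffies counter wrapping. A small standalone sketch of that trick (not the kernel headers themselves, values invented):

#include <stdio.h>
#include <limits.h>

/*
 * Userspace copies of the kernel's wraparound-safe jiffies comparisons
 * (same signed-difference idea as include/linux/jiffies.h).
 */
#define time_after(a, b)     ((long)((b) - (a)) < 0)
#define time_before(a, b)    time_after(b, a)
#define time_after_eq(a, b)  ((long)((a) - (b)) >= 0)
#define time_before_eq(a, b) time_after_eq(b, a)
#define time_in_range(a, b, c) (time_after_eq(a, b) && time_before_eq(a, c))

int main(void)
{
	/* A 100ms slice that straddles the jiffies wraparound point */
	unsigned long start = ULONG_MAX - 5;
	unsigned long end   = start + 20;	/* wraps to 14 */
	unsigned long now   = start + 10;	/* wraps to 4  */

	printf("in range (wrap-safe): %d\n", time_in_range(now, start, end));
	printf("in range (naive)    : %d\n", start <= now && now <= end);
	return 0;
}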
532
533/* Trim the used slices and adjust slice start accordingly */
534static inline void
535throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
536{
537 unsigned long nr_slices, time_elapsed, io_trim;
538 u64 bytes_trim, tmp;
539
540 BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));
541
542 /*
543 * If bps is unlimited (-1), then the time slice doesn't get
544 * renewed. Don't try to trim the slice if the slice has been fully
545 * used. A new slice will start when appropriate.
546 */
547 if (throtl_slice_used(td, tg, rw))
548 return;
549
550 /*
551 * A bio has been dispatched, so also adjust slice_end. It might happen
552 * that the cgroup limit was initially very low, resulting in a high
553 * slice_end, but the limit was later bumped up and the bio was dispatched
554 * sooner; then we need to reduce slice_end. A bogus high slice_end
555 * is bad because it does not allow a new slice to start.
556 */
557
558 throtl_set_slice_end(td, tg, rw, jiffies + throtl_slice);
559
560 time_elapsed = jiffies - tg->slice_start[rw];
561
562 nr_slices = time_elapsed / throtl_slice;
563
564 if (!nr_slices)
565 return;
566 tmp = tg->bps[rw] * throtl_slice * nr_slices;
567 do_div(tmp, HZ);
568 bytes_trim = tmp;
569
570 io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;
571
572 if (!bytes_trim && !io_trim)
573 return;
574
575 if (tg->bytes_disp[rw] >= bytes_trim)
576 tg->bytes_disp[rw] -= bytes_trim;
577 else
578 tg->bytes_disp[rw] = 0;
579
580 if (tg->io_disp[rw] >= io_trim)
581 tg->io_disp[rw] -= io_trim;
582 else
583 tg->io_disp[rw] = 0;
584
585 tg->slice_start[rw] += nr_slices * throtl_slice;
586
587 throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu"
588 " start=%lu end=%lu jiffies=%lu",
589 rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
590 tg->slice_start[rw], tg->slice_end[rw], jiffies);
591}
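To make the trim arithmetic concrete, here is a tiny standalone program that replays the same computation with assumed numbers (HZ, the rate limits and the dispatched counters are invented for illustration):

#include <stdio.h>
#include <stdint.h>

/* Assumed sample values, not taken from a real system */
#define HZ            1000UL
#define THROTL_SLICE  (HZ / 10)		/* 100 ms, as in blk-throttle.c */

int main(void)
{
	uint64_t bps = 1048576;			/* 1 MiB/s limit */
	unsigned int iops = 100;
	unsigned long time_elapsed = 250;	/* jiffies since slice_start */
	uint64_t bytes_disp = 300000;		/* bytes charged so far */
	unsigned int io_disp = 35;

	unsigned long nr_slices = time_elapsed / THROTL_SLICE;	/* 2 */
	uint64_t bytes_trim = bps * THROTL_SLICE * nr_slices / HZ;
	unsigned long io_trim =
		(unsigned long)((uint64_t)iops * THROTL_SLICE * nr_slices / HZ);

	printf("nr_slices=%lu bytes_trim=%llu io_trim=%lu\n",
	       nr_slices, (unsigned long long)bytes_trim, io_trim);

	/* The group keeps only the unconsumed budget of the expired slices */
	bytes_disp = bytes_disp >= bytes_trim ? bytes_disp - bytes_trim : 0;
	io_disp = io_disp >= io_trim ? io_disp - io_trim : 0;
	printf("bytes_disp=%llu io_disp=%u slice_start += %lu jiffies\n",
	       (unsigned long long)bytes_disp, io_disp,
	       nr_slices * THROTL_SLICE);
	return 0;
}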
592
593static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
594 struct bio *bio, unsigned long *wait)
595{
596 bool rw = bio_data_dir(bio);
597 unsigned int io_allowed;
598 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
599 u64 tmp;
600
601 jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
602
603 /* Slice has just started. Consider one slice interval */
604 if (!jiffy_elapsed)
605 jiffy_elapsed_rnd = throtl_slice;
606
607 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
608
609 /*
610 * jiffy_elapsed_rnd should not be a big value: the minimum iops is 1,
611 * so at most the elapsed jiffies should be equivalent to 1 second, as we
612 * will allow a dispatch after 1 second and after that the slice should
613 * have been trimmed.
614 */
615
616 tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd;
617 do_div(tmp, HZ);
618
619 if (tmp > UINT_MAX)
620 io_allowed = UINT_MAX;
621 else
622 io_allowed = tmp;
623
624 if (tg->io_disp[rw] + 1 <= io_allowed) {
625 if (wait)
626 *wait = 0;
627 return 1;
628 }
629
630 /* Calc approx time to dispatch */
631 jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1;
632
633 if (jiffy_wait > jiffy_elapsed)
634 jiffy_wait = jiffy_wait - jiffy_elapsed;
635 else
636 jiffy_wait = 1;
637
638 if (wait)
639 *wait = jiffy_wait;
640 return 0;
641}
642
643static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
644 struct bio *bio, unsigned long *wait)
645{
646 bool rw = bio_data_dir(bio);
647 u64 bytes_allowed, extra_bytes, tmp;
648 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
649
650 jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
651
652 /* Slice has just started. Consider one slice interval */
653 if (!jiffy_elapsed)
654 jiffy_elapsed_rnd = throtl_slice;
655
656 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
657
658 tmp = tg->bps[rw] * jiffy_elapsed_rnd;
659 do_div(tmp, HZ);
660 bytes_allowed = tmp;
661
662 if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) {
663 if (wait)
664 *wait = 0;
665 return 1;
666 }
667
668 /* Calc approx time to dispatch */
669 extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed;
670 jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);
671
672 if (!jiffy_wait)
673 jiffy_wait = 1;
674
675 /*
676 * This wait time is without taking into consideration the rounding
677 * up we did. Add that time also.
678 */
679 jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
680 if (wait)
681 *wait = jiffy_wait;
682 return 0;
683}
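The two checks above follow the same pattern: compute the budget allowed for the rounded-up elapsed time and, if the bio exceeds it, derive a wait time from the excess. A standalone walk-through of the bps variant with assumed numbers (roundup_u64() is a local stand-in for the kernel's roundup() macro):

#include <stdio.h>
#include <stdint.h>

#define HZ           1000ULL
#define THROTL_SLICE 100ULL			/* HZ / 10 */

/* Round x up to a multiple of y, like the kernel's roundup() */
static uint64_t roundup_u64(uint64_t x, uint64_t y)
{
	return ((x + y - 1) / y) * y;
}

int main(void)
{
	/* Assumed inputs, chosen only to walk through the math */
	uint64_t bps = 1048576;			/* 1 MiB/s limit */
	uint64_t bytes_disp = 100000;		/* already charged this slice */
	uint64_t bio_size = 16384;		/* size of the next bio */
	uint64_t jiffy_elapsed = 30;		/* since slice_start */

	uint64_t jiffy_elapsed_rnd =
		roundup_u64(jiffy_elapsed ? jiffy_elapsed : THROTL_SLICE,
			    THROTL_SLICE);
	uint64_t bytes_allowed = bps * jiffy_elapsed_rnd / HZ;

	if (bytes_disp + bio_size <= bytes_allowed) {
		printf("dispatch now\n");
		return 0;
	}

	uint64_t extra_bytes = bytes_disp + bio_size - bytes_allowed;
	uint64_t jiffy_wait = extra_bytes * HZ / bps;

	if (!jiffy_wait)
		jiffy_wait = 1;
	/* add back the part of the slice we rounded up over */
	jiffy_wait += jiffy_elapsed_rnd - jiffy_elapsed;

	printf("bytes_allowed=%llu extra=%llu wait=%llu jiffies\n",
	       (unsigned long long)bytes_allowed,
	       (unsigned long long)extra_bytes,
	       (unsigned long long)jiffy_wait);
	return 0;
}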
684
685static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) {
686 if (tg->bps[rw] == -1 && tg->iops[rw] == -1)
687 return 1;
688 return 0;
689}
690
691/*
692 * Returns whether one can dispatch a bio or not. Also returns the approx number
693 * of jiffies to wait before this bio is within the IO rate and can be dispatched.
694 */
695static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
696 struct bio *bio, unsigned long *wait)
697{
698 bool rw = bio_data_dir(bio);
699 unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
700
701 /*
702 * Currently the whole state machine of the group depends on the first
703 * bio queued in the group's bio list. So one should not be calling
704 * this function with a different bio if there are other bios
705 * queued.
706 */
707 BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw]));
708
709 /* If tg->bps = -1, then BW is unlimited */
710 if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
711 if (wait)
712 *wait = 0;
713 return 1;
714 }
715
716 /*
717 * If previous slice expired, start a new one otherwise renew/extend
718 * existing slice to make sure it is at least throtl_slice interval
719 * long since now.
720 */
721 if (throtl_slice_used(td, tg, rw))
722 throtl_start_new_slice(td, tg, rw);
723 else {
724 if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
725 throtl_extend_slice(td, tg, rw, jiffies + throtl_slice);
726 }
727
728 if (tg_with_in_bps_limit(td, tg, bio, &bps_wait)
729 && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) {
730 if (wait)
731 *wait = 0;
732 return 1;
733 }
734
735 max_wait = max(bps_wait, iops_wait);
736
737 if (wait)
738 *wait = max_wait;
739
740 if (time_before(tg->slice_end[rw], jiffies + max_wait))
741 throtl_extend_slice(td, tg, rw, jiffies + max_wait);
742
743 return 0;
744}
745
746static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
747{
748 bool rw = bio_data_dir(bio);
749 bool sync = bio->bi_rw & REQ_SYNC;
750
751 /* Charge the bio to the group */
752 tg->bytes_disp[rw] += bio->bi_size;
753 tg->io_disp[rw]++;
754
755 blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync);
756}
757
758static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
759 struct bio *bio)
760{
761 bool rw = bio_data_dir(bio);
762
763 bio_list_add(&tg->bio_lists[rw], bio);
764 /* Take a bio reference on tg */
765 throtl_ref_get_tg(tg);
766 tg->nr_queued[rw]++;
767 td->nr_queued[rw]++;
768 throtl_enqueue_tg(td, tg);
769}
770
771static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg)
772{
773 unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
774 struct bio *bio;
775
776 if ((bio = bio_list_peek(&tg->bio_lists[READ])))
777 tg_may_dispatch(td, tg, bio, &read_wait);
778
779 if ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
780 tg_may_dispatch(td, tg, bio, &write_wait);
781
782 min_wait = min(read_wait, write_wait);
783 disptime = jiffies + min_wait;
784
785 /* Update dispatch time */
786 throtl_dequeue_tg(td, tg);
787 tg->disptime = disptime;
788 throtl_enqueue_tg(td, tg);
789}
790
791static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
792 bool rw, struct bio_list *bl)
793{
794 struct bio *bio;
795
796 bio = bio_list_pop(&tg->bio_lists[rw]);
797 tg->nr_queued[rw]--;
798 /* Drop bio reference on tg */
799 throtl_put_tg(tg);
800
801 BUG_ON(td->nr_queued[rw] <= 0);
802 td->nr_queued[rw]--;
803
804 throtl_charge_bio(tg, bio);
805 bio_list_add(bl, bio);
806 bio->bi_rw |= REQ_THROTTLED;
807
808 throtl_trim_slice(td, tg, rw);
809}
810
811static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
812 struct bio_list *bl)
813{
814 unsigned int nr_reads = 0, nr_writes = 0;
815 unsigned int max_nr_reads = throtl_grp_quantum*3/4;
816 unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
817 struct bio *bio;
818
819 /* Try to dispatch 75% READS and 25% WRITES */
820
821 while ((bio = bio_list_peek(&tg->bio_lists[READ]))
822 && tg_may_dispatch(td, tg, bio, NULL)) {
823
824 tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
825 nr_reads++;
826
827 if (nr_reads >= max_nr_reads)
828 break;
829 }
830
831 while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))
832 && tg_may_dispatch(td, tg, bio, NULL)) {
833
834 tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
835 nr_writes++;
836
837 if (nr_writes >= max_nr_writes)
838 break;
839 }
840
841 return nr_reads + nr_writes;
842}
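A rough standalone simulation of the 75%/25% read/write quota applied per dispatch round; it ignores the per-bio rate check that tg_may_dispatch() performs and just drains two invented queues:

#include <stdio.h>

int main(void)
{
	/* throtl_grp_quantum = 8, split roughly 75% reads / 25% writes */
	int quantum = 8;
	int max_reads = quantum * 3 / 4;	/* 6 */
	int max_writes = quantum - max_reads;	/* 2 */

	int queued_reads = 10, queued_writes = 10;	/* invented queue depths */
	int round;

	for (round = 1; queued_reads || queued_writes; round++) {
		int r = queued_reads < max_reads ? queued_reads : max_reads;
		int w = queued_writes < max_writes ? queued_writes : max_writes;

		queued_reads -= r;
		queued_writes -= w;
		printf("round %d: dispatched %dR/%dW, left %dR/%dW\n",
		       round, r, w, queued_reads, queued_writes);
	}
	return 0;
}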
843
844static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
845{
846 unsigned int nr_disp = 0;
847 struct throtl_grp *tg;
848 struct throtl_rb_root *st = &td->tg_service_tree;
849
850 while (1) {
851 tg = throtl_rb_first(st);
852
853 if (!tg)
854 break;
855
856 if (time_before(jiffies, tg->disptime))
857 break;
858
859 throtl_dequeue_tg(td, tg);
860
861 nr_disp += throtl_dispatch_tg(td, tg, bl);
862
863 if (tg->nr_queued[0] || tg->nr_queued[1]) {
864 tg_update_disptime(td, tg);
865 throtl_enqueue_tg(td, tg);
866 }
867
868 if (nr_disp >= throtl_quantum)
869 break;
870 }
871
872 return nr_disp;
873}
874
875static void throtl_process_limit_change(struct throtl_data *td)
876{
877 struct throtl_grp *tg;
878 struct hlist_node *pos, *n;
879
880 if (!td->limits_changed)
881 return;
882
883 xchg(&td->limits_changed, false);
884
885 throtl_log(td, "limits changed");
886
887 hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
888 if (!tg->limits_changed)
889 continue;
890
891 if (!xchg(&tg->limits_changed, false))
892 continue;
893
894 throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
895 " riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE],
896 tg->iops[READ], tg->iops[WRITE]);
897
898 /*
899 * Restart the slices for both READ and WRITE. It
900 * might happen that a group's limits are dropped
901 * suddenly and we don't want to account recently
902 * dispatched IO at the new low rate.
903 */
904 throtl_start_new_slice(td, tg, 0);
905 throtl_start_new_slice(td, tg, 1);
906
907 if (throtl_tg_on_rr(tg))
908 tg_update_disptime(td, tg);
909 }
910}
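The limits_changed handling relies on xchg() as an atomic test-and-clear, so an update posted by the cgroup side (see throtl_update_blkio_group_common() further down) is consumed exactly once by the dispatcher. A userspace analogue using C11 atomics, as a sketch rather than the kernel primitives:

#include <stdio.h>
#include <stdbool.h>
#include <stdatomic.h>

/*
 * The updater sets the flag, the dispatcher atomically tests-and-clears it,
 * so an update is never lost and never processed twice.
 */
static atomic_bool limits_changed = false;

static void updater(void)		/* e.g. user wrote a new bps limit */
{
	atomic_exchange(&limits_changed, true);
}

static void dispatcher(void)
{
	if (!atomic_exchange(&limits_changed, false))
		return;			/* nothing changed since last run */
	printf("reapplying limits\n");
}

int main(void)
{
	dispatcher();	/* no output: flag not set yet */
	updater();
	dispatcher();	/* prints once */
	dispatcher();	/* no output: flag already consumed */
	return 0;
}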
911
912/* Dispatch throttled bios. Should be called without queue lock held. */
913static int throtl_dispatch(struct request_queue *q)
914{
915 struct throtl_data *td = q->td;
916 unsigned int nr_disp = 0;
917 struct bio_list bio_list_on_stack;
918 struct bio *bio;
919 struct blk_plug plug;
920
921 spin_lock_irq(q->queue_lock);
922
923 throtl_process_limit_change(td);
924
925 if (!total_nr_queued(td))
926 goto out;
927
928 bio_list_init(&bio_list_on_stack);
929
930 throtl_log(td, "dispatch nr_queued=%d read=%u write=%u",
931 total_nr_queued(td), td->nr_queued[READ],
932 td->nr_queued[WRITE]);
933
934 nr_disp = throtl_select_dispatch(td, &bio_list_on_stack);
935
936 if (nr_disp)
937 throtl_log(td, "bios disp=%u", nr_disp);
938
939 throtl_schedule_next_dispatch(td);
940out:
941 spin_unlock_irq(q->queue_lock);
942
943 /*
944 * If we dispatched some requests, unplug the queue to make sure
945 * they are dispatched immediately.
946 */
947 if (nr_disp) {
948 blk_start_plug(&plug);
949 while((bio = bio_list_pop(&bio_list_on_stack)))
950 generic_make_request(bio);
951 blk_finish_plug(&plug);
952 }
953 return nr_disp;
954}
955
956void blk_throtl_work(struct work_struct *work)
957{
958 struct throtl_data *td = container_of(work, struct throtl_data,
959 throtl_work.work);
960 struct request_queue *q = td->queue;
961
962 throtl_dispatch(q);
963}
964
965/* Call with queue lock held */
966static void
967throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
968{
969
970 struct delayed_work *dwork = &td->throtl_work;
971
972 /* schedule work if limits changed even if no bio is queued */
973 if (total_nr_queued(td) > 0 || td->limits_changed) {
974 /*
975 * We might have a work scheduled to be executed in future.
976 * Cancel that and schedule a new one.
977 */
978 __cancel_delayed_work(dwork);
979 queue_delayed_work(kthrotld_workqueue, dwork, delay);
980 throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
981 delay, jiffies);
982 }
983}
984
985static void
986throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg)
987{
988 /* Something wrong if we are trying to remove same group twice */
989 BUG_ON(hlist_unhashed(&tg->tg_node));
990
991 hlist_del_init(&tg->tg_node);
992
993 /*
994 * Put the reference taken at the time of creation so that when all
995 * queues are gone, group can be destroyed.
996 */
997 throtl_put_tg(tg);
998 td->nr_undestroyed_grps--;
999}
1000
1001static void throtl_release_tgs(struct throtl_data *td)
1002{
1003 struct hlist_node *pos, *n;
1004 struct throtl_grp *tg;
1005
1006 hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
1007 /*
1008 * If the cgroup removal path got to the blkio_group first and removed
1009 * it from the cgroup list, then it will take care of destroying
1010 * the group as well.
1011 */
1012 if (!blkiocg_del_blkio_group(&tg->blkg))
1013 throtl_destroy_tg(td, tg);
1014 }
1015}
1016
1017static void throtl_td_free(struct throtl_data *td)
1018{
1019 kfree(td);
1020}
1021
1022/*
1023 * Blk cgroup controller notification saying that the blkio_group object is
1024 * being delinked as the associated cgroup object is going away. That also
1025 * means that no new IO will come into this group. So get rid of this group
1026 * as soon as any pending IO in the group is finished.
1027 *
1028 * This function is called under rcu_read_lock(). key is the rcu protected
1029 * pointer. That means "key" is a valid throtl_data pointer as long as we
1030 * are under the rcu read lock.
1031 *
1032 * "key" was fetched from the blkio_group under blkio_cgroup->lock. That
1033 * means it should not be NULL, as even if the queue was going away, the
1034 * cgroup deletion path got to it first.
1035 */
1036void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg)
1037{
1038 unsigned long flags;
1039 struct throtl_data *td = key;
1040
1041 spin_lock_irqsave(td->queue->queue_lock, flags);
1042 throtl_destroy_tg(td, tg_of_blkg(blkg));
1043 spin_unlock_irqrestore(td->queue->queue_lock, flags);
1044}
1045
1046static void throtl_update_blkio_group_common(struct throtl_data *td,
1047 struct throtl_grp *tg)
1048{
1049 xchg(&tg->limits_changed, true);
1050 xchg(&td->limits_changed, true);
1051 /* Schedule a work now to process the limit change */
1052 throtl_schedule_delayed_work(td, 0);
1053}
1054
1055/*
1056 * For all update functions, key should be a valid pointer because these
1057 * update functions are called under blkcg_lock, that means, blkg is
1058 * valid and in turn key is valid. queue exit path can not race because
1059 * of blkcg_lock
1060 *
1061 * Can not take queue lock in update functions as queue lock under blkcg_lock
1062 * is not allowed. Under other paths we take blkcg_lock under queue_lock.
1063 */
1064static void throtl_update_blkio_group_read_bps(void *key,
1065 struct blkio_group *blkg, u64 read_bps)
1066{
1067 struct throtl_data *td = key;
1068 struct throtl_grp *tg = tg_of_blkg(blkg);
1069
1070 tg->bps[READ] = read_bps;
1071 throtl_update_blkio_group_common(td, tg);
1072}
1073
1074static void throtl_update_blkio_group_write_bps(void *key,
1075 struct blkio_group *blkg, u64 write_bps)
1076{
1077 struct throtl_data *td = key;
1078 struct throtl_grp *tg = tg_of_blkg(blkg);
1079
1080 tg->bps[WRITE] = write_bps;
1081 throtl_update_blkio_group_common(td, tg);
1082}
1083
1084static void throtl_update_blkio_group_read_iops(void *key,
1085 struct blkio_group *blkg, unsigned int read_iops)
1086{
1087 struct throtl_data *td = key;
1088 struct throtl_grp *tg = tg_of_blkg(blkg);
1089
1090 tg->iops[READ] = read_iops;
1091 throtl_update_blkio_group_common(td, tg);
1092}
1093
1094static void throtl_update_blkio_group_write_iops(void *key,
1095 struct blkio_group *blkg, unsigned int write_iops)
1096{
1097 struct throtl_data *td = key;
1098 struct throtl_grp *tg = tg_of_blkg(blkg);
1099
1100 tg->iops[WRITE] = write_iops;
1101 throtl_update_blkio_group_common(td, tg);
1102}
1103
1104static void throtl_shutdown_wq(struct request_queue *q)
1105{
1106 struct throtl_data *td = q->td;
1107
1108 cancel_delayed_work_sync(&td->throtl_work);
1109}
1110
1111static struct blkio_policy_type blkio_policy_throtl = {
1112 .ops = {
1113 .blkio_unlink_group_fn = throtl_unlink_blkio_group,
1114 .blkio_update_group_read_bps_fn =
1115 throtl_update_blkio_group_read_bps,
1116 .blkio_update_group_write_bps_fn =
1117 throtl_update_blkio_group_write_bps,
1118 .blkio_update_group_read_iops_fn =
1119 throtl_update_blkio_group_read_iops,
1120 .blkio_update_group_write_iops_fn =
1121 throtl_update_blkio_group_write_iops,
1122 },
1123 .plid = BLKIO_POLICY_THROTL,
1124};
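The struct above is essentially an ops table: blk-cgroup calls back into the throttling code through these function pointers whenever a per-device limit is updated. A stripped-down userspace analogue of that wiring, with invented names and purely for illustration:

#include <stdio.h>
#include <stdint.h>

struct demo_policy_ops {
	void (*update_read_bps)(void *key, uint64_t bps);
};

struct demo_policy {
	struct demo_policy_ops ops;
	int plid;
};

static struct demo_policy *registered;

static void demo_policy_register(struct demo_policy *pol)
{
	registered = pol;
}

/* "cgroup side": user wrote a new limit, notify whoever registered */
static void demo_set_read_bps(void *key, uint64_t bps)
{
	if (registered && registered->ops.update_read_bps)
		registered->ops.update_read_bps(key, bps);
}

/* "throttle side": the callback that would adjust tg->bps[READ] */
static void demo_update_read_bps(void *key, uint64_t bps)
{
	printf("queue %p: new read limit %llu bytes/s\n",
	       key, (unsigned long long)bps);
}

static struct demo_policy demo_throtl_policy = {
	.ops = { .update_read_bps = demo_update_read_bps },
	.plid = 1,
};

int main(void)
{
	int fake_queue;

	demo_policy_register(&demo_throtl_policy);
	demo_set_read_bps(&fake_queue, 1048576);
	return 0;
}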
1125
1126int blk_throtl_bio(struct request_queue *q, struct bio **biop)
1127{
1128 struct throtl_data *td = q->td;
1129 struct throtl_grp *tg;
1130 struct bio *bio = *biop;
1131 bool rw = bio_data_dir(bio), update_disptime = true;
1132 struct blkio_cgroup *blkcg;
1133
1134 if (bio->bi_rw & REQ_THROTTLED) {
1135 bio->bi_rw &= ~REQ_THROTTLED;
1136 return 0;
1137 }
1138
1139 /*
1140 * A throtl_grp pointer retrieved under rcu can be used to access
1141 * basic fields like stats and io rates. If a group has no rules,
1142 * just update the dispatch stats in a lockless manner and return.
1143 */
1144
1145 rcu_read_lock();
1146 blkcg = task_blkio_cgroup(current);
1147 tg = throtl_find_tg(td, blkcg);
1148 if (tg) {
1149 throtl_tg_fill_dev_details(td, tg);
1150
1151 if (tg_no_rule_group(tg, rw)) {
1152 blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size,
1153 rw, bio->bi_rw & REQ_SYNC);
1154 rcu_read_unlock();
1155 return 0;
1156 }
1157 }
1158 rcu_read_unlock();
1159
1160 /*
1161 * Either group has not been allocated yet or it is not an unlimited
1162 * IO group
1163 */
1164
1165 spin_lock_irq(q->queue_lock);
1166 tg = throtl_get_tg(td);
1167
1168 if (IS_ERR(tg)) {
1169 if (PTR_ERR(tg) == -ENODEV) {
1170 /*
1171 * Queue is gone. No queue lock held here.
1172 */
1173 return -ENODEV;
1174 }
1175 }
1176
1177 if (tg->nr_queued[rw]) {
1178 /*
1179 * There is already another bio queued in same dir. No
1180 * need to update dispatch time.
1181 */
1182 update_disptime = false;
1183 goto queue_bio;
1184
1185 }
1186
1187 /* Bio is within the rate limit of the group */
1188 if (tg_may_dispatch(td, tg, bio, NULL)) {
1189 throtl_charge_bio(tg, bio);
1190
1191 /*
1192 * We need to trim the slice even when bios are not being queued,
1193 * otherwise it might happen that a bio is not queued for
1194 * a long time, the slice keeps on extending and trim is not
1195 * called for a long time. Now if limits are reduced suddenly
1196 * we take into account all the IO dispatched so far at the new
1197 * low rate and newly queued IO gets a really long dispatch
1198 * time.
1199 *
1200 * So keep on trimming slice even if bio is not queued.
1201 */
1202 throtl_trim_slice(td, tg, rw);
1203 goto out;
1204 }
1205
1206queue_bio:
1207 throtl_log_tg(td, tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu"
1208 " iodisp=%u iops=%u queued=%d/%d",
1209 rw == READ ? 'R' : 'W',
1210 tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
1211 tg->io_disp[rw], tg->iops[rw],
1212 tg->nr_queued[READ], tg->nr_queued[WRITE]);
1213
1214 throtl_add_bio_tg(q->td, tg, bio);
1215 *biop = NULL;
1216
1217 if (update_disptime) {
1218 tg_update_disptime(td, tg);
1219 throtl_schedule_next_dispatch(td);
1220 }
1221
1222out:
1223 spin_unlock_irq(q->queue_lock);
1224 return 0;
1225}
1226
1227int blk_throtl_init(struct request_queue *q)
1228{
1229 struct throtl_data *td;
1230 struct throtl_grp *tg;
1231
1232 td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
1233 if (!td)
1234 return -ENOMEM;
1235
1236 INIT_HLIST_HEAD(&td->tg_list);
1237 td->tg_service_tree = THROTL_RB_ROOT;
1238 td->limits_changed = false;
1239 INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
1240
1241 /* Alloc and init the root group. */
1242 td->queue = q;
1243 tg = throtl_alloc_tg(td);
1244
1245 if (!tg) {
1246 kfree(td);
1247 return -ENOMEM;
1248 }
1249
1250 td->root_tg = tg;
1251
1252 rcu_read_lock();
1253 throtl_init_add_tg_lists(td, tg, &blkio_root_cgroup);
1254 rcu_read_unlock();
1255
1256 /* Attach throtl data to request queue */
1257 q->td = td;
1258 return 0;
1259}
1260
1261void blk_throtl_exit(struct request_queue *q)
1262{
1263 struct throtl_data *td = q->td;
1264 bool wait = false;
1265
1266 BUG_ON(!td);
1267
1268 throtl_shutdown_wq(q);
1269
1270 spin_lock_irq(q->queue_lock);
1271 throtl_release_tgs(td);
1272
1273 /* If there are other groups */
1274 if (td->nr_undestroyed_grps > 0)
1275 wait = true;
1276
1277 spin_unlock_irq(q->queue_lock);
1278
1279 /*
1280 * Wait for tg->blkg->key accessors to exit their grace periods.
1281 * Do this wait only if there are other undestroyed groups out
1282 * there (other than the root group). This can happen if the cgroup
1283 * deletion path claimed the responsibility of cleaning up a group before
1284 * the queue cleanup code gets to the group.
1285 *
1286 * Do not call synchronize_rcu() unconditionally as there are drivers
1287 * which create/delete request queues hundreds of times during scan/boot
1288 * and synchronize_rcu() can take significant time and slow down boot.
1289 */
1290 if (wait)
1291 synchronize_rcu();
1292
1293 /*
1294 * Just being safe: if somebody updated the limits through the cgroup
1295 * after the previous flush and another work item got queued, cancel
1296 * it.
1297 */
1298 throtl_shutdown_wq(q);
1299 throtl_td_free(td);
1300}
1301
1302static int __init throtl_init(void)
1303{
1304 kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
1305 if (!kthrotld_workqueue)
1306 panic("Failed to create kthrotld\n");
1307
1308 blkio_policy_register(&blkio_policy_throtl);
1309 return 0;
1310}
1311
1312module_init(throtl_init);
diff --git a/block/blk.h b/block/blk.h
index d6b911ac002c..d6586287adc9 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -18,8 +18,6 @@ int blk_rq_append_bio(struct request_queue *q, struct request *rq,
18void blk_dequeue_request(struct request *rq); 18void blk_dequeue_request(struct request *rq);
19void __blk_queue_free_tags(struct request_queue *q); 19void __blk_queue_free_tags(struct request_queue *q);
20 20
21void blk_unplug_work(struct work_struct *work);
22void blk_unplug_timeout(unsigned long data);
23void blk_rq_timed_out_timer(unsigned long data); 21void blk_rq_timed_out_timer(unsigned long data);
24void blk_delete_timer(struct request *); 22void blk_delete_timer(struct request *);
25void blk_add_timer(struct request *); 23void blk_add_timer(struct request *);
@@ -34,7 +32,7 @@ enum rq_atomic_flags {
34 32
35/* 33/*
36 * EH timer and IO completion will both attempt to 'grab' the request, make 34 * EH timer and IO completion will both attempt to 'grab' the request, make
37 * sure that only one of them suceeds 35 * sure that only one of them succeeds
38 */ 36 */
39static inline int blk_mark_rq_complete(struct request *rq) 37static inline int blk_mark_rq_complete(struct request *rq)
40{ 38{
@@ -51,18 +49,41 @@ static inline void blk_clear_rq_complete(struct request *rq)
51 */ 49 */
52#define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) 50#define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash))
53 51
52void blk_insert_flush(struct request *rq);
53void blk_abort_flushes(struct request_queue *q);
54
54static inline struct request *__elv_next_request(struct request_queue *q) 55static inline struct request *__elv_next_request(struct request_queue *q)
55{ 56{
56 struct request *rq; 57 struct request *rq;
57 58
58 while (1) { 59 while (1) {
59 while (!list_empty(&q->queue_head)) { 60 if (!list_empty(&q->queue_head)) {
60 rq = list_entry_rq(q->queue_head.next); 61 rq = list_entry_rq(q->queue_head.next);
61 if (blk_do_ordered(q, &rq)) 62 return rq;
62 return rq;
63 } 63 }
64 64
65 if (!q->elevator->ops->elevator_dispatch_fn(q, 0)) 65 /*
66 * A flush request is running and flush requests aren't queueable
67 * in the drive, so we can hold the queue till the flush request is
68 * finished. Even if we don't do this, the driver can't dispatch the
69 * next requests and will requeue them. And this can improve
70 * throughput too. For example, we have requests flush1, write1,
71 * flush2. flush1 is dispatched, then the queue is held and write1
72 * isn't inserted into the queue. After flush1 is finished, flush2
73 * will be dispatched. Since the disk cache is already clean,
74 * flush2 will be finished very soon, so it looks like flush2 is
75 * folded into flush1.
76 * Since the queue is held, a flag is set to indicate that the queue
77 * should be restarted later. Please see flush_end_io() for
78 * details.
79 */
80 if (q->flush_pending_idx != q->flush_running_idx &&
81 !queue_flush_queueable(q)) {
82 q->flush_queue_delayed = 1;
83 return NULL;
84 }
85 if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags) ||
86 !q->elevator->ops->elevator_dispatch_fn(q, 0))
66 return NULL; 87 return NULL;
67 } 88 }
68} 89}
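Condensed, the new hold-the-queue decision in the loop above boils down to a predicate over two conditions; a standalone sketch:

#include <stdio.h>
#include <stdbool.h>

/* Condensed form of the "hold the queue during a flush" decision above */
static bool hold_queue_for_flush(unsigned pending_idx, unsigned running_idx,
				 bool flush_queueable)
{
	return pending_idx != running_idx && !flush_queueable;
}

int main(void)
{
	/* A flush is in flight (indices differ) and the drive can't queue it */
	printf("hold=%d\n", hold_queue_for_flush(1, 0, false));
	/* No flush in flight: dispatch normally */
	printf("hold=%d\n", hold_queue_for_flush(0, 0, false));
	return 0;
}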
@@ -103,6 +124,8 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
103 struct bio *bio); 124 struct bio *bio);
104int attempt_back_merge(struct request_queue *q, struct request *rq); 125int attempt_back_merge(struct request_queue *q, struct request *rq);
105int attempt_front_merge(struct request_queue *q, struct request *rq); 126int attempt_front_merge(struct request_queue *q, struct request *rq);
127int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
128 struct request *next);
106void blk_recalc_rq_segments(struct request *rq); 129void blk_recalc_rq_segments(struct request *rq);
107void blk_rq_set_mixed_merge(struct request *rq); 130void blk_rq_set_mixed_merge(struct request *rq);
108 131
@@ -132,14 +155,6 @@ static inline int queue_congestion_off_threshold(struct request_queue *q)
132 return q->nr_congestion_off; 155 return q->nr_congestion_off;
133} 156}
134 157
135#if defined(CONFIG_BLK_DEV_INTEGRITY)
136
137#define rq_for_each_integrity_segment(bvl, _rq, _iter) \
138 __rq_for_each_bio(_iter.bio, _rq) \
139 bip_for_each_vec(bvl, _iter.bio->bi_integrity, _iter.i)
140
141#endif /* BLK_DEV_INTEGRITY */
142
143static inline int blk_cpu_to_group(int cpu) 158static inline int blk_cpu_to_group(int cpu)
144{ 159{
145 int group = NR_CPUS; 160 int group = NR_CPUS;
diff --git a/block/bsg.c b/block/bsg.c
index 0c00870553a3..0c8b64a16484 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -20,7 +20,6 @@
20#include <linux/uio.h> 20#include <linux/uio.h>
21#include <linux/idr.h> 21#include <linux/idr.h>
22#include <linux/bsg.h> 22#include <linux/bsg.h>
23#include <linux/smp_lock.h>
24#include <linux/slab.h> 23#include <linux/slab.h>
25 24
26#include <scsi/scsi.h> 25#include <scsi/scsi.h>
@@ -251,6 +250,14 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm,
251 int ret, rw; 250 int ret, rw;
252 unsigned int dxfer_len; 251 unsigned int dxfer_len;
253 void *dxferp = NULL; 252 void *dxferp = NULL;
253 struct bsg_class_device *bcd = &q->bsg_dev;
254
255 /* if the LLD has been removed then the bsg_unregister_queue will
256 * eventually be called and the class_dev was freed, so we can no
257 * longer use this request_queue. Return no such address.
258 */
259 if (!bcd->class_dev)
260 return ERR_PTR(-ENXIO);
254 261
255 dprintk("map hdr %llx/%u %llx/%u\n", (unsigned long long) hdr->dout_xferp, 262 dprintk("map hdr %llx/%u %llx/%u\n", (unsigned long long) hdr->dout_xferp,
256 hdr->dout_xfer_len, (unsigned long long) hdr->din_xferp, 263 hdr->dout_xfer_len, (unsigned long long) hdr->din_xferp,
@@ -843,9 +850,7 @@ static int bsg_open(struct inode *inode, struct file *file)
843{ 850{
844 struct bsg_device *bd; 851 struct bsg_device *bd;
845 852
846 lock_kernel();
847 bd = bsg_get_device(inode, file); 853 bd = bsg_get_device(inode, file);
848 unlock_kernel();
849 854
850 if (IS_ERR(bd)) 855 if (IS_ERR(bd))
851 return PTR_ERR(bd); 856 return PTR_ERR(bd);
@@ -968,6 +973,7 @@ static const struct file_operations bsg_fops = {
968 .release = bsg_release, 973 .release = bsg_release,
969 .unlocked_ioctl = bsg_ioctl, 974 .unlocked_ioctl = bsg_ioctl,
970 .owner = THIS_MODULE, 975 .owner = THIS_MODULE,
976 .llseek = default_llseek,
971}; 977};
972 978
973void bsg_unregister_queue(struct request_queue *q) 979void bsg_unregister_queue(struct request_queue *q)
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 9eba291eb6fd..ae21919f15e1 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -54,9 +54,9 @@ static const int cfq_hist_divisor = 4;
54#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) 54#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8)
55 55
56#define RQ_CIC(rq) \ 56#define RQ_CIC(rq) \
57 ((struct cfq_io_context *) (rq)->elevator_private) 57 ((struct cfq_io_context *) (rq)->elevator_private[0])
58#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) 58#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private[1])
59#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private3) 59#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private[2])
60 60
61static struct kmem_cache *cfq_pool; 61static struct kmem_cache *cfq_pool;
62static struct kmem_cache *cfq_ioc_pool; 62static struct kmem_cache *cfq_ioc_pool;
@@ -87,7 +87,6 @@ struct cfq_rb_root {
87 unsigned count; 87 unsigned count;
88 unsigned total_weight; 88 unsigned total_weight;
89 u64 min_vdisktime; 89 u64 min_vdisktime;
90 struct rb_node *active;
91}; 90};
92#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \ 91#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \
93 .count = 0, .min_vdisktime = 0, } 92 .count = 0, .min_vdisktime = 0, }
@@ -97,7 +96,7 @@ struct cfq_rb_root {
97 */ 96 */
98struct cfq_queue { 97struct cfq_queue {
99 /* reference count */ 98 /* reference count */
100 atomic_t ref; 99 int ref;
101 /* various state flags, see below */ 100 /* various state flags, see below */
102 unsigned int flags; 101 unsigned int flags;
103 /* parent cfq_data */ 102 /* parent cfq_data */
@@ -147,7 +146,6 @@ struct cfq_queue {
147 struct cfq_rb_root *service_tree; 146 struct cfq_rb_root *service_tree;
148 struct cfq_queue *new_cfqq; 147 struct cfq_queue *new_cfqq;
149 struct cfq_group *cfqg; 148 struct cfq_group *cfqg;
150 struct cfq_group *orig_cfqg;
151 /* Number of sectors dispatched from queue in single dispatch round */ 149 /* Number of sectors dispatched from queue in single dispatch round */
152 unsigned long nr_sectors; 150 unsigned long nr_sectors;
153}; 151};
@@ -160,6 +158,7 @@ enum wl_prio_t {
160 BE_WORKLOAD = 0, 158 BE_WORKLOAD = 0,
161 RT_WORKLOAD = 1, 159 RT_WORKLOAD = 1,
162 IDLE_WORKLOAD = 2, 160 IDLE_WORKLOAD = 2,
161 CFQ_PRIO_NR,
163}; 162};
164 163
165/* 164/*
@@ -179,15 +178,25 @@ struct cfq_group {
179 /* group service_tree key */ 178 /* group service_tree key */
180 u64 vdisktime; 179 u64 vdisktime;
181 unsigned int weight; 180 unsigned int weight;
182 bool on_st; 181 unsigned int new_weight;
182 bool needs_update;
183 183
184 /* number of cfqq currently on this group */ 184 /* number of cfqq currently on this group */
185 int nr_cfqq; 185 int nr_cfqq;
186 186
187 /* Per group busy queus average. Useful for workload slice calc. */
188 unsigned int busy_queues_avg[2];
189 /* 187 /*
190 * rr lists of queues with requests, onle rr for each priority class. 188 * Per group busy queues average. Useful for workload slice calc. We
189 * create the array for each prio class but at run time it is used
190 * only for RT and BE class and slot for IDLE class remains unused.
191 * This is primarily done to avoid confusion and a gcc warning.
192 */
193 unsigned int busy_queues_avg[CFQ_PRIO_NR];
194 /*
195 * rr lists of queues with requests. We maintain service trees for
196 * RT and BE classes. These trees are subdivided in subclasses
197 * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE
198 * class there is no subclassification and all the cfq queues go on
199 * a single tree service_tree_idle.
191 * Counts are embedded in the cfq_rb_root 200 * Counts are embedded in the cfq_rb_root
192 */ 201 */
193 struct cfq_rb_root service_trees[2][3]; 202 struct cfq_rb_root service_trees[2][3];
@@ -199,7 +208,7 @@ struct cfq_group {
199 struct blkio_group blkg; 208 struct blkio_group blkg;
200#ifdef CONFIG_CFQ_GROUP_IOSCHED 209#ifdef CONFIG_CFQ_GROUP_IOSCHED
201 struct hlist_node cfqd_node; 210 struct hlist_node cfqd_node;
202 atomic_t ref; 211 int ref;
203#endif 212#endif
204 /* number of requests that are on the dispatch list or inside driver */ 213 /* number of requests that are on the dispatch list or inside driver */
205 int dispatched; 214 int dispatched;
@@ -221,7 +230,6 @@ struct cfq_data {
221 enum wl_type_t serving_type; 230 enum wl_type_t serving_type;
222 unsigned long workload_expires; 231 unsigned long workload_expires;
223 struct cfq_group *serving_group; 232 struct cfq_group *serving_group;
224 bool noidle_tree_requires_idle;
225 233
226 /* 234 /*
227 * Each priority tree is sorted by next_request position. These 235 * Each priority tree is sorted by next_request position. These
@@ -231,6 +239,7 @@ struct cfq_data {
231 struct rb_root prio_trees[CFQ_PRIO_LISTS]; 239 struct rb_root prio_trees[CFQ_PRIO_LISTS];
232 240
233 unsigned int busy_queues; 241 unsigned int busy_queues;
242 unsigned int busy_sync_queues;
234 243
235 int rq_in_driver; 244 int rq_in_driver;
236 int rq_in_flight[2]; 245 int rq_in_flight[2];
@@ -278,7 +287,6 @@ struct cfq_data {
278 unsigned int cfq_slice_idle; 287 unsigned int cfq_slice_idle;
279 unsigned int cfq_group_idle; 288 unsigned int cfq_group_idle;
280 unsigned int cfq_latency; 289 unsigned int cfq_latency;
281 unsigned int cfq_group_isolation;
282 290
283 unsigned int cic_index; 291 unsigned int cic_index;
284 struct list_head cic_list; 292 struct list_head cic_list;
@@ -292,7 +300,9 @@ struct cfq_data {
292 300
293 /* List of cfq groups being managed on this device*/ 301 /* List of cfq groups being managed on this device*/
294 struct hlist_head cfqg_list; 302 struct hlist_head cfqg_list;
295 struct rcu_head rcu; 303
304 /* Number of groups which are on blkcg->blkg_list */
305 unsigned int nr_blkcg_linked_grps;
296}; 306};
297 307
298static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); 308static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
@@ -359,16 +369,16 @@ CFQ_CFQQ_FNS(wait_busy);
359#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 369#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
360 blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ 370 blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
361 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ 371 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
362 blkg_path(&(cfqq)->cfqg->blkg), ##args); 372 blkg_path(&(cfqq)->cfqg->blkg), ##args)
363 373
364#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \ 374#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \
365 blk_add_trace_msg((cfqd)->queue, "%s " fmt, \ 375 blk_add_trace_msg((cfqd)->queue, "%s " fmt, \
366 blkg_path(&(cfqg)->blkg), ##args); \ 376 blkg_path(&(cfqg)->blkg), ##args) \
367 377
368#else 378#else
369#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 379#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
370 blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) 380 blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
371#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0); 381#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0)
372#endif 382#endif
373#define cfq_log(cfqd, fmt, args...) \ 383#define cfq_log(cfqd, fmt, args...) \
374 blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) 384 blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
@@ -494,13 +504,6 @@ static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
494 } 504 }
495} 505}
496 506
497static int cfq_queue_empty(struct request_queue *q)
498{
499 struct cfq_data *cfqd = q->elevator->elevator_data;
500
501 return !cfqd->rq_queued;
502}
503
504/* 507/*
505 * Scale schedule slice based on io priority. Use the sync time slice only 508 * Scale schedule slice based on io priority. Use the sync time slice only
506 * if a queue is marked sync and has sync io queued. A sync queue with async 509 * if a queue is marked sync and has sync io queued. A sync queue with async
@@ -551,20 +554,13 @@ static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
551 554
552static void update_min_vdisktime(struct cfq_rb_root *st) 555static void update_min_vdisktime(struct cfq_rb_root *st)
553{ 556{
554 u64 vdisktime = st->min_vdisktime;
555 struct cfq_group *cfqg; 557 struct cfq_group *cfqg;
556 558
557 if (st->active) {
558 cfqg = rb_entry_cfqg(st->active);
559 vdisktime = cfqg->vdisktime;
560 }
561
562 if (st->left) { 559 if (st->left) {
563 cfqg = rb_entry_cfqg(st->left); 560 cfqg = rb_entry_cfqg(st->left);
564 vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime); 561 st->min_vdisktime = max_vdisktime(st->min_vdisktime,
562 cfqg->vdisktime);
565 } 563 }
566
567 st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
568} 564}
569 565
570/* 566/*
@@ -596,8 +592,8 @@ cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
596 return cfq_target_latency * cfqg->weight / st->total_weight; 592 return cfq_target_latency * cfqg->weight / st->total_weight;
597} 593}
598 594
599static inline void 595static inline unsigned
600cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) 596cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
601{ 597{
602 unsigned slice = cfq_prio_to_slice(cfqd, cfqq); 598 unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
603 if (cfqd->cfq_latency) { 599 if (cfqd->cfq_latency) {
@@ -623,6 +619,14 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
623 low_slice); 619 low_slice);
624 } 620 }
625 } 621 }
622 return slice;
623}
624
625static inline void
626cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
627{
628 unsigned slice = cfq_scaled_cfqq_slice(cfqd, cfqq);
629
626 cfqq->slice_start = jiffies; 630 cfqq->slice_start = jiffies;
627 cfqq->slice_end = jiffies + slice; 631 cfqq->slice_end = jiffies + slice;
628 cfqq->allocated_slice = slice; 632 cfqq->allocated_slice = slice;
@@ -637,11 +641,11 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
637static inline bool cfq_slice_used(struct cfq_queue *cfqq) 641static inline bool cfq_slice_used(struct cfq_queue *cfqq)
638{ 642{
639 if (cfq_cfqq_slice_new(cfqq)) 643 if (cfq_cfqq_slice_new(cfqq))
640 return 0; 644 return false;
641 if (time_before(jiffies, cfqq->slice_end)) 645 if (time_before(jiffies, cfqq->slice_end))
642 return 0; 646 return false;
643 647
644 return 1; 648 return true;
645} 649}
646 650
647/* 651/*
@@ -663,15 +667,11 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2,
663 if (rq2 == NULL) 667 if (rq2 == NULL)
664 return rq1; 668 return rq1;
665 669
666 if (rq_is_sync(rq1) && !rq_is_sync(rq2)) 670 if (rq_is_sync(rq1) != rq_is_sync(rq2))
667 return rq1; 671 return rq_is_sync(rq1) ? rq1 : rq2;
668 else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) 672
669 return rq2; 673 if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_META)
670 if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) 674 return rq1->cmd_flags & REQ_META ? rq1 : rq2;
671 return rq1;
672 else if ((rq2->cmd_flags & REQ_META) &&
673 !(rq1->cmd_flags & REQ_META))
674 return rq2;
675 675
676 s1 = blk_rq_pos(rq1); 676 s1 = blk_rq_pos(rq1);
677 s2 = blk_rq_pos(rq2); 677 s2 = blk_rq_pos(rq2);
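The rewritten tie-breaking above uses XOR to ask "does exactly one of the two requests carry this flag?" instead of two symmetric if/else branches. A small standalone demo of the trick, with invented bit values for the flags:

#include <stdio.h>

#define REQ_SYNC (1u << 0)	/* made-up bit values, just for the demo */
#define REQ_META (1u << 1)

int main(void)
{
	unsigned int f1 = REQ_SYNC | REQ_META;	/* rq1: sync metadata read */
	unsigned int f2 = REQ_SYNC;		/* rq2: sync data read */

	/* true only when exactly one of the two requests has REQ_META set */
	if ((f1 ^ f2) & REQ_META)
		printf("prefer the META request: rq%d\n",
		       (f1 & REQ_META) ? 1 : 2);
	else
		printf("META does not break the tie\n");
	return 0;
}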
@@ -853,20 +853,40 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
853} 853}
854 854
855static void 855static void
856cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) 856cfq_update_group_weight(struct cfq_group *cfqg)
857{
858 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
859 if (cfqg->needs_update) {
860 cfqg->weight = cfqg->new_weight;
861 cfqg->needs_update = false;
862 }
863}
864
865static void
866cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
867{
868 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
869
870 cfq_update_group_weight(cfqg);
871 __cfq_group_service_tree_add(st, cfqg);
872 st->total_weight += cfqg->weight;
873}
874
875static void
876cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
857{ 877{
858 struct cfq_rb_root *st = &cfqd->grp_service_tree; 878 struct cfq_rb_root *st = &cfqd->grp_service_tree;
859 struct cfq_group *__cfqg; 879 struct cfq_group *__cfqg;
860 struct rb_node *n; 880 struct rb_node *n;
861 881
862 cfqg->nr_cfqq++; 882 cfqg->nr_cfqq++;
863 if (cfqg->on_st) 883 if (!RB_EMPTY_NODE(&cfqg->rb_node))
864 return; 884 return;
865 885
866 /* 886 /*
867 * Currently put the group at the end. Later implement something 887 * Currently put the group at the end. Later implement something
868 * so that groups get lesser vtime based on their weights, so that 888 * so that groups get lesser vtime based on their weights, so that
869 * if group does not loose all if it was not continously backlogged. 889 * a group does not lose everything if it was not continuously backlogged.
870 */ 890 */
871 n = rb_last(&st->rb); 891 n = rb_last(&st->rb);
872 if (n) { 892 if (n) {
@@ -874,20 +894,22 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
874 cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY; 894 cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
875 } else 895 } else
876 cfqg->vdisktime = st->min_vdisktime; 896 cfqg->vdisktime = st->min_vdisktime;
897 cfq_group_service_tree_add(st, cfqg);
898}
877 899
878 __cfq_group_service_tree_add(st, cfqg); 900static void
879 cfqg->on_st = true; 901cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
880 st->total_weight += cfqg->weight; 902{
903 st->total_weight -= cfqg->weight;
904 if (!RB_EMPTY_NODE(&cfqg->rb_node))
905 cfq_rb_erase(&cfqg->rb_node, st);
881} 906}
882 907
883static void 908static void
884cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) 909cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
885{ 910{
886 struct cfq_rb_root *st = &cfqd->grp_service_tree; 911 struct cfq_rb_root *st = &cfqd->grp_service_tree;
887 912
888 if (st->active == &cfqg->rb_node)
889 st->active = NULL;
890
891 BUG_ON(cfqg->nr_cfqq < 1); 913 BUG_ON(cfqg->nr_cfqq < 1);
892 cfqg->nr_cfqq--; 914 cfqg->nr_cfqq--;
893 915
@@ -896,15 +918,13 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
896 return; 918 return;
897 919
898 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); 920 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
899 cfqg->on_st = false; 921 cfq_group_service_tree_del(st, cfqg);
900 st->total_weight -= cfqg->weight;
901 if (!RB_EMPTY_NODE(&cfqg->rb_node))
902 cfq_rb_erase(&cfqg->rb_node, st);
903 cfqg->saved_workload_slice = 0; 922 cfqg->saved_workload_slice = 0;
904 cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1); 923 cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
905} 924}
906 925
907static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) 926static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
927 unsigned int *unaccounted_time)
908{ 928{
909 unsigned int slice_used; 929 unsigned int slice_used;
910 930
@@ -923,8 +943,13 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
923 1); 943 1);
924 } else { 944 } else {
925 slice_used = jiffies - cfqq->slice_start; 945 slice_used = jiffies - cfqq->slice_start;
926 if (slice_used > cfqq->allocated_slice) 946 if (slice_used > cfqq->allocated_slice) {
947 *unaccounted_time = slice_used - cfqq->allocated_slice;
927 slice_used = cfqq->allocated_slice; 948 slice_used = cfqq->allocated_slice;
949 }
950 if (time_after(cfqq->slice_start, cfqq->dispatch_start))
951 *unaccounted_time += cfqq->slice_start -
952 cfqq->dispatch_start;
928 } 953 }
929 954
930 return slice_used; 955 return slice_used;
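The reworked cfq_cfqq_slice_usage() now reports, alongside the time it charges to the group, the time it deliberately does not charge: the part of the slice that ran past the allocation, plus any delay between dispatch and the slice actually starting. A hedged sketch of that bookkeeping with plain integers (jiffies and the cfqq fields replaced by function parameters):

    #include <stdio.h>

    /* Returns the time to charge; *unaccounted gets the overrun plus the
     * dispatch-to-slice-start delay, mirroring the new cfq_cfqq_slice_usage(). */
    static unsigned long slice_usage(unsigned long now,
                                     unsigned long dispatch_start,
                                     unsigned long slice_start,
                                     unsigned long allocated_slice,
                                     unsigned long *unaccounted)
    {
        unsigned long used = now - slice_start;

        *unaccounted = 0;
        if (used > allocated_slice) {
            *unaccounted = used - allocated_slice;
            used = allocated_slice;
        }
        if (slice_start > dispatch_start)
            *unaccounted += slice_start - dispatch_start;
        return used;
    }

    int main(void)
    {
        unsigned long unacc;
        unsigned long used = slice_usage(1300, 1000, 1050, 180, &unacc);
        printf("charged=%lu unaccounted=%lu\n", used, unacc); /* 180 and 70+50=120 */
        return 0;
    }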
@@ -934,12 +959,12 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
934 struct cfq_queue *cfqq) 959 struct cfq_queue *cfqq)
935{ 960{
936 struct cfq_rb_root *st = &cfqd->grp_service_tree; 961 struct cfq_rb_root *st = &cfqd->grp_service_tree;
937 unsigned int used_sl, charge; 962 unsigned int used_sl, charge, unaccounted_sl = 0;
938 int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) 963 int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
939 - cfqg->service_tree_idle.count; 964 - cfqg->service_tree_idle.count;
940 965
941 BUG_ON(nr_sync < 0); 966 BUG_ON(nr_sync < 0);
942 used_sl = charge = cfq_cfqq_slice_usage(cfqq); 967 used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);
943 968
944 if (iops_mode(cfqd)) 969 if (iops_mode(cfqd))
945 charge = cfqq->slice_dispatch; 970 charge = cfqq->slice_dispatch;
@@ -947,9 +972,10 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
947 charge = cfqq->allocated_slice; 972 charge = cfqq->allocated_slice;
948 973
949 /* Can't update vdisktime while group is on service tree */ 974 /* Can't update vdisktime while group is on service tree */
950 cfq_rb_erase(&cfqg->rb_node, st); 975 cfq_group_service_tree_del(st, cfqg);
951 cfqg->vdisktime += cfq_scale_slice(charge, cfqg); 976 cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
952 __cfq_group_service_tree_add(st, cfqg); 977 /* If a new weight was requested, update now, off tree */
978 cfq_group_service_tree_add(st, cfqg);
953 979
954 /* This group is being expired. Save the context */ 980 /* This group is being expired. Save the context */
955 if (time_after(cfqd->workload_expires, jiffies)) { 981 if (time_after(cfqd->workload_expires, jiffies)) {
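Charging the group now goes through the service-tree del/add helpers instead of raw rb-tree calls, so any pending weight update is applied while the group is unlinked and the vdisktime bump happens off the tree. The sketch below shows the generic fair-queuing idea behind that sequence: remove the entity, advance its virtual time by the service received scaled inversely to its weight, then reinsert. The scaling formula is the textbook one and is an assumption here, not a copy of cfq_scale_slice():

    #include <stdio.h>

    #define DEFAULT_WEIGHT 500   /* illustrative baseline, not a kernel constant */

    struct entity { unsigned long vtime; unsigned int weight; int on_tree; };

    static unsigned long scale_charge(unsigned long charge, unsigned int weight)
    {
        /* Heavier entities accumulate virtual time more slowly. */
        return charge * DEFAULT_WEIGHT / weight;
    }

    static void charge_service(struct entity *e, unsigned long charge)
    {
        e->on_tree = 0;                          /* cfq_group_service_tree_del() */
        e->vtime += scale_charge(charge, e->weight);
        e->on_tree = 1;                          /* cfq_group_service_tree_add() */
    }

    int main(void)
    {
        struct entity light = { 0, 250, 1 }, heavy = { 0, 1000, 1 };
        charge_service(&light, 100);
        charge_service(&heavy, 100);
        printf("light=%lu heavy=%lu\n", light.vtime, heavy.vtime); /* 200 vs 50 */
        return 0;
    }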
@@ -962,10 +988,12 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
962 988
963 cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, 989 cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
964 st->min_vdisktime); 990 st->min_vdisktime);
965 cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u" 991 cfq_log_cfqq(cfqq->cfqd, cfqq,
966 " sect=%u", used_sl, cfqq->slice_dispatch, charge, 992 "sl_used=%u disp=%u charge=%u iops=%u sect=%lu",
967 iops_mode(cfqd), cfqq->nr_sectors); 993 used_sl, cfqq->slice_dispatch, charge,
968 cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl); 994 iops_mode(cfqd), cfqq->nr_sectors);
995 cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl,
996 unaccounted_sl);
969 cfq_blkiocg_set_start_empty_time(&cfqg->blkg); 997 cfq_blkiocg_set_start_empty_time(&cfqg->blkg);
970} 998}
971 999
@@ -977,35 +1005,55 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
977 return NULL; 1005 return NULL;
978} 1006}
979 1007
980void 1008void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
981cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight) 1009 unsigned int weight)
982{ 1010{
983 cfqg_of_blkg(blkg)->weight = weight; 1011 struct cfq_group *cfqg = cfqg_of_blkg(blkg);
1012 cfqg->new_weight = weight;
1013 cfqg->needs_update = true;
984} 1014}
985 1015
986static struct cfq_group * 1016static void cfq_init_add_cfqg_lists(struct cfq_data *cfqd,
987cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) 1017 struct cfq_group *cfqg, struct blkio_cgroup *blkcg)
988{ 1018{
989 struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
990 struct cfq_group *cfqg = NULL;
991 void *key = cfqd;
992 int i, j;
993 struct cfq_rb_root *st;
994 struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; 1019 struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
995 unsigned int major, minor; 1020 unsigned int major, minor;
996 1021
997 cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); 1022 /*
998 if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { 1023 * Add group onto cgroup list. It might happen that bdi->dev is
1024 * not initialized yet. Initialize this new group without major
1025 * and minor info and this info will be filled in once a new thread
1026 * comes for IO.
1027 */
1028 if (bdi->dev) {
999 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); 1029 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
1000 cfqg->blkg.dev = MKDEV(major, minor); 1030 cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
1001 goto done; 1031 (void *)cfqd, MKDEV(major, minor));
1002 } 1032 } else
1003 if (cfqg || !create) 1033 cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
1004 goto done; 1034 (void *)cfqd, 0);
1035
1036 cfqd->nr_blkcg_linked_grps++;
1037 cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
1038
1039 /* Add group on cfqd list */
1040 hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
1041}
1042
1043/*
1044 * Should be called from sleepable context. No request queue lock as per
1045 * cpu stats are allocated dynamically and alloc_percpu needs to be called
1046 * from sleepable context.
1047 */
1048static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd)
1049{
1050 struct cfq_group *cfqg = NULL;
1051 int i, j, ret;
1052 struct cfq_rb_root *st;
1005 1053
1006 cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); 1054 cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
1007 if (!cfqg) 1055 if (!cfqg)
1008 goto done; 1056 return NULL;
1009 1057
1010 for_each_cfqg_st(cfqg, i, j, st) 1058 for_each_cfqg_st(cfqg, i, j, st)
1011 *st = CFQ_RB_ROOT; 1059 *st = CFQ_RB_ROOT;
@@ -1017,52 +1065,103 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
1017 * elevator which will be dropped by either elevator exit 1065 * elevator which will be dropped by either elevator exit
1018 * or cgroup deletion path depending on who is exiting first. 1066 * or cgroup deletion path depending on who is exiting first.
1019 */ 1067 */
1020 atomic_set(&cfqg->ref, 1); 1068 cfqg->ref = 1;
1069
1070 ret = blkio_alloc_blkg_stats(&cfqg->blkg);
1071 if (ret) {
1072 kfree(cfqg);
1073 return NULL;
1074 }
1075
1076 return cfqg;
1077}
1078
1079static struct cfq_group *
1080cfq_find_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg)
1081{
1082 struct cfq_group *cfqg = NULL;
1083 void *key = cfqd;
1084 struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
1085 unsigned int major, minor;
1021 1086
1022 /* 1087 /*
1023 * Add group onto cgroup list. It might happen that bdi->dev is 1088 * This is the common case when there are no blkio cgroups.
1024 * not initiliazed yet. Initialize this new group without major 1089 * Avoid lookup in this case
1025 * and minor info and this info will be filled in once a new thread
1026 * comes for IO. See code above.
1027 */ 1090 */
1028 if (bdi->dev) { 1091 if (blkcg == &blkio_root_cgroup)
1029 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); 1092 cfqg = &cfqd->root_group;
1030 cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, 1093 else
1031 MKDEV(major, minor)); 1094 cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
1032 } else
1033 cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
1034 0);
1035
1036 cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
1037 1095
1038 /* Add group on cfqd list */ 1096 if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
1039 hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); 1097 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
1098 cfqg->blkg.dev = MKDEV(major, minor);
1099 }
1040 1100
1041done:
1042 return cfqg; 1101 return cfqg;
1043} 1102}
1044 1103
1045/* 1104/*
1046 * Search for the cfq group current task belongs to. If create = 1, then also 1105 * Search for the cfq group current task belongs to. request_queue lock must
1047 * create the cfq group if it does not exist. request_queue lock must be held. 1106 * be held.
1048 */ 1107 */
1049static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) 1108static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd)
1050{ 1109{
1051 struct cgroup *cgroup; 1110 struct blkio_cgroup *blkcg;
1052 struct cfq_group *cfqg = NULL; 1111 struct cfq_group *cfqg = NULL, *__cfqg = NULL;
1112 struct request_queue *q = cfqd->queue;
1113
1114 rcu_read_lock();
1115 blkcg = task_blkio_cgroup(current);
1116 cfqg = cfq_find_cfqg(cfqd, blkcg);
1117 if (cfqg) {
1118 rcu_read_unlock();
1119 return cfqg;
1120 }
1121
1122 /*
1123 * Need to allocate a group. Allocation of group also needs allocation
1124 * of per cpu stats which in-turn takes a mutex() and can block. Hence
1125 * we need to drop rcu lock and queue_lock before we call alloc.
1126 *
1127 * Not taking any queue reference here and assuming that queue is
1128 * around by the time we return. CFQ queue allocation code does
1129 * the same. It might be racy though.
1130 */
1131
1132 rcu_read_unlock();
1133 spin_unlock_irq(q->queue_lock);
1134
1135 cfqg = cfq_alloc_cfqg(cfqd);
1136
1137 spin_lock_irq(q->queue_lock);
1053 1138
1054 rcu_read_lock(); 1139 rcu_read_lock();
1055 cgroup = task_cgroup(current, blkio_subsys_id); 1140 blkcg = task_blkio_cgroup(current);
1056 cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create); 1141
1057 if (!cfqg && create) 1142 /*
1143 * If some other thread already allocated the group while we were
1144 * not holding queue lock, free up the group
1145 */
1146 __cfqg = cfq_find_cfqg(cfqd, blkcg);
1147
1148 if (__cfqg) {
1149 kfree(cfqg);
1150 rcu_read_unlock();
1151 return __cfqg;
1152 }
1153
1154 if (!cfqg)
1058 cfqg = &cfqd->root_group; 1155 cfqg = &cfqd->root_group;
1156
1157 cfq_init_add_cfqg_lists(cfqd, cfqg, blkcg);
1059 rcu_read_unlock(); 1158 rcu_read_unlock();
1060 return cfqg; 1159 return cfqg;
1061} 1160}
1062 1161
1063static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) 1162static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
1064{ 1163{
1065 atomic_inc(&cfqg->ref); 1164 cfqg->ref++;
1066 return cfqg; 1165 return cfqg;
1067} 1166}
1068 1167
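cfq_get_cfqg() now drops both the RCU read lock and the queue lock before calling the sleeping allocator, then re-takes the lock and looks the group up again; if another thread raced in and created the group first, the freshly allocated copy is thrown away. A compact pthread sketch of that lock-drop/re-check pattern (a single global slot stands in for the blkcg lookup; all names are invented for the example):

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct grp { int id; };

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static struct grp *the_grp;              /* stand-in for the per-cgroup lookup */

    static struct grp *get_grp(void)
    {
        struct grp *g, *other;

        pthread_mutex_lock(&lock);
        if (the_grp) {                       /* fast path: already there */
            g = the_grp;
            pthread_mutex_unlock(&lock);
            return g;
        }
        pthread_mutex_unlock(&lock);         /* cannot block in the allocator with the lock held */

        g = malloc(sizeof(*g));              /* may sleep/block, done unlocked */
        if (g)
            g->id = 1;

        pthread_mutex_lock(&lock);
        other = the_grp;                     /* somebody may have beaten us to it */
        if (other) {
            free(g);                         /* lose the race gracefully */
            pthread_mutex_unlock(&lock);
            return other;
        }
        if (g)
            the_grp = g;                     /* publish our allocation */
        pthread_mutex_unlock(&lock);
        return g;
    }

    int main(void)
    {
        printf("grp %p == %p\n", (void *)get_grp(), (void *)get_grp());
        return 0;
    }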
@@ -1074,7 +1173,7 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
1074 1173
1075 cfqq->cfqg = cfqg; 1174 cfqq->cfqg = cfqg;
1076 /* cfqq reference on cfqg */ 1175 /* cfqq reference on cfqg */
1077 atomic_inc(&cfqq->cfqg->ref); 1176 cfqq->cfqg->ref++;
1078} 1177}
1079 1178
1080static void cfq_put_cfqg(struct cfq_group *cfqg) 1179static void cfq_put_cfqg(struct cfq_group *cfqg)
@@ -1082,11 +1181,13 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
1082 struct cfq_rb_root *st; 1181 struct cfq_rb_root *st;
1083 int i, j; 1182 int i, j;
1084 1183
1085 BUG_ON(atomic_read(&cfqg->ref) <= 0); 1184 BUG_ON(cfqg->ref <= 0);
1086 if (!atomic_dec_and_test(&cfqg->ref)) 1185 cfqg->ref--;
1186 if (cfqg->ref)
1087 return; 1187 return;
1088 for_each_cfqg_st(cfqg, i, j, st) 1188 for_each_cfqg_st(cfqg, i, j, st)
1089 BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL); 1189 BUG_ON(!RB_EMPTY_ROOT(&st->rb));
1190 free_percpu(cfqg->blkg.stats_cpu);
1090 kfree(cfqg); 1191 kfree(cfqg);
1091} 1192}
1092 1193
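The atomic_t reference counts on cfqg (and on cfqq further down) become plain ints in this patch; that is only sound because every get/put now happens with the queue lock held, so increments and decrements are already serialized. A tiny sketch of that invariant, with a pthread mutex standing in for queue_lock:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct obj { int ref; };                 /* no atomics: protected by the lock */

    static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Callers must hold queue_lock, mirroring the new cfqg->ref rules. */
    static void obj_get(struct obj *o) { o->ref++; }

    static void obj_put(struct obj *o)
    {
        if (--o->ref == 0)
            free(o);
    }

    int main(void)
    {
        struct obj *o = calloc(1, sizeof(*o));
        pthread_mutex_lock(&queue_lock);
        obj_get(o);
        obj_get(o);
        obj_put(o);
        obj_put(o);                          /* last reference, frees here */
        pthread_mutex_unlock(&queue_lock);
        puts("done");
        return 0;
    }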
@@ -1145,7 +1246,7 @@ void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg)
1145} 1246}
1146 1247
1147#else /* GROUP_IOSCHED */ 1248#else /* GROUP_IOSCHED */
1148static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) 1249static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd)
1149{ 1250{
1150 return &cfqd->root_group; 1251 return &cfqd->root_group;
1151} 1252}
@@ -1179,33 +1280,6 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1179 struct cfq_rb_root *service_tree; 1280 struct cfq_rb_root *service_tree;
1180 int left; 1281 int left;
1181 int new_cfqq = 1; 1282 int new_cfqq = 1;
1182 int group_changed = 0;
1183
1184#ifdef CONFIG_CFQ_GROUP_IOSCHED
1185 if (!cfqd->cfq_group_isolation
1186 && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD
1187 && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
1188 /* Move this cfq to root group */
1189 cfq_log_cfqq(cfqd, cfqq, "moving to root group");
1190 if (!RB_EMPTY_NODE(&cfqq->rb_node))
1191 cfq_group_service_tree_del(cfqd, cfqq->cfqg);
1192 cfqq->orig_cfqg = cfqq->cfqg;
1193 cfqq->cfqg = &cfqd->root_group;
1194 atomic_inc(&cfqd->root_group.ref);
1195 group_changed = 1;
1196 } else if (!cfqd->cfq_group_isolation
1197 && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
1198 /* cfqq is sequential now needs to go to its original group */
1199 BUG_ON(cfqq->cfqg != &cfqd->root_group);
1200 if (!RB_EMPTY_NODE(&cfqq->rb_node))
1201 cfq_group_service_tree_del(cfqd, cfqq->cfqg);
1202 cfq_put_cfqg(cfqq->cfqg);
1203 cfqq->cfqg = cfqq->orig_cfqg;
1204 cfqq->orig_cfqg = NULL;
1205 group_changed = 1;
1206 cfq_log_cfqq(cfqd, cfqq, "moved to origin group");
1207 }
1208#endif
1209 1283
1210 service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), 1284 service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
1211 cfqq_type(cfqq)); 1285 cfqq_type(cfqq));
@@ -1276,9 +1350,9 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1276 rb_link_node(&cfqq->rb_node, parent, p); 1350 rb_link_node(&cfqq->rb_node, parent, p);
1277 rb_insert_color(&cfqq->rb_node, &service_tree->rb); 1351 rb_insert_color(&cfqq->rb_node, &service_tree->rb);
1278 service_tree->count++; 1352 service_tree->count++;
1279 if ((add_front || !new_cfqq) && !group_changed) 1353 if (add_front || !new_cfqq)
1280 return; 1354 return;
1281 cfq_group_service_tree_add(cfqd, cfqq->cfqg); 1355 cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
1282} 1356}
1283 1357
1284static struct cfq_queue * 1358static struct cfq_queue *
@@ -1366,6 +1440,8 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1366 BUG_ON(cfq_cfqq_on_rr(cfqq)); 1440 BUG_ON(cfq_cfqq_on_rr(cfqq));
1367 cfq_mark_cfqq_on_rr(cfqq); 1441 cfq_mark_cfqq_on_rr(cfqq);
1368 cfqd->busy_queues++; 1442 cfqd->busy_queues++;
1443 if (cfq_cfqq_sync(cfqq))
1444 cfqd->busy_sync_queues++;
1369 1445
1370 cfq_resort_rr_list(cfqd, cfqq); 1446 cfq_resort_rr_list(cfqd, cfqq);
1371} 1447}
@@ -1389,9 +1465,11 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1389 cfqq->p_root = NULL; 1465 cfqq->p_root = NULL;
1390 } 1466 }
1391 1467
1392 cfq_group_service_tree_del(cfqd, cfqq->cfqg); 1468 cfq_group_notify_queue_del(cfqd, cfqq->cfqg);
1393 BUG_ON(!cfqd->busy_queues); 1469 BUG_ON(!cfqd->busy_queues);
1394 cfqd->busy_queues--; 1470 cfqd->busy_queues--;
1471 if (cfq_cfqq_sync(cfqq))
1472 cfqd->busy_sync_queues--;
1395} 1473}
1396 1474
1397/* 1475/*
@@ -1663,8 +1741,11 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1663 /* 1741 /*
1664 * store what was left of this slice, if the queue idled/timed out 1742 * store what was left of this slice, if the queue idled/timed out
1665 */ 1743 */
1666 if (timed_out && !cfq_cfqq_slice_new(cfqq)) { 1744 if (timed_out) {
1667 cfqq->slice_resid = cfqq->slice_end - jiffies; 1745 if (cfq_cfqq_slice_new(cfqq))
1746 cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq);
1747 else
1748 cfqq->slice_resid = cfqq->slice_end - jiffies;
1668 cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); 1749 cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
1669 } 1750 }
1670 1751
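__cfq_slice_expired() now records a residual even for a queue whose fresh slice never started ticking: instead of slice_end - jiffies (meaningless when slice_new is set), it saves a full scaled slice so the queue is not penalized for being preempted early. A sketch of that branch, with the scaled-slice computation reduced to a parameter:

    #include <stdio.h>

    static long slice_residual(int timed_out, int slice_new,
                               unsigned long now, unsigned long slice_end,
                               unsigned long scaled_full_slice)
    {
        if (!timed_out)
            return 0;
        /* Fresh slice that never ran: credit the whole (scaled) allocation. */
        if (slice_new)
            return (long)scaled_full_slice;
        return (long)(slice_end - now);
    }

    int main(void)
    {
        printf("%ld\n", slice_residual(1, 1, 1000, 1000, 120)); /* 120 */
        printf("%ld\n", slice_residual(1, 0, 1000, 1080, 120)); /* 80  */
        return 0;
    }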
@@ -1678,9 +1759,6 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1678 if (cfqq == cfqd->active_queue) 1759 if (cfqq == cfqd->active_queue)
1679 cfqd->active_queue = NULL; 1760 cfqd->active_queue = NULL;
1680 1761
1681 if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active)
1682 cfqd->grp_service_tree.active = NULL;
1683
1684 if (cfqd->active_cic) { 1762 if (cfqd->active_cic) {
1685 put_io_context(cfqd->active_cic->ioc); 1763 put_io_context(cfqd->active_cic->ioc);
1686 cfqd->active_cic = NULL; 1764 cfqd->active_cic = NULL;
@@ -1892,10 +1970,10 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1892 * in their service tree. 1970 * in their service tree.
1893 */ 1971 */
1894 if (service_tree->count == 1 && cfq_cfqq_sync(cfqq)) 1972 if (service_tree->count == 1 && cfq_cfqq_sync(cfqq))
1895 return 1; 1973 return true;
1896 cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", 1974 cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",
1897 service_tree->count); 1975 service_tree->count);
1898 return 0; 1976 return false;
1899} 1977}
1900 1978
1901static void cfq_arm_slice_timer(struct cfq_data *cfqd) 1979static void cfq_arm_slice_timer(struct cfq_data *cfqd)
@@ -1946,8 +2024,8 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
1946 */ 2024 */
1947 if (sample_valid(cic->ttime_samples) && 2025 if (sample_valid(cic->ttime_samples) &&
1948 (cfqq->slice_end - jiffies < cic->ttime_mean)) { 2026 (cfqq->slice_end - jiffies < cic->ttime_mean)) {
1949 cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%d", 2027 cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%lu",
1950 cic->ttime_mean); 2028 cic->ttime_mean);
1951 return; 2029 return;
1952 } 2030 }
1953 2031
@@ -2020,7 +2098,7 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2020 2098
2021 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); 2099 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
2022 2100
2023 return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio)); 2101 return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio);
2024} 2102}
2025 2103
2026/* 2104/*
@@ -2031,7 +2109,7 @@ static int cfqq_process_refs(struct cfq_queue *cfqq)
2031 int process_refs, io_refs; 2109 int process_refs, io_refs;
2032 2110
2033 io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE]; 2111 io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
2034 process_refs = atomic_read(&cfqq->ref) - io_refs; 2112 process_refs = cfqq->ref - io_refs;
2035 BUG_ON(process_refs < 0); 2113 BUG_ON(process_refs < 0);
2036 return process_refs; 2114 return process_refs;
2037} 2115}
@@ -2071,10 +2149,10 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
2071 */ 2149 */
2072 if (new_process_refs >= process_refs) { 2150 if (new_process_refs >= process_refs) {
2073 cfqq->new_cfqq = new_cfqq; 2151 cfqq->new_cfqq = new_cfqq;
2074 atomic_add(process_refs, &new_cfqq->ref); 2152 new_cfqq->ref += process_refs;
2075 } else { 2153 } else {
2076 new_cfqq->new_cfqq = cfqq; 2154 new_cfqq->new_cfqq = cfqq;
2077 atomic_add(new_process_refs, &cfqq->ref); 2155 cfqq->ref += new_process_refs;
2078 } 2156 }
2079} 2157}
2080 2158
@@ -2107,12 +2185,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
2107 unsigned count; 2185 unsigned count;
2108 struct cfq_rb_root *st; 2186 struct cfq_rb_root *st;
2109 unsigned group_slice; 2187 unsigned group_slice;
2110 2188 enum wl_prio_t original_prio = cfqd->serving_prio;
2111 if (!cfqg) {
2112 cfqd->serving_prio = IDLE_WORKLOAD;
2113 cfqd->workload_expires = jiffies + 1;
2114 return;
2115 }
2116 2189
2117 /* Choose next priority. RT > BE > IDLE */ 2190 /* Choose next priority. RT > BE > IDLE */
2118 if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg)) 2191 if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
@@ -2125,6 +2198,9 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
2125 return; 2198 return;
2126 } 2199 }
2127 2200
2201 if (original_prio != cfqd->serving_prio)
2202 goto new_workload;
2203
2128 /* 2204 /*
2129 * For RT and BE, we have to choose also the type 2205 * For RT and BE, we have to choose also the type
2130 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload 2206 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
@@ -2139,6 +2215,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
2139 if (count && !time_after(jiffies, cfqd->workload_expires)) 2215 if (count && !time_after(jiffies, cfqd->workload_expires))
2140 return; 2216 return;
2141 2217
2218new_workload:
2142 /* otherwise select new workload type */ 2219 /* otherwise select new workload type */
2143 cfqd->serving_type = 2220 cfqd->serving_type =
2144 cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio); 2221 cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
@@ -2180,7 +2257,6 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
2180 slice = max_t(unsigned, slice, CFQ_MIN_TT); 2257 slice = max_t(unsigned, slice, CFQ_MIN_TT);
2181 cfq_log(cfqd, "workload slice:%d", slice); 2258 cfq_log(cfqd, "workload slice:%d", slice);
2182 cfqd->workload_expires = jiffies + slice; 2259 cfqd->workload_expires = jiffies + slice;
2183 cfqd->noidle_tree_requires_idle = false;
2184} 2260}
2185 2261
2186static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) 2262static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
@@ -2191,7 +2267,6 @@ static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
2191 if (RB_EMPTY_ROOT(&st->rb)) 2267 if (RB_EMPTY_ROOT(&st->rb))
2192 return NULL; 2268 return NULL;
2193 cfqg = cfq_rb_first_group(st); 2269 cfqg = cfq_rb_first_group(st);
2194 st->active = &cfqg->rb_node;
2195 update_min_vdisktime(st); 2270 update_min_vdisktime(st);
2196 return cfqg; 2271 return cfqg;
2197} 2272}
@@ -2285,6 +2360,17 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
2285 goto keep_queue; 2360 goto keep_queue;
2286 } 2361 }
2287 2362
2363 /*
2364 * This is a deep seek queue, but the device is much faster than
2365 * the queue can deliver, don't idle
2366 **/
2367 if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) &&
2368 (cfq_cfqq_slice_new(cfqq) ||
2369 (cfqq->slice_end - jiffies > jiffies - cfqq->slice_start))) {
2370 cfq_clear_cfqq_deep(cfqq);
2371 cfq_clear_cfqq_idle_window(cfqq);
2372 }
2373
2288 if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { 2374 if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
2289 cfqq = NULL; 2375 cfqq = NULL;
2290 goto keep_queue; 2376 goto keep_queue;
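The new check in cfq_select_queue() stops idling on a "deep" seeky queue when the device is clearly keeping up: if the slice is new, or more of it remains than has already elapsed, the backlog is draining quickly and idling would only waste bandwidth, so the deep and idle_window hints are cleared. A boolean sketch of that half-slice test (jiffies replaced by a plain counter):

    #include <stdbool.h>
    #include <stdio.h>

    /* True when less than half of the running slice has been consumed,
     * i.e. the device is serving the queue faster than it refills. */
    static bool device_keeping_up(bool slice_new, unsigned long now,
                                  unsigned long slice_start, unsigned long slice_end)
    {
        return slice_new || (slice_end - now > now - slice_start);
    }

    int main(void)
    {
        printf("%d\n", device_keeping_up(false, 1030, 1000, 1100)); /* 70 > 30 -> 1 */
        printf("%d\n", device_keeping_up(false, 1080, 1000, 1100)); /* 20 > 80 -> 0 */
        return 0;
    }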
@@ -2359,12 +2445,12 @@ static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
2359{ 2445{
2360 /* the queue hasn't finished any request, can't estimate */ 2446 /* the queue hasn't finished any request, can't estimate */
2361 if (cfq_cfqq_slice_new(cfqq)) 2447 if (cfq_cfqq_slice_new(cfqq))
2362 return 1; 2448 return true;
2363 if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched, 2449 if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched,
2364 cfqq->slice_end)) 2450 cfqq->slice_end))
2365 return 1; 2451 return true;
2366 2452
2367 return 0; 2453 return false;
2368} 2454}
2369 2455
2370static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2456static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
@@ -2391,6 +2477,7 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2391 * Does this cfqq already have too much IO in flight? 2477 * Does this cfqq already have too much IO in flight?
2392 */ 2478 */
2393 if (cfqq->dispatched >= max_dispatch) { 2479 if (cfqq->dispatched >= max_dispatch) {
2480 bool promote_sync = false;
2394 /* 2481 /*
2395 * idle queue must always only have a single IO in flight 2482 * idle queue must always only have a single IO in flight
2396 */ 2483 */
@@ -2398,15 +2485,26 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2398 return false; 2485 return false;
2399 2486
2400 /* 2487 /*
2488 * If there is only one sync queue
2489 * we can ignore async queue here and give the sync
2490 * queue no dispatch limit. The reason is a sync queue can
2491 * preempt async queue, limiting the sync queue doesn't make
2492 * sense. This is useful for aiostress test.
2493 */
2494 if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1)
2495 promote_sync = true;
2496
2497 /*
2401 * We have other queues, don't allow more IO from this one 2498 * We have other queues, don't allow more IO from this one
2402 */ 2499 */
2403 if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq)) 2500 if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) &&
2501 !promote_sync)
2404 return false; 2502 return false;
2405 2503
2406 /* 2504 /*
2407 * Sole queue user, no limit 2505 * Sole queue user, no limit
2408 */ 2506 */
2409 if (cfqd->busy_queues == 1) 2507 if (cfqd->busy_queues == 1 || promote_sync)
2410 max_dispatch = -1; 2508 max_dispatch = -1;
2411 else 2509 else
2412 /* 2510 /*
@@ -2528,18 +2626,18 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
2528static void cfq_put_queue(struct cfq_queue *cfqq) 2626static void cfq_put_queue(struct cfq_queue *cfqq)
2529{ 2627{
2530 struct cfq_data *cfqd = cfqq->cfqd; 2628 struct cfq_data *cfqd = cfqq->cfqd;
2531 struct cfq_group *cfqg, *orig_cfqg; 2629 struct cfq_group *cfqg;
2532 2630
2533 BUG_ON(atomic_read(&cfqq->ref) <= 0); 2631 BUG_ON(cfqq->ref <= 0);
2534 2632
2535 if (!atomic_dec_and_test(&cfqq->ref)) 2633 cfqq->ref--;
2634 if (cfqq->ref)
2536 return; 2635 return;
2537 2636
2538 cfq_log_cfqq(cfqd, cfqq, "put_queue"); 2637 cfq_log_cfqq(cfqd, cfqq, "put_queue");
2539 BUG_ON(rb_first(&cfqq->sort_list)); 2638 BUG_ON(rb_first(&cfqq->sort_list));
2540 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); 2639 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
2541 cfqg = cfqq->cfqg; 2640 cfqg = cfqq->cfqg;
2542 orig_cfqg = cfqq->orig_cfqg;
2543 2641
2544 if (unlikely(cfqd->active_queue == cfqq)) { 2642 if (unlikely(cfqd->active_queue == cfqq)) {
2545 __cfq_slice_expired(cfqd, cfqq, 0); 2643 __cfq_slice_expired(cfqd, cfqq, 0);
@@ -2549,33 +2647,23 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
2549 BUG_ON(cfq_cfqq_on_rr(cfqq)); 2647 BUG_ON(cfq_cfqq_on_rr(cfqq));
2550 kmem_cache_free(cfq_pool, cfqq); 2648 kmem_cache_free(cfq_pool, cfqq);
2551 cfq_put_cfqg(cfqg); 2649 cfq_put_cfqg(cfqg);
2552 if (orig_cfqg)
2553 cfq_put_cfqg(orig_cfqg);
2554} 2650}
2555 2651
2556/* 2652/*
2557 * Must always be called with the rcu_read_lock() held 2653 * Call func for each cic attached to this ioc.
2558 */ 2654 */
2559static void 2655static void
2560__call_for_each_cic(struct io_context *ioc, 2656call_for_each_cic(struct io_context *ioc,
2561 void (*func)(struct io_context *, struct cfq_io_context *)) 2657 void (*func)(struct io_context *, struct cfq_io_context *))
2562{ 2658{
2563 struct cfq_io_context *cic; 2659 struct cfq_io_context *cic;
2564 struct hlist_node *n; 2660 struct hlist_node *n;
2565 2661
2662 rcu_read_lock();
2663
2566 hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list) 2664 hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list)
2567 func(ioc, cic); 2665 func(ioc, cic);
2568}
2569 2666
2570/*
2571 * Call func for each cic attached to this ioc.
2572 */
2573static void
2574call_for_each_cic(struct io_context *ioc,
2575 void (*func)(struct io_context *, struct cfq_io_context *))
2576{
2577 rcu_read_lock();
2578 __call_for_each_cic(ioc, func);
2579 rcu_read_unlock(); 2667 rcu_read_unlock();
2580} 2668}
2581 2669
@@ -2636,7 +2724,7 @@ static void cfq_free_io_context(struct io_context *ioc)
2636 * should be ok to iterate over the known list, we will see all cic's 2724 * should be ok to iterate over the known list, we will see all cic's
2637 * since no new ones are added. 2725 * since no new ones are added.
2638 */ 2726 */
2639 __call_for_each_cic(ioc, cic_free_func); 2727 call_for_each_cic(ioc, cic_free_func);
2640} 2728}
2641 2729
2642static void cfq_put_cooperator(struct cfq_queue *cfqq) 2730static void cfq_put_cooperator(struct cfq_queue *cfqq)
@@ -2685,8 +2773,14 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
2685 smp_wmb(); 2773 smp_wmb();
2686 cic->key = cfqd_dead_key(cfqd); 2774 cic->key = cfqd_dead_key(cfqd);
2687 2775
2688 if (ioc->ioc_data == cic) 2776 rcu_read_lock();
2777 if (rcu_dereference(ioc->ioc_data) == cic) {
2778 rcu_read_unlock();
2779 spin_lock(&ioc->lock);
2689 rcu_assign_pointer(ioc->ioc_data, NULL); 2780 rcu_assign_pointer(ioc->ioc_data, NULL);
2781 spin_unlock(&ioc->lock);
2782 } else
2783 rcu_read_unlock();
2690 2784
2691 if (cic->cfqq[BLK_RW_ASYNC]) { 2785 if (cic->cfqq[BLK_RW_ASYNC]) {
2692 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); 2786 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
@@ -2835,7 +2929,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2835 RB_CLEAR_NODE(&cfqq->p_node); 2929 RB_CLEAR_NODE(&cfqq->p_node);
2836 INIT_LIST_HEAD(&cfqq->fifo); 2930 INIT_LIST_HEAD(&cfqq->fifo);
2837 2931
2838 atomic_set(&cfqq->ref, 0); 2932 cfqq->ref = 0;
2839 cfqq->cfqd = cfqd; 2933 cfqq->cfqd = cfqd;
2840 2934
2841 cfq_mark_cfqq_prio_changed(cfqq); 2935 cfq_mark_cfqq_prio_changed(cfqq);
@@ -2892,7 +2986,7 @@ cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync,
2892 struct cfq_group *cfqg; 2986 struct cfq_group *cfqg;
2893 2987
2894retry: 2988retry:
2895 cfqg = cfq_get_cfqg(cfqd, 1); 2989 cfqg = cfq_get_cfqg(cfqd);
2896 cic = cfq_cic_lookup(cfqd, ioc); 2990 cic = cfq_cic_lookup(cfqd, ioc);
2897 /* cic always exists here */ 2991 /* cic always exists here */
2898 cfqq = cic_to_cfqq(cic, is_sync); 2992 cfqq = cic_to_cfqq(cic, is_sync);
@@ -2971,11 +3065,11 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
2971 * pin the queue now that it's allocated, scheduler exit will prune it 3065 * pin the queue now that it's allocated, scheduler exit will prune it
2972 */ 3066 */
2973 if (!is_sync && !(*async_cfqq)) { 3067 if (!is_sync && !(*async_cfqq)) {
2974 atomic_inc(&cfqq->ref); 3068 cfqq->ref++;
2975 *async_cfqq = cfqq; 3069 *async_cfqq = cfqq;
2976 } 3070 }
2977 3071
2978 atomic_inc(&cfqq->ref); 3072 cfqq->ref++;
2979 return cfqq; 3073 return cfqq;
2980} 3074}
2981 3075
@@ -2993,7 +3087,8 @@ cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc,
2993 3087
2994 spin_lock_irqsave(&ioc->lock, flags); 3088 spin_lock_irqsave(&ioc->lock, flags);
2995 3089
2996 BUG_ON(ioc->ioc_data == cic); 3090 BUG_ON(rcu_dereference_check(ioc->ioc_data,
3091 lockdep_is_held(&ioc->lock)) == cic);
2997 3092
2998 radix_tree_delete(&ioc->radix_root, cfqd->cic_index); 3093 radix_tree_delete(&ioc->radix_root, cfqd->cic_index);
2999 hlist_del_rcu(&cic->cic_list); 3094 hlist_del_rcu(&cic->cic_list);
@@ -3177,7 +3272,9 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3177 if (cfqq->queued[0] + cfqq->queued[1] >= 4) 3272 if (cfqq->queued[0] + cfqq->queued[1] >= 4)
3178 cfq_mark_cfqq_deep(cfqq); 3273 cfq_mark_cfqq_deep(cfqq);
3179 3274
3180 if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || 3275 if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))
3276 enable_idle = 0;
3277 else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
3181 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) 3278 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
3182 enable_idle = 0; 3279 enable_idle = 0;
3183 else if (sample_valid(cic->ttime_samples)) { 3280 else if (sample_valid(cic->ttime_samples)) {
@@ -3255,6 +3352,10 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
3255 if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) 3352 if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
3256 return true; 3353 return true;
3257 3354
3355 /* An idle queue should not be idle now for some reason */
3356 if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))
3357 return true;
3358
3258 if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq)) 3359 if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
3259 return false; 3360 return false;
3260 3361
@@ -3274,10 +3375,19 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
3274 */ 3375 */
3275static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) 3376static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3276{ 3377{
3378 struct cfq_queue *old_cfqq = cfqd->active_queue;
3379
3277 cfq_log_cfqq(cfqd, cfqq, "preempt"); 3380 cfq_log_cfqq(cfqd, cfqq, "preempt");
3278 cfq_slice_expired(cfqd, 1); 3381 cfq_slice_expired(cfqd, 1);
3279 3382
3280 /* 3383 /*
3384 * workload type is changed, don't save slice, otherwise preempt
3385 * doesn't happen
3386 */
3387 if (cfqq_type(old_cfqq) != cfqq_type(cfqq))
3388 cfqq->cfqg->saved_workload_slice = 0;
3389
3390 /*
3281 * Put the new queue at the front of the of the current list, 3391 * Put the new queue at the front of the of the current list,
3282 * so we know that it will be selected next. 3392 * so we know that it will be selected next.
3283 */ 3393 */
@@ -3402,6 +3512,10 @@ static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3402{ 3512{
3403 struct cfq_io_context *cic = cfqd->active_cic; 3513 struct cfq_io_context *cic = cfqd->active_cic;
3404 3514
3515 /* If the queue already has requests, don't wait */
3516 if (!RB_EMPTY_ROOT(&cfqq->sort_list))
3517 return false;
3518
3405 /* If there are other queues in the group, don't wait */ 3519 /* If there are other queues in the group, don't wait */
3406 if (cfqq->cfqg->nr_cfqq > 1) 3520 if (cfqq->cfqg->nr_cfqq > 1)
3407 return false; 3521 return false;
@@ -3494,17 +3608,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
3494 cfq_slice_expired(cfqd, 1); 3608 cfq_slice_expired(cfqd, 1);
3495 else if (sync && cfqq_empty && 3609 else if (sync && cfqq_empty &&
3496 !cfq_close_cooperator(cfqd, cfqq)) { 3610 !cfq_close_cooperator(cfqd, cfqq)) {
3497 cfqd->noidle_tree_requires_idle |= 3611 cfq_arm_slice_timer(cfqd);
3498 !(rq->cmd_flags & REQ_NOIDLE);
3499 /*
3500 * Idling is enabled for SYNC_WORKLOAD.
3501 * SYNC_NOIDLE_WORKLOAD idles at the end of the tree
3502 * only if we processed at least one !REQ_NOIDLE request
3503 */
3504 if (cfqd->serving_type == SYNC_WORKLOAD
3505 || cfqd->noidle_tree_requires_idle
3506 || cfqq->cfqg->nr_cfqq == 1)
3507 cfq_arm_slice_timer(cfqd);
3508 } 3612 }
3509 } 3613 }
3510 3614
@@ -3589,12 +3693,12 @@ static void cfq_put_request(struct request *rq)
3589 3693
3590 put_io_context(RQ_CIC(rq)->ioc); 3694 put_io_context(RQ_CIC(rq)->ioc);
3591 3695
3592 rq->elevator_private = NULL; 3696 rq->elevator_private[0] = NULL;
3593 rq->elevator_private2 = NULL; 3697 rq->elevator_private[1] = NULL;
3594 3698
3595 /* Put down rq reference on cfqg */ 3699 /* Put down rq reference on cfqg */
3596 cfq_put_cfqg(RQ_CFQG(rq)); 3700 cfq_put_cfqg(RQ_CFQG(rq));
3597 rq->elevator_private3 = NULL; 3701 rq->elevator_private[2] = NULL;
3598 3702
3599 cfq_put_queue(cfqq); 3703 cfq_put_queue(cfqq);
3600 } 3704 }
@@ -3681,19 +3785,15 @@ new_queue:
3681 } 3785 }
3682 3786
3683 cfqq->allocated[rw]++; 3787 cfqq->allocated[rw]++;
3684 atomic_inc(&cfqq->ref);
3685 3788
3789 cfqq->ref++;
3790 rq->elevator_private[0] = cic;
3791 rq->elevator_private[1] = cfqq;
3792 rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg);
3686 spin_unlock_irqrestore(q->queue_lock, flags); 3793 spin_unlock_irqrestore(q->queue_lock, flags);
3687
3688 rq->elevator_private = cic;
3689 rq->elevator_private2 = cfqq;
3690 rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg);
3691 return 0; 3794 return 0;
3692 3795
3693queue_fail: 3796queue_fail:
3694 if (cic)
3695 put_io_context(cic->ioc);
3696
3697 cfq_schedule_dispatch(cfqd); 3797 cfq_schedule_dispatch(cfqd);
3698 spin_unlock_irqrestore(q->queue_lock, flags); 3798 spin_unlock_irqrestore(q->queue_lock, flags);
3699 cfq_log(cfqd, "set_request fail"); 3799 cfq_log(cfqd, "set_request fail");
@@ -3788,15 +3888,11 @@ static void cfq_put_async_queues(struct cfq_data *cfqd)
3788 cfq_put_queue(cfqd->async_idle_cfqq); 3888 cfq_put_queue(cfqd->async_idle_cfqq);
3789} 3889}
3790 3890
3791static void cfq_cfqd_free(struct rcu_head *head)
3792{
3793 kfree(container_of(head, struct cfq_data, rcu));
3794}
3795
3796static void cfq_exit_queue(struct elevator_queue *e) 3891static void cfq_exit_queue(struct elevator_queue *e)
3797{ 3892{
3798 struct cfq_data *cfqd = e->elevator_data; 3893 struct cfq_data *cfqd = e->elevator_data;
3799 struct request_queue *q = cfqd->queue; 3894 struct request_queue *q = cfqd->queue;
3895 bool wait = false;
3800 3896
3801 cfq_shutdown_timer_wq(cfqd); 3897 cfq_shutdown_timer_wq(cfqd);
3802 3898
@@ -3815,7 +3911,13 @@ static void cfq_exit_queue(struct elevator_queue *e)
3815 3911
3816 cfq_put_async_queues(cfqd); 3912 cfq_put_async_queues(cfqd);
3817 cfq_release_cfq_groups(cfqd); 3913 cfq_release_cfq_groups(cfqd);
3818 cfq_blkiocg_del_blkio_group(&cfqd->root_group.blkg); 3914
3915 /*
3916 * If there are groups which we could not unlink from blkcg list,
3917 * wait for a rcu period for them to be freed.
3918 */
3919 if (cfqd->nr_blkcg_linked_grps)
3920 wait = true;
3819 3921
3820 spin_unlock_irq(q->queue_lock); 3922 spin_unlock_irq(q->queue_lock);
3821 3923
@@ -3825,8 +3927,25 @@ static void cfq_exit_queue(struct elevator_queue *e)
3825 ida_remove(&cic_index_ida, cfqd->cic_index); 3927 ida_remove(&cic_index_ida, cfqd->cic_index);
3826 spin_unlock(&cic_index_lock); 3928 spin_unlock(&cic_index_lock);
3827 3929
3828 /* Wait for cfqg->blkg->key accessors to exit their grace periods. */ 3930 /*
3829 call_rcu(&cfqd->rcu, cfq_cfqd_free); 3931 * Wait for cfqg->blkg->key accessors to exit their grace periods.
3932 * Do this wait only if there are other unlinked groups out
3933 * there. This can happen if cgroup deletion path claimed the
3934 * responsibility of cleaning up a group before queue cleanup code
3935 * get to the group.
3936 *
3937 * Do not call synchronize_rcu() unconditionally as there are drivers
3938 * which create/delete request queue hundreds of times during scan/boot
3939 * and synchronize_rcu() can take significant time and slow down boot.
3940 */
3941 if (wait)
3942 synchronize_rcu();
3943
3944#ifdef CONFIG_CFQ_GROUP_IOSCHED
3945 /* Free up per cpu stats for root group */
3946 free_percpu(cfqd->root_group.blkg.stats_cpu);
3947#endif
3948 kfree(cfqd);
3830} 3949}
3831 3950
3832static int cfq_alloc_cic_index(void) 3951static int cfq_alloc_cic_index(void)
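The exit path above calls synchronize_rcu() only when some group was actually left linked on the blkcg side, because an unconditional grace-period wait on every queue teardown measurably slows device scans and boot. The sketch below keeps only the shape of that decision; expensive_grace_wait() is a named stand-in, not a real API:

    #include <stdbool.h>
    #include <stdio.h>

    static unsigned int nr_linked_grps;      /* bumped whenever a group is published */

    static void expensive_grace_wait(void)   /* stand-in for synchronize_rcu() */
    {
        puts("waiting for readers...");
    }

    static void exit_queue(void)
    {
        bool wait = false;

        /* ... unlink groups, drop references ... */
        if (nr_linked_grps)                  /* only pay for the wait when needed */
            wait = true;

        if (wait)
            expensive_grace_wait();
        puts("queue freed");
    }

    int main(void)
    {
        exit_queue();                        /* no groups: no wait */
        nr_linked_grps = 1;
        exit_queue();                        /* with groups: waits first */
        return 0;
    }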
@@ -3859,9 +3978,17 @@ static void *cfq_init_queue(struct request_queue *q)
3859 return NULL; 3978 return NULL;
3860 3979
3861 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); 3980 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
3862 if (!cfqd) 3981 if (!cfqd) {
3982 spin_lock(&cic_index_lock);
3983 ida_remove(&cic_index_ida, i);
3984 spin_unlock(&cic_index_lock);
3863 return NULL; 3985 return NULL;
3986 }
3864 3987
3988 /*
3989 * Don't need take queue_lock in the routine, since we are
3990 * initializing the ioscheduler, and nobody is using cfqd
3991 */
3865 cfqd->cic_index = i; 3992 cfqd->cic_index = i;
3866 3993
3867 /* Init root service tree */ 3994 /* Init root service tree */
@@ -3878,14 +4005,29 @@ static void *cfq_init_queue(struct request_queue *q)
3878 4005
3879#ifdef CONFIG_CFQ_GROUP_IOSCHED 4006#ifdef CONFIG_CFQ_GROUP_IOSCHED
3880 /* 4007 /*
3881 * Take a reference to root group which we never drop. This is just 4008 * Set root group reference to 2. One reference will be dropped when
3882 * to make sure that cfq_put_cfqg() does not try to kfree root group 4009 * all groups on cfqd->cfqg_list are being deleted during queue exit.
4010 * Other reference will remain there as we don't want to delete this
4011 * group as it is statically allocated and gets destroyed when
4012 * throtl_data goes away.
3883 */ 4013 */
3884 atomic_set(&cfqg->ref, 1); 4014 cfqg->ref = 2;
4015
4016 if (blkio_alloc_blkg_stats(&cfqg->blkg)) {
4017 kfree(cfqg);
4018 kfree(cfqd);
4019 return NULL;
4020 }
4021
3885 rcu_read_lock(); 4022 rcu_read_lock();
4023
3886 cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, 4024 cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg,
3887 (void *)cfqd, 0); 4025 (void *)cfqd, 0);
3888 rcu_read_unlock(); 4026 rcu_read_unlock();
4027 cfqd->nr_blkcg_linked_grps++;
4028
4029 /* Add group on cfqd->cfqg_list */
4030 hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
3889#endif 4031#endif
3890 /* 4032 /*
3891 * Not strictly needed (since RB_ROOT just clears the node and we 4033 * Not strictly needed (since RB_ROOT just clears the node and we
@@ -3901,7 +4043,7 @@ static void *cfq_init_queue(struct request_queue *q)
3901 * will not attempt to free it. 4043 * will not attempt to free it.
3902 */ 4044 */
3903 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); 4045 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
3904 atomic_inc(&cfqd->oom_cfqq.ref); 4046 cfqd->oom_cfqq.ref++;
3905 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group); 4047 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
3906 4048
3907 INIT_LIST_HEAD(&cfqd->cic_list); 4049 INIT_LIST_HEAD(&cfqd->cic_list);
@@ -3925,7 +4067,6 @@ static void *cfq_init_queue(struct request_queue *q)
3925 cfqd->cfq_slice_idle = cfq_slice_idle; 4067 cfqd->cfq_slice_idle = cfq_slice_idle;
3926 cfqd->cfq_group_idle = cfq_group_idle; 4068 cfqd->cfq_group_idle = cfq_group_idle;
3927 cfqd->cfq_latency = 1; 4069 cfqd->cfq_latency = 1;
3928 cfqd->cfq_group_isolation = 0;
3929 cfqd->hw_tag = -1; 4070 cfqd->hw_tag = -1;
3930 /* 4071 /*
3931 * we optimistically start assuming sync ops weren't delayed in last 4072 * we optimistically start assuming sync ops weren't delayed in last
@@ -4001,7 +4142,6 @@ SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
4001SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); 4142SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
4002SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); 4143SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
4003SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); 4144SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
4004SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0);
4005#undef SHOW_FUNCTION 4145#undef SHOW_FUNCTION
4006 4146
4007#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ 4147#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
@@ -4035,7 +4175,6 @@ STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
4035STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, 4175STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
4036 UINT_MAX, 0); 4176 UINT_MAX, 0);
4037STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); 4177STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
4038STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0);
4039#undef STORE_FUNCTION 4178#undef STORE_FUNCTION
4040 4179
4041#define CFQ_ATTR(name) \ 4180#define CFQ_ATTR(name) \
@@ -4053,7 +4192,6 @@ static struct elv_fs_entry cfq_attrs[] = {
4053 CFQ_ATTR(slice_idle), 4192 CFQ_ATTR(slice_idle),
4054 CFQ_ATTR(group_idle), 4193 CFQ_ATTR(group_idle),
4055 CFQ_ATTR(low_latency), 4194 CFQ_ATTR(low_latency),
4056 CFQ_ATTR(group_isolation),
4057 __ATTR_NULL 4195 __ATTR_NULL
4058}; 4196};
4059 4197
@@ -4068,7 +4206,6 @@ static struct elevator_type iosched_cfq = {
4068 .elevator_add_req_fn = cfq_insert_request, 4206 .elevator_add_req_fn = cfq_insert_request,
4069 .elevator_activate_req_fn = cfq_activate_request, 4207 .elevator_activate_req_fn = cfq_activate_request,
4070 .elevator_deactivate_req_fn = cfq_deactivate_request, 4208 .elevator_deactivate_req_fn = cfq_deactivate_request,
4071 .elevator_queue_empty_fn = cfq_queue_empty,
4072 .elevator_completed_req_fn = cfq_completed_request, 4209 .elevator_completed_req_fn = cfq_completed_request,
4073 .elevator_former_req_fn = elv_rb_former_request, 4210 .elevator_former_req_fn = elv_rb_former_request,
4074 .elevator_latter_req_fn = elv_rb_latter_request, 4211 .elevator_latter_req_fn = elv_rb_latter_request,
@@ -4090,6 +4227,7 @@ static struct blkio_policy_type blkio_policy_cfq = {
4090 .blkio_unlink_group_fn = cfq_unlink_blkio_group, 4227 .blkio_unlink_group_fn = cfq_unlink_blkio_group,
4091 .blkio_update_group_weight_fn = cfq_update_blkio_group_weight, 4228 .blkio_update_group_weight_fn = cfq_update_blkio_group_weight,
4092 }, 4229 },
4230 .plid = BLKIO_POLICY_PROP,
4093}; 4231};
4094#else 4232#else
4095static struct blkio_policy_type blkio_policy_cfq; 4233static struct blkio_policy_type blkio_policy_cfq;
diff --git a/block/cfq.h b/block/cfq.h
index 93448e5a2e41..2a155927e37c 100644
--- a/block/cfq.h
+++ b/block/cfq.h
@@ -16,9 +16,9 @@ static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg,
16} 16}
17 17
18static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, 18static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg,
19 unsigned long time) 19 unsigned long time, unsigned long unaccounted_time)
20{ 20{
21 blkiocg_update_timeslice_used(blkg, time); 21 blkiocg_update_timeslice_used(blkg, time, unaccounted_time);
22} 22}
23 23
24static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) 24static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg)
@@ -69,7 +69,7 @@ static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg,
69 69
70static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 70static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
71 struct blkio_group *blkg, void *key, dev_t dev) { 71 struct blkio_group *blkg, void *key, dev_t dev) {
72 blkiocg_add_blkio_group(blkcg, blkg, key, dev); 72 blkiocg_add_blkio_group(blkcg, blkg, key, dev, BLKIO_POLICY_PROP);
73} 73}
74 74
75static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg) 75static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg)
@@ -85,7 +85,7 @@ static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg,
85 unsigned long dequeue) {} 85 unsigned long dequeue) {}
86 86
87static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, 87static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg,
88 unsigned long time) {} 88 unsigned long time, unsigned long unaccounted_time) {}
89static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) {} 89static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) {}
90static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg, 90static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg,
91 bool direction, bool sync) {} 91 bool direction, bool sync) {}
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index 119f07b74dc0..cc3eb78e333a 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -8,7 +8,6 @@
8#include <linux/hdreg.h> 8#include <linux/hdreg.h>
9#include <linux/slab.h> 9#include <linux/slab.h>
10#include <linux/syscalls.h> 10#include <linux/syscalls.h>
11#include <linux/smp_lock.h>
12#include <linux/types.h> 11#include <linux/types.h>
13#include <linux/uaccess.h> 12#include <linux/uaccess.h>
14 13
@@ -744,13 +743,13 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
744 bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE; 743 bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE;
745 return 0; 744 return 0;
746 case BLKGETSIZE: 745 case BLKGETSIZE:
747 size = bdev->bd_inode->i_size; 746 size = i_size_read(bdev->bd_inode);
748 if ((size >> 9) > ~0UL) 747 if ((size >> 9) > ~0UL)
749 return -EFBIG; 748 return -EFBIG;
750 return compat_put_ulong(arg, size >> 9); 749 return compat_put_ulong(arg, size >> 9);
751 750
752 case BLKGETSIZE64_32: 751 case BLKGETSIZE64_32:
753 return compat_put_u64(arg, bdev->bd_inode->i_size); 752 return compat_put_u64(arg, i_size_read(bdev->bd_inode));
754 753
755 case BLKTRACESETUP32: 754 case BLKTRACESETUP32:
756 case BLKTRACESTART: /* compatible */ 755 case BLKTRACESTART: /* compatible */
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index b547cbca7b23..5139c0ea1864 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -326,14 +326,6 @@ dispatch_request:
326 return 1; 326 return 1;
327} 327}
328 328
329static int deadline_queue_empty(struct request_queue *q)
330{
331 struct deadline_data *dd = q->elevator->elevator_data;
332
333 return list_empty(&dd->fifo_list[WRITE])
334 && list_empty(&dd->fifo_list[READ]);
335}
336
337static void deadline_exit_queue(struct elevator_queue *e) 329static void deadline_exit_queue(struct elevator_queue *e)
338{ 330{
339 struct deadline_data *dd = e->elevator_data; 331 struct deadline_data *dd = e->elevator_data;
@@ -445,7 +437,6 @@ static struct elevator_type iosched_deadline = {
445 .elevator_merge_req_fn = deadline_merged_requests, 437 .elevator_merge_req_fn = deadline_merged_requests,
446 .elevator_dispatch_fn = deadline_dispatch_requests, 438 .elevator_dispatch_fn = deadline_dispatch_requests,
447 .elevator_add_req_fn = deadline_add_request, 439 .elevator_add_req_fn = deadline_add_request,
448 .elevator_queue_empty_fn = deadline_queue_empty,
449 .elevator_former_req_fn = elv_rb_former_request, 440 .elevator_former_req_fn = elv_rb_former_request,
450 .elevator_latter_req_fn = elv_rb_latter_request, 441 .elevator_latter_req_fn = elv_rb_latter_request,
451 .elevator_init_fn = deadline_init_queue, 442 .elevator_init_fn = deadline_init_queue,
diff --git a/block/elevator.c b/block/elevator.c
index 4e11559aa2b0..b0b38ce0dcb6 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -113,7 +113,7 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio)
113} 113}
114EXPORT_SYMBOL(elv_rq_merge_ok); 114EXPORT_SYMBOL(elv_rq_merge_ok);
115 115
116static inline int elv_try_merge(struct request *__rq, struct bio *bio) 116int elv_try_merge(struct request *__rq, struct bio *bio)
117{ 117{
118 int ret = ELEVATOR_NO_MERGE; 118 int ret = ELEVATOR_NO_MERGE;
119 119
@@ -155,13 +155,8 @@ static struct elevator_type *elevator_get(const char *name)
155 155
156 e = elevator_find(name); 156 e = elevator_find(name);
157 if (!e) { 157 if (!e) {
158 char elv[ELV_NAME_MAX + strlen("-iosched")];
159
160 spin_unlock(&elv_list_lock); 158 spin_unlock(&elv_list_lock);
161 159 request_module("%s-iosched", name);
162 snprintf(elv, sizeof(elv), "%s-iosched", name);
163
164 request_module("%s", elv);
165 spin_lock(&elv_list_lock); 160 spin_lock(&elv_list_lock);
166 e = elevator_find(name); 161 e = elevator_find(name);
167 } 162 }
@@ -429,7 +424,7 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq)
429 q->nr_sorted--; 424 q->nr_sorted--;
430 425
431 boundary = q->end_sector; 426 boundary = q->end_sector;
432 stop_flags = REQ_SOFTBARRIER | REQ_HARDBARRIER | REQ_STARTED; 427 stop_flags = REQ_SOFTBARRIER | REQ_STARTED;
433 list_for_each_prev(entry, &q->queue_head) { 428 list_for_each_prev(entry, &q->queue_head) {
434 struct request *pos = list_entry_rq(entry); 429 struct request *pos = list_entry_rq(entry);
435 430
@@ -519,6 +514,40 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
519 return ELEVATOR_NO_MERGE; 514 return ELEVATOR_NO_MERGE;
520} 515}
521 516
517/*
518 * Attempt to do an insertion back merge. Only check for the case where
519 * we can append 'rq' to an existing request, so we can throw 'rq' away
520 * afterwards.
521 *
522 * Returns true if we merged, false otherwise
523 */
524static bool elv_attempt_insert_merge(struct request_queue *q,
525 struct request *rq)
526{
527 struct request *__rq;
528
529 if (blk_queue_nomerges(q))
530 return false;
531
532 /*
533 * First try one-hit cache.
534 */
535 if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq))
536 return true;
537
538 if (blk_queue_noxmerges(q))
539 return false;
540
541 /*
542 * See if our hash lookup can find a potential backmerge.
543 */
544 __rq = elv_rqhash_find(q, blk_rq_pos(rq));
545 if (__rq && blk_attempt_req_merge(q, __rq, rq))
546 return true;
547
548 return false;
549}
550
522void elv_merged_request(struct request_queue *q, struct request *rq, int type) 551void elv_merged_request(struct request_queue *q, struct request *rq, int type)
523{ 552{
524 struct elevator_queue *e = q->elevator; 553 struct elevator_queue *e = q->elevator;
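The new elv_attempt_insert_merge() tries the cheap one-hit cache (q->last_merge) before falling back to the hash lookup keyed on the request's end position, and bails out early when merging is disabled entirely or extended merges are switched off for the hash step. A userspace sketch of that two-tier lookup, with the hash reduced to a single slot and merge success simulated by sector adjacency (all names here are illustrative):

    #include <stdbool.h>
    #include <stdio.h>

    struct rq { unsigned long pos, len; };

    static struct rq *last_merge;                 /* one-hit cache */
    static struct rq *hash_slot;                  /* toy stand-in for elv_rqhash_find() */

    static bool attempt_back_merge(struct rq *back, struct rq *front)
    {
        if (!back || back->pos + back->len != front->pos)
            return false;
        back->len += front->len;                  /* append 'front' onto 'back' */
        return true;
    }

    static bool attempt_insert_merge(struct rq *rq, bool nomerges, bool noxmerges)
    {
        if (nomerges)
            return false;
        if (attempt_back_merge(last_merge, rq))   /* 1) one-hit cache */
            return true;
        if (noxmerges)
            return false;
        return attempt_back_merge(hash_slot, rq); /* 2) hash lookup by end position */
    }

    int main(void)
    {
        struct rq a = { 0, 8 }, b = { 8, 8 };
        last_merge = &a;
        printf("merged=%d len=%lu\n", attempt_insert_merge(&b, false, false), a.len);
        return 0;
    }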
@@ -536,14 +565,18 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
536 struct request *next) 565 struct request *next)
537{ 566{
538 struct elevator_queue *e = q->elevator; 567 struct elevator_queue *e = q->elevator;
568 const int next_sorted = next->cmd_flags & REQ_SORTED;
539 569
540 if (e->ops->elevator_merge_req_fn) 570 if (next_sorted && e->ops->elevator_merge_req_fn)
541 e->ops->elevator_merge_req_fn(q, rq, next); 571 e->ops->elevator_merge_req_fn(q, rq, next);
542 572
543 elv_rqhash_reposition(q, rq); 573 elv_rqhash_reposition(q, rq);
544 elv_rqhash_del(q, next);
545 574
546 q->nr_sorted--; 575 if (next_sorted) {
576 elv_rqhash_del(q, next);
577 q->nr_sorted--;
578 }
579
547 q->last_merge = rq; 580 q->last_merge = rq;
548} 581}
549 582
@@ -570,7 +603,7 @@ void elv_requeue_request(struct request_queue *q, struct request *rq)
570 603
571 rq->cmd_flags &= ~REQ_STARTED; 604 rq->cmd_flags &= ~REQ_STARTED;
572 605
573 elv_insert(q, rq, ELEVATOR_INSERT_REQUEUE); 606 __elv_add_request(q, rq, ELEVATOR_INSERT_REQUEUE);
574} 607}
575 608
576void elv_drain_elevator(struct request_queue *q) 609void elv_drain_elevator(struct request_queue *q)
@@ -615,20 +648,28 @@ void elv_quiesce_end(struct request_queue *q)
615 queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); 648 queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q);
616} 649}
617 650
618void elv_insert(struct request_queue *q, struct request *rq, int where) 651void __elv_add_request(struct request_queue *q, struct request *rq, int where)
619{ 652{
620 struct list_head *pos;
621 unsigned ordseq;
622 int unplug_it = 1;
623
624 trace_block_rq_insert(q, rq); 653 trace_block_rq_insert(q, rq);
625 654
626 rq->q = q; 655 rq->q = q;
627 656
657 if (rq->cmd_flags & REQ_SOFTBARRIER) {
658 /* barriers are scheduling boundary, update end_sector */
659 if (rq->cmd_type == REQ_TYPE_FS ||
660 (rq->cmd_flags & REQ_DISCARD)) {
661 q->end_sector = rq_end_sector(rq);
662 q->boundary_rq = rq;
663 }
664 } else if (!(rq->cmd_flags & REQ_ELVPRIV) &&
665 (where == ELEVATOR_INSERT_SORT ||
666 where == ELEVATOR_INSERT_SORT_MERGE))
667 where = ELEVATOR_INSERT_BACK;
668
628 switch (where) { 669 switch (where) {
670 case ELEVATOR_INSERT_REQUEUE:
629 case ELEVATOR_INSERT_FRONT: 671 case ELEVATOR_INSERT_FRONT:
630 rq->cmd_flags |= REQ_SOFTBARRIER; 672 rq->cmd_flags |= REQ_SOFTBARRIER;
631
632 list_add(&rq->queuelist, &q->queue_head); 673 list_add(&rq->queuelist, &q->queue_head);
633 break; 674 break;
634 675
@@ -649,6 +690,14 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
649 __blk_run_queue(q); 690 __blk_run_queue(q);
650 break; 691 break;
651 692
693 case ELEVATOR_INSERT_SORT_MERGE:
694 /*
695 * If we succeed in merging this request with one in the
696 * queue already, we are done - rq has now been freed,
697 * so no need to do anything further.
698 */
699 if (elv_attempt_insert_merge(q, rq))
700 break;
652 case ELEVATOR_INSERT_SORT: 701 case ELEVATOR_INSERT_SORT:
653 BUG_ON(rq->cmd_type != REQ_TYPE_FS && 702 BUG_ON(rq->cmd_type != REQ_TYPE_FS &&
654 !(rq->cmd_flags & REQ_DISCARD)); 703 !(rq->cmd_flags & REQ_DISCARD));
@@ -668,115 +717,28 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
668 q->elevator->ops->elevator_add_req_fn(q, rq); 717 q->elevator->ops->elevator_add_req_fn(q, rq);
669 break; 718 break;
670 719
671 case ELEVATOR_INSERT_REQUEUE: 720 case ELEVATOR_INSERT_FLUSH:
672 /*
673 * If ordered flush isn't in progress, we do front
674 * insertion; otherwise, requests should be requeued
675 * in ordseq order.
676 */
677 rq->cmd_flags |= REQ_SOFTBARRIER; 721 rq->cmd_flags |= REQ_SOFTBARRIER;
678 722 blk_insert_flush(rq);
679 /*
680 * Most requeues happen because of a busy condition,
681 * don't force unplug of the queue for that case.
682 */
683 unplug_it = 0;
684
685 if (q->ordseq == 0) {
686 list_add(&rq->queuelist, &q->queue_head);
687 break;
688 }
689
690 ordseq = blk_ordered_req_seq(rq);
691
692 list_for_each(pos, &q->queue_head) {
693 struct request *pos_rq = list_entry_rq(pos);
694 if (ordseq <= blk_ordered_req_seq(pos_rq))
695 break;
696 }
697
698 list_add_tail(&rq->queuelist, pos);
699 break; 723 break;
700
701 default: 724 default:
702 printk(KERN_ERR "%s: bad insertion point %d\n", 725 printk(KERN_ERR "%s: bad insertion point %d\n",
703 __func__, where); 726 __func__, where);
704 BUG(); 727 BUG();
705 } 728 }
706
707 if (unplug_it && blk_queue_plugged(q)) {
708 int nrq = q->rq.count[BLK_RW_SYNC] + q->rq.count[BLK_RW_ASYNC]
709 - queue_in_flight(q);
710
711 if (nrq >= q->unplug_thresh)
712 __generic_unplug_device(q);
713 }
714}
715
716void __elv_add_request(struct request_queue *q, struct request *rq, int where,
717 int plug)
718{
719 if (q->ordcolor)
720 rq->cmd_flags |= REQ_ORDERED_COLOR;
721
722 if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) {
723 /*
724 * toggle ordered color
725 */
726 if (rq->cmd_flags & REQ_HARDBARRIER)
727 q->ordcolor ^= 1;
728
729 /*
730 * barriers implicitly indicate back insertion
731 */
732 if (where == ELEVATOR_INSERT_SORT)
733 where = ELEVATOR_INSERT_BACK;
734
735 /*
736 * this request is scheduling boundary, update
737 * end_sector
738 */
739 if (rq->cmd_type == REQ_TYPE_FS ||
740 (rq->cmd_flags & REQ_DISCARD)) {
741 q->end_sector = rq_end_sector(rq);
742 q->boundary_rq = rq;
743 }
744 } else if (!(rq->cmd_flags & REQ_ELVPRIV) &&
745 where == ELEVATOR_INSERT_SORT)
746 where = ELEVATOR_INSERT_BACK;
747
748 if (plug)
749 blk_plug_device(q);
750
751 elv_insert(q, rq, where);
752} 729}
753EXPORT_SYMBOL(__elv_add_request); 730EXPORT_SYMBOL(__elv_add_request);
754 731
755void elv_add_request(struct request_queue *q, struct request *rq, int where, 732void elv_add_request(struct request_queue *q, struct request *rq, int where)
756 int plug)
757{ 733{
758 unsigned long flags; 734 unsigned long flags;
759 735
760 spin_lock_irqsave(q->queue_lock, flags); 736 spin_lock_irqsave(q->queue_lock, flags);
761 __elv_add_request(q, rq, where, plug); 737 __elv_add_request(q, rq, where);
762 spin_unlock_irqrestore(q->queue_lock, flags); 738 spin_unlock_irqrestore(q->queue_lock, flags);
763} 739}
764EXPORT_SYMBOL(elv_add_request); 740EXPORT_SYMBOL(elv_add_request);
765 741
766int elv_queue_empty(struct request_queue *q)
767{
768 struct elevator_queue *e = q->elevator;
769
770 if (!list_empty(&q->queue_head))
771 return 0;
772
773 if (e->ops->elevator_queue_empty_fn)
774 return e->ops->elevator_queue_empty_fn(q);
775
776 return 1;
777}
778EXPORT_SYMBOL(elv_queue_empty);
779
780struct request *elv_latter_request(struct request_queue *q, struct request *rq) 742struct request *elv_latter_request(struct request_queue *q, struct request *rq)
781{ 743{
782 struct elevator_queue *e = q->elevator; 744 struct elevator_queue *e = q->elevator;
@@ -802,7 +764,7 @@ int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
802 if (e->ops->elevator_set_req_fn) 764 if (e->ops->elevator_set_req_fn)
803 return e->ops->elevator_set_req_fn(q, rq, gfp_mask); 765 return e->ops->elevator_set_req_fn(q, rq, gfp_mask);
804 766
805 rq->elevator_private = NULL; 767 rq->elevator_private[0] = NULL;
806 return 0; 768 return 0;
807} 769}
808 770
@@ -828,6 +790,8 @@ void elv_abort_queue(struct request_queue *q)
828{ 790{
829 struct request *rq; 791 struct request *rq;
830 792
793 blk_abort_flushes(q);
794
831 while (!list_empty(&q->queue_head)) { 795 while (!list_empty(&q->queue_head)) {
832 rq = list_entry_rq(q->queue_head.next); 796 rq = list_entry_rq(q->queue_head.next);
833 rq->cmd_flags |= REQ_QUIET; 797 rq->cmd_flags |= REQ_QUIET;
@@ -855,24 +819,6 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
855 e->ops->elevator_completed_req_fn) 819 e->ops->elevator_completed_req_fn)
856 e->ops->elevator_completed_req_fn(q, rq); 820 e->ops->elevator_completed_req_fn(q, rq);
857 } 821 }
858
859 /*
860 * Check if the queue is waiting for fs requests to be
861 * drained for flush sequence.
862 */
863 if (unlikely(q->ordseq)) {
864 struct request *next = NULL;
865
866 if (!list_empty(&q->queue_head))
867 next = list_entry_rq(q->queue_head.next);
868
869 if (!queue_in_flight(q) &&
870 blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN &&
871 (!next || blk_ordered_req_seq(next) > QUEUE_ORDSEQ_DRAIN)) {
872 blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0);
873 __blk_run_queue(q);
874 }
875 }
876} 822}
877 823
878#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr) 824#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)
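With elv_insert() folded into __elv_add_request() and the plug argument gone, callers now pass the insertion point directly and either hold the queue lock themselves or go through elv_add_request(), which takes it. A minimal sketch under those assumptions; my_requeue() is a hypothetical helper that simply mirrors elv_requeue_request() from the hunk above rather than adding anything new:

#include <linux/blkdev.h>
#include <linux/elevator.h>

/* Illustrative only: requeue a request with the reworked API -- the
 * insertion point is passed directly and there is no plug flag. */
static void my_requeue(struct request_queue *q, struct request *rq)
{
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	rq->cmd_flags &= ~REQ_STARTED;
	__elv_add_request(q, rq, ELEVATOR_INSERT_REQUEUE);
	spin_unlock_irqrestore(q->queue_lock, flags);
}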
diff --git a/block/genhd.c b/block/genhd.c
index 59a2db6fecef..3608289c8ecd 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -18,13 +18,12 @@
18#include <linux/buffer_head.h> 18#include <linux/buffer_head.h>
19#include <linux/mutex.h> 19#include <linux/mutex.h>
20#include <linux/idr.h> 20#include <linux/idr.h>
21#include <linux/log2.h>
21 22
22#include "blk.h" 23#include "blk.h"
23 24
24static DEFINE_MUTEX(block_class_lock); 25static DEFINE_MUTEX(block_class_lock);
25#ifndef CONFIG_SYSFS_DEPRECATED
26struct kobject *block_depr; 26struct kobject *block_depr;
27#endif
28 27
29/* for extended dynamic devt allocation, currently only one major is used */ 28/* for extended dynamic devt allocation, currently only one major is used */
30#define MAX_EXT_DEVT (1 << MINORBITS) 29#define MAX_EXT_DEVT (1 << MINORBITS)
@@ -37,6 +36,10 @@ static DEFINE_IDR(ext_devt_idr);
37 36
38static struct device_type disk_type; 37static struct device_type disk_type;
39 38
39static void disk_add_events(struct gendisk *disk);
40static void disk_del_events(struct gendisk *disk);
41static void disk_release_events(struct gendisk *disk);
42
40/** 43/**
41 * disk_get_part - get partition 44 * disk_get_part - get partition
42 * @disk: disk to look partition from 45 * @disk: disk to look partition from
@@ -241,7 +244,7 @@ static struct blk_major_name {
241} *major_names[BLKDEV_MAJOR_HASH_SIZE]; 244} *major_names[BLKDEV_MAJOR_HASH_SIZE];
242 245
243/* index in the above - for now: assume no multimajor ranges */ 246/* index in the above - for now: assume no multimajor ranges */
244static inline int major_to_index(int major) 247static inline int major_to_index(unsigned major)
245{ 248{
246 return major % BLKDEV_MAJOR_HASH_SIZE; 249 return major % BLKDEV_MAJOR_HASH_SIZE;
247} 250}
@@ -504,6 +507,64 @@ static int exact_lock(dev_t devt, void *data)
504 return 0; 507 return 0;
505} 508}
506 509
510void register_disk(struct gendisk *disk)
511{
512 struct device *ddev = disk_to_dev(disk);
513 struct block_device *bdev;
514 struct disk_part_iter piter;
515 struct hd_struct *part;
516 int err;
517
518 ddev->parent = disk->driverfs_dev;
519
520 dev_set_name(ddev, disk->disk_name);
521
522 /* delay uevents, until we scanned partition table */
523 dev_set_uevent_suppress(ddev, 1);
524
525 if (device_add(ddev))
526 return;
527 if (!sysfs_deprecated) {
528 err = sysfs_create_link(block_depr, &ddev->kobj,
529 kobject_name(&ddev->kobj));
530 if (err) {
531 device_del(ddev);
532 return;
533 }
534 }
535 disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
536 disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
537
538 /* No minors to use for partitions */
539 if (!disk_partitionable(disk))
540 goto exit;
541
542 /* No such device (e.g., media were just removed) */
543 if (!get_capacity(disk))
544 goto exit;
545
546 bdev = bdget_disk(disk, 0);
547 if (!bdev)
548 goto exit;
549
550 bdev->bd_invalidated = 1;
551 err = blkdev_get(bdev, FMODE_READ, NULL);
552 if (err < 0)
553 goto exit;
554 blkdev_put(bdev, FMODE_READ);
555
556exit:
557 /* announce disk after possible partitions are created */
558 dev_set_uevent_suppress(ddev, 0);
559 kobject_uevent(&ddev->kobj, KOBJ_ADD);
560
561 /* announce possible partitions */
562 disk_part_iter_init(&piter, disk, 0);
563 while ((part = disk_part_iter_next(&piter)))
564 kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
565 disk_part_iter_exit(&piter);
566}
567
507/** 568/**
508 * add_disk - add partitioning information to kernel list 569 * add_disk - add partitioning information to kernel list
509 * @disk: per-device partitioning information 570 * @disk: per-device partitioning information
@@ -541,28 +602,60 @@ void add_disk(struct gendisk *disk)
541 disk->major = MAJOR(devt); 602 disk->major = MAJOR(devt);
542 disk->first_minor = MINOR(devt); 603 disk->first_minor = MINOR(devt);
543 604
605 /* Register BDI before referencing it from bdev */
606 bdi = &disk->queue->backing_dev_info;
607 bdi_register_dev(bdi, disk_devt(disk));
608
544 blk_register_region(disk_devt(disk), disk->minors, NULL, 609 blk_register_region(disk_devt(disk), disk->minors, NULL,
545 exact_match, exact_lock, disk); 610 exact_match, exact_lock, disk);
546 register_disk(disk); 611 register_disk(disk);
547 blk_register_queue(disk); 612 blk_register_queue(disk);
548 613
549 bdi = &disk->queue->backing_dev_info;
550 bdi_register_dev(bdi, disk_devt(disk));
551 retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, 614 retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
552 "bdi"); 615 "bdi");
553 WARN_ON(retval); 616 WARN_ON(retval);
554}
555 617
618 disk_add_events(disk);
619}
556EXPORT_SYMBOL(add_disk); 620EXPORT_SYMBOL(add_disk);
557EXPORT_SYMBOL(del_gendisk); /* in partitions/check.c */
558 621
559void unlink_gendisk(struct gendisk *disk) 622void del_gendisk(struct gendisk *disk)
560{ 623{
624 struct disk_part_iter piter;
625 struct hd_struct *part;
626
627 disk_del_events(disk);
628
629 /* invalidate stuff */
630 disk_part_iter_init(&piter, disk,
631 DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
632 while ((part = disk_part_iter_next(&piter))) {
633 invalidate_partition(disk, part->partno);
634 delete_partition(disk, part->partno);
635 }
636 disk_part_iter_exit(&piter);
637
638 invalidate_partition(disk, 0);
639 blk_free_devt(disk_to_dev(disk)->devt);
640 set_capacity(disk, 0);
641 disk->flags &= ~GENHD_FL_UP;
642
561 sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); 643 sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
562 bdi_unregister(&disk->queue->backing_dev_info); 644 bdi_unregister(&disk->queue->backing_dev_info);
563 blk_unregister_queue(disk); 645 blk_unregister_queue(disk);
564 blk_unregister_region(disk_devt(disk), disk->minors); 646 blk_unregister_region(disk_devt(disk), disk->minors);
647
648 part_stat_set_all(&disk->part0, 0);
649 disk->part0.stamp = 0;
650
651 kobject_put(disk->part0.holder_dir);
652 kobject_put(disk->slave_dir);
653 disk->driverfs_dev = NULL;
654 if (!sysfs_deprecated)
655 sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
656 device_del(disk_to_dev(disk));
565} 657}
658EXPORT_SYMBOL(del_gendisk);
566 659
567/** 660/**
568 * get_gendisk - get partitioning information for a given device 661 * get_gendisk - get partitioning information for a given device
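del_gendisk() now lives here and tears down events, partitions, stats and the sysfs links itself, so a driver only pairs it with add_disk() and put_disk(). A hedged registration/teardown sketch; MY_MAJOR, my_lock, my_fops, my_request_fn and the 1 MiB capacity are placeholders, not part of this diff:

#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/genhd.h>

/* Hypothetical placeholders -- a real driver defines these properly. */
#define MY_MAJOR	240
static DEFINE_SPINLOCK(my_lock);
static void my_request_fn(struct request_queue *q) { /* drain q here */ }
static const struct block_device_operations my_fops = { .owner = THIS_MODULE };

static struct gendisk *my_disk;
static struct request_queue *my_queue;

static int my_probe(void)
{
	my_queue = blk_init_queue(my_request_fn, &my_lock);
	if (!my_queue)
		return -ENOMEM;

	my_disk = alloc_disk(16);			/* room for partitions */
	if (!my_disk) {
		blk_cleanup_queue(my_queue);
		return -ENOMEM;
	}

	my_disk->major = MY_MAJOR;
	my_disk->first_minor = 0;
	my_disk->fops = &my_fops;
	my_disk->queue = my_queue;
	sprintf(my_disk->disk_name, "myblk0");
	set_capacity(my_disk, 2048);			/* 2048 * 512B = 1 MiB */

	add_disk(my_disk);	/* registers BDI, region, sysfs and events */
	return 0;
}

static void my_remove(void)
{
	del_gendisk(my_disk);	/* invalidates partitions, drops sysfs links */
	put_disk(my_disk);
	blk_cleanup_queue(my_queue);
}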
@@ -642,10 +735,11 @@ void __init printk_all_partitions(void)
642 struct hd_struct *part; 735 struct hd_struct *part;
643 char name_buf[BDEVNAME_SIZE]; 736 char name_buf[BDEVNAME_SIZE];
644 char devt_buf[BDEVT_SIZE]; 737 char devt_buf[BDEVT_SIZE];
738 u8 uuid[PARTITION_META_INFO_UUIDLTH * 2 + 1];
645 739
646 /* 740 /*
647 * Don't show empty devices or things that have been 741 * Don't show empty devices or things that have been
648 * surpressed 742 * suppressed
649 */ 743 */
650 if (get_capacity(disk) == 0 || 744 if (get_capacity(disk) == 0 ||
651 (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)) 745 (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO))
@@ -660,10 +754,14 @@ void __init printk_all_partitions(void)
660 while ((part = disk_part_iter_next(&piter))) { 754 while ((part = disk_part_iter_next(&piter))) {
661 bool is_part0 = part == &disk->part0; 755 bool is_part0 = part == &disk->part0;
662 756
663 printk("%s%s %10llu %s", is_part0 ? "" : " ", 757 uuid[0] = 0;
758 if (part->info)
759 part_unpack_uuid(part->info->uuid, uuid);
760
761 printk("%s%s %10llu %s %s", is_part0 ? "" : " ",
664 bdevt_str(part_devt(part), devt_buf), 762 bdevt_str(part_devt(part), devt_buf),
665 (unsigned long long)part->nr_sects >> 1, 763 (unsigned long long)part->nr_sects >> 1,
666 disk_name(disk, part->partno, name_buf)); 764 disk_name(disk, part->partno, name_buf), uuid);
667 if (is_part0) { 765 if (is_part0) {
668 if (disk->driverfs_dev != NULL && 766 if (disk->driverfs_dev != NULL &&
669 disk->driverfs_dev->driver != NULL) 767 disk->driverfs_dev->driver != NULL)
@@ -730,7 +828,7 @@ static void *show_partition_start(struct seq_file *seqf, loff_t *pos)
730 static void *p; 828 static void *p;
731 829
732 p = disk_seqf_start(seqf, pos); 830 p = disk_seqf_start(seqf, pos);
733 if (!IS_ERR(p) && p && !*pos) 831 if (!IS_ERR_OR_NULL(p) && !*pos)
734 seq_puts(seqf, "major minor #blocks name\n\n"); 832 seq_puts(seqf, "major minor #blocks name\n\n");
735 return p; 833 return p;
736} 834}
@@ -803,10 +901,9 @@ static int __init genhd_device_init(void)
803 901
804 register_blkdev(BLOCK_EXT_MAJOR, "blkext"); 902 register_blkdev(BLOCK_EXT_MAJOR, "blkext");
805 903
806#ifndef CONFIG_SYSFS_DEPRECATED
807 /* create top-level block dir */ 904 /* create top-level block dir */
808 block_depr = kobject_create_and_add("block", NULL); 905 if (!sysfs_deprecated)
809#endif 906 block_depr = kobject_create_and_add("block", NULL);
810 return 0; 907 return 0;
811} 908}
812 909
@@ -1001,9 +1098,11 @@ static void disk_release(struct device *dev)
1001{ 1098{
1002 struct gendisk *disk = dev_to_disk(dev); 1099 struct gendisk *disk = dev_to_disk(dev);
1003 1100
1101 disk_release_events(disk);
1004 kfree(disk->random); 1102 kfree(disk->random);
1005 disk_replace_part_tbl(disk, NULL); 1103 disk_replace_part_tbl(disk, NULL);
1006 free_part_stats(&disk->part0); 1104 free_part_stats(&disk->part0);
1105 free_part_info(&disk->part0);
1007 kfree(disk); 1106 kfree(disk);
1008} 1107}
1009struct class block_class = { 1108struct class block_class = {
@@ -1059,14 +1158,14 @@ static int diskstats_show(struct seq_file *seqf, void *v)
1059 "%u %lu %lu %llu %u %u %u %u\n", 1158 "%u %lu %lu %llu %u %u %u %u\n",
1060 MAJOR(part_devt(hd)), MINOR(part_devt(hd)), 1159 MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
1061 disk_name(gp, hd->partno, buf), 1160 disk_name(gp, hd->partno, buf),
1062 part_stat_read(hd, ios[0]), 1161 part_stat_read(hd, ios[READ]),
1063 part_stat_read(hd, merges[0]), 1162 part_stat_read(hd, merges[READ]),
1064 (unsigned long long)part_stat_read(hd, sectors[0]), 1163 (unsigned long long)part_stat_read(hd, sectors[READ]),
1065 jiffies_to_msecs(part_stat_read(hd, ticks[0])), 1164 jiffies_to_msecs(part_stat_read(hd, ticks[READ])),
1066 part_stat_read(hd, ios[1]), 1165 part_stat_read(hd, ios[WRITE]),
1067 part_stat_read(hd, merges[1]), 1166 part_stat_read(hd, merges[WRITE]),
1068 (unsigned long long)part_stat_read(hd, sectors[1]), 1167 (unsigned long long)part_stat_read(hd, sectors[WRITE]),
1069 jiffies_to_msecs(part_stat_read(hd, ticks[1])), 1168 jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])),
1070 part_in_flight(hd), 1169 part_in_flight(hd),
1071 jiffies_to_msecs(part_stat_read(hd, io_ticks)), 1170 jiffies_to_msecs(part_stat_read(hd, io_ticks)),
1072 jiffies_to_msecs(part_stat_read(hd, time_in_queue)) 1171 jiffies_to_msecs(part_stat_read(hd, time_in_queue))
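Switching from raw 0/1 indices to the READ/WRITE macros leaves the /proc/diskstats layout unchanged: major, minor, name, four read counters, four write counters, then in-flight, io_ticks and time_in_queue. A hedged userspace sketch that parses one line per device; the local field names are descriptive assumptions only:

#include <stdio.h>

/* Field order matches the seq_printf() above: reads, reads merged,
 * sectors read, ms reading, the same four for writes, then in-flight,
 * io ms and weighted io ms. */
int main(void)
{
	unsigned int major, minor, in_flight, io_ms, weighted_ms;
	unsigned long r_ios, r_merges, r_sec, r_ms;
	unsigned long w_ios, w_merges, w_sec, w_ms;
	char name[32], line[256];
	FILE *f = fopen("/proc/diskstats", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, "%u %u %31s %lu %lu %lu %lu %lu %lu %lu %lu %u %u %u",
			   &major, &minor, name,
			   &r_ios, &r_merges, &r_sec, &r_ms,
			   &w_ios, &w_merges, &w_sec, &w_ms,
			   &in_flight, &io_ms, &weighted_ms) == 14)
			printf("%s: %lu reads, %lu writes\n", name, r_ios, w_ios);
	}
	fclose(f);
	return 0;
}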
@@ -1105,29 +1204,6 @@ static int __init proc_genhd_init(void)
1105module_init(proc_genhd_init); 1204module_init(proc_genhd_init);
1106#endif /* CONFIG_PROC_FS */ 1205#endif /* CONFIG_PROC_FS */
1107 1206
1108static void media_change_notify_thread(struct work_struct *work)
1109{
1110 struct gendisk *gd = container_of(work, struct gendisk, async_notify);
1111 char event[] = "MEDIA_CHANGE=1";
1112 char *envp[] = { event, NULL };
1113
1114 /*
1115 * set enviroment vars to indicate which event this is for
1116 * so that user space will know to go check the media status.
1117 */
1118 kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp);
1119 put_device(gd->driverfs_dev);
1120}
1121
1122#if 0
1123void genhd_media_change_notify(struct gendisk *disk)
1124{
1125 get_device(disk->driverfs_dev);
1126 schedule_work(&disk->async_notify);
1127}
1128EXPORT_SYMBOL_GPL(genhd_media_change_notify);
1129#endif /* 0 */
1130
1131dev_t blk_lookup_devt(const char *name, int partno) 1207dev_t blk_lookup_devt(const char *name, int partno)
1132{ 1208{
1133 dev_t devt = MKDEV(0, 0); 1209 dev_t devt = MKDEV(0, 0);
@@ -1188,13 +1264,13 @@ struct gendisk *alloc_disk_node(int minors, int node_id)
1188 } 1264 }
1189 disk->part_tbl->part[0] = &disk->part0; 1265 disk->part_tbl->part[0] = &disk->part0;
1190 1266
1267 hd_ref_init(&disk->part0);
1268
1191 disk->minors = minors; 1269 disk->minors = minors;
1192 rand_initialize_disk(disk); 1270 rand_initialize_disk(disk);
1193 disk_to_dev(disk)->class = &block_class; 1271 disk_to_dev(disk)->class = &block_class;
1194 disk_to_dev(disk)->type = &disk_type; 1272 disk_to_dev(disk)->type = &disk_type;
1195 device_initialize(disk_to_dev(disk)); 1273 device_initialize(disk_to_dev(disk));
1196 INIT_WORK(&disk->async_notify,
1197 media_change_notify_thread);
1198 } 1274 }
1199 return disk; 1275 return disk;
1200} 1276}
@@ -1279,10 +1355,444 @@ int invalidate_partition(struct gendisk *disk, int partno)
1279 struct block_device *bdev = bdget_disk(disk, partno); 1355 struct block_device *bdev = bdget_disk(disk, partno);
1280 if (bdev) { 1356 if (bdev) {
1281 fsync_bdev(bdev); 1357 fsync_bdev(bdev);
1282 res = __invalidate_device(bdev); 1358 res = __invalidate_device(bdev, true);
1283 bdput(bdev); 1359 bdput(bdev);
1284 } 1360 }
1285 return res; 1361 return res;
1286} 1362}
1287 1363
1288EXPORT_SYMBOL(invalidate_partition); 1364EXPORT_SYMBOL(invalidate_partition);
1365
1366/*
1367 * Disk events - monitor disk events like media change and eject request.
1368 */
1369struct disk_events {
1370 struct list_head node; /* all disk_event's */
1371 struct gendisk *disk; /* the associated disk */
1372 spinlock_t lock;
1373
1374 struct mutex block_mutex; /* protects blocking */
1375 int block; /* event blocking depth */
1376 unsigned int pending; /* events already sent out */
1377 unsigned int clearing; /* events being cleared */
1378
1379 long poll_msecs; /* interval, -1 for default */
1380 struct delayed_work dwork;
1381};
1382
1383static const char *disk_events_strs[] = {
1384 [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "media_change",
1385 [ilog2(DISK_EVENT_EJECT_REQUEST)] = "eject_request",
1386};
1387
1388static char *disk_uevents[] = {
1389 [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "DISK_MEDIA_CHANGE=1",
1390 [ilog2(DISK_EVENT_EJECT_REQUEST)] = "DISK_EJECT_REQUEST=1",
1391};
1392
1393/* list of all disk_events */
1394static DEFINE_MUTEX(disk_events_mutex);
1395static LIST_HEAD(disk_events);
1396
1397/* disable in-kernel polling by default */
1398static unsigned long disk_events_dfl_poll_msecs = 0;
1399
1400static unsigned long disk_events_poll_jiffies(struct gendisk *disk)
1401{
1402 struct disk_events *ev = disk->ev;
1403 long intv_msecs = 0;
1404
1405 /*
1406 * If device-specific poll interval is set, always use it. If
1407 * the default is being used, poll iff there are events which
1408 * can't be monitored asynchronously.
1409 */
1410 if (ev->poll_msecs >= 0)
1411 intv_msecs = ev->poll_msecs;
1412 else if (disk->events & ~disk->async_events)
1413 intv_msecs = disk_events_dfl_poll_msecs;
1414
1415 return msecs_to_jiffies(intv_msecs);
1416}
1417
1418/**
1419 * disk_block_events - block and flush disk event checking
1420 * @disk: disk to block events for
1421 *
1422 * On return from this function, it is guaranteed that event checking
1423 * isn't in progress and won't happen until unblocked by
1424 * disk_unblock_events(). Events blocking is counted and the actual
1425 * unblocking happens after the matching number of unblocks are done.
1426 *
1427 * Note that this intentionally does not block event checking from
1428 * disk_clear_events().
1429 *
1430 * CONTEXT:
1431 * Might sleep.
1432 */
1433void disk_block_events(struct gendisk *disk)
1434{
1435 struct disk_events *ev = disk->ev;
1436 unsigned long flags;
1437 bool cancel;
1438
1439 if (!ev)
1440 return;
1441
1442 /*
1443 * Outer mutex ensures that the first blocker completes canceling
1444 * the event work before further blockers are allowed to finish.
1445 */
1446 mutex_lock(&ev->block_mutex);
1447
1448 spin_lock_irqsave(&ev->lock, flags);
1449 cancel = !ev->block++;
1450 spin_unlock_irqrestore(&ev->lock, flags);
1451
1452 if (cancel)
1453 cancel_delayed_work_sync(&disk->ev->dwork);
1454
1455 mutex_unlock(&ev->block_mutex);
1456}
1457
1458static void __disk_unblock_events(struct gendisk *disk, bool check_now)
1459{
1460 struct disk_events *ev = disk->ev;
1461 unsigned long intv;
1462 unsigned long flags;
1463
1464 spin_lock_irqsave(&ev->lock, flags);
1465
1466 if (WARN_ON_ONCE(ev->block <= 0))
1467 goto out_unlock;
1468
1469 if (--ev->block)
1470 goto out_unlock;
1471
1472 /*
1473 * Not exactly a latency critical operation, set poll timer
1474 * slack to 25% and kick event check.
1475 */
1476 intv = disk_events_poll_jiffies(disk);
1477 set_timer_slack(&ev->dwork.timer, intv / 4);
1478 if (check_now)
1479 queue_delayed_work(system_nrt_wq, &ev->dwork, 0);
1480 else if (intv)
1481 queue_delayed_work(system_nrt_wq, &ev->dwork, intv);
1482out_unlock:
1483 spin_unlock_irqrestore(&ev->lock, flags);
1484}
1485
1486/**
1487 * disk_unblock_events - unblock disk event checking
1488 * @disk: disk to unblock events for
1489 *
1490 * Undo disk_block_events(). When the block count reaches zero, it
1491 * starts events polling if configured.
1492 *
1493 * CONTEXT:
1494 * Don't care. Safe to call from irq context.
1495 */
1496void disk_unblock_events(struct gendisk *disk)
1497{
1498 if (disk->ev)
1499 __disk_unblock_events(disk, false);
1500}
1501
1502/**
1503 * disk_check_events - schedule immediate event checking
1504 * @disk: disk to check events for
1505 *
1506 * Schedule immediate event checking on @disk if not blocked.
1507 *
1508 * CONTEXT:
1509 * Don't care. Safe to call from irq context.
1510 */
1511void disk_check_events(struct gendisk *disk)
1512{
1513 struct disk_events *ev = disk->ev;
1514 unsigned long flags;
1515
1516 if (!ev)
1517 return;
1518
1519 spin_lock_irqsave(&ev->lock, flags);
1520 if (!ev->block) {
1521 cancel_delayed_work(&ev->dwork);
1522 queue_delayed_work(system_nrt_wq, &ev->dwork, 0);
1523 }
1524 spin_unlock_irqrestore(&ev->lock, flags);
1525}
1526EXPORT_SYMBOL_GPL(disk_check_events);
1527
1528/**
1529 * disk_clear_events - synchronously check, clear and return pending events
1530 * @disk: disk to fetch and clear events from
 1531 * @mask: mask of events to be fetched and cleared
1532 *
1533 * Disk events are synchronously checked and pending events in @mask
1534 * are cleared and returned. This ignores the block count.
1535 *
1536 * CONTEXT:
1537 * Might sleep.
1538 */
1539unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
1540{
1541 const struct block_device_operations *bdops = disk->fops;
1542 struct disk_events *ev = disk->ev;
1543 unsigned int pending;
1544
1545 if (!ev) {
1546 /* for drivers still using the old ->media_changed method */
1547 if ((mask & DISK_EVENT_MEDIA_CHANGE) &&
1548 bdops->media_changed && bdops->media_changed(disk))
1549 return DISK_EVENT_MEDIA_CHANGE;
1550 return 0;
1551 }
1552
1553 /* tell the workfn about the events being cleared */
1554 spin_lock_irq(&ev->lock);
1555 ev->clearing |= mask;
1556 spin_unlock_irq(&ev->lock);
1557
 1558	/* unconditionally schedule event check and wait for it to finish */
1559 disk_block_events(disk);
1560 queue_delayed_work(system_nrt_wq, &ev->dwork, 0);
1561 flush_delayed_work(&ev->dwork);
1562 __disk_unblock_events(disk, false);
1563
1564 /* then, fetch and clear pending events */
1565 spin_lock_irq(&ev->lock);
1566 WARN_ON_ONCE(ev->clearing & mask); /* cleared by workfn */
1567 pending = ev->pending & mask;
1568 ev->pending &= ~mask;
1569 spin_unlock_irq(&ev->lock);
1570
1571 return pending;
1572}
1573
1574static void disk_events_workfn(struct work_struct *work)
1575{
1576 struct delayed_work *dwork = to_delayed_work(work);
1577 struct disk_events *ev = container_of(dwork, struct disk_events, dwork);
1578 struct gendisk *disk = ev->disk;
1579 char *envp[ARRAY_SIZE(disk_uevents) + 1] = { };
1580 unsigned int clearing = ev->clearing;
1581 unsigned int events;
1582 unsigned long intv;
1583 int nr_events = 0, i;
1584
1585 /* check events */
1586 events = disk->fops->check_events(disk, clearing);
1587
1588 /* accumulate pending events and schedule next poll if necessary */
1589 spin_lock_irq(&ev->lock);
1590
1591 events &= ~ev->pending;
1592 ev->pending |= events;
1593 ev->clearing &= ~clearing;
1594
1595 intv = disk_events_poll_jiffies(disk);
1596 if (!ev->block && intv)
1597 queue_delayed_work(system_nrt_wq, &ev->dwork, intv);
1598
1599 spin_unlock_irq(&ev->lock);
1600
1601 /*
1602 * Tell userland about new events. Only the events listed in
1603 * @disk->events are reported. Unlisted events are processed the
1604 * same internally but never get reported to userland.
1605 */
1606 for (i = 0; i < ARRAY_SIZE(disk_uevents); i++)
1607 if (events & disk->events & (1 << i))
1608 envp[nr_events++] = disk_uevents[i];
1609
1610 if (nr_events)
1611 kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
1612}
1613
1614/*
1615 * A disk events enabled device has the following sysfs nodes under
1616 * its /sys/block/X/ directory.
1617 *
1618 * events : list of all supported events
1619 * events_async : list of events which can be detected w/o polling
1620 * events_poll_msecs : polling interval, 0: disable, -1: system default
1621 */
1622static ssize_t __disk_events_show(unsigned int events, char *buf)
1623{
1624 const char *delim = "";
1625 ssize_t pos = 0;
1626 int i;
1627
1628 for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++)
1629 if (events & (1 << i)) {
1630 pos += sprintf(buf + pos, "%s%s",
1631 delim, disk_events_strs[i]);
1632 delim = " ";
1633 }
1634 if (pos)
1635 pos += sprintf(buf + pos, "\n");
1636 return pos;
1637}
1638
1639static ssize_t disk_events_show(struct device *dev,
1640 struct device_attribute *attr, char *buf)
1641{
1642 struct gendisk *disk = dev_to_disk(dev);
1643
1644 return __disk_events_show(disk->events, buf);
1645}
1646
1647static ssize_t disk_events_async_show(struct device *dev,
1648 struct device_attribute *attr, char *buf)
1649{
1650 struct gendisk *disk = dev_to_disk(dev);
1651
1652 return __disk_events_show(disk->async_events, buf);
1653}
1654
1655static ssize_t disk_events_poll_msecs_show(struct device *dev,
1656 struct device_attribute *attr,
1657 char *buf)
1658{
1659 struct gendisk *disk = dev_to_disk(dev);
1660
1661 return sprintf(buf, "%ld\n", disk->ev->poll_msecs);
1662}
1663
1664static ssize_t disk_events_poll_msecs_store(struct device *dev,
1665 struct device_attribute *attr,
1666 const char *buf, size_t count)
1667{
1668 struct gendisk *disk = dev_to_disk(dev);
1669 long intv;
1670
1671 if (!count || !sscanf(buf, "%ld", &intv))
1672 return -EINVAL;
1673
1674 if (intv < 0 && intv != -1)
1675 return -EINVAL;
1676
1677 disk_block_events(disk);
1678 disk->ev->poll_msecs = intv;
1679 __disk_unblock_events(disk, true);
1680
1681 return count;
1682}
1683
1684static const DEVICE_ATTR(events, S_IRUGO, disk_events_show, NULL);
1685static const DEVICE_ATTR(events_async, S_IRUGO, disk_events_async_show, NULL);
1686static const DEVICE_ATTR(events_poll_msecs, S_IRUGO|S_IWUSR,
1687 disk_events_poll_msecs_show,
1688 disk_events_poll_msecs_store);
1689
1690static const struct attribute *disk_events_attrs[] = {
1691 &dev_attr_events.attr,
1692 &dev_attr_events_async.attr,
1693 &dev_attr_events_poll_msecs.attr,
1694 NULL,
1695};
1696
1697/*
1698 * The default polling interval can be specified by the kernel
1699 * parameter block.events_dfl_poll_msecs which defaults to 0
1700 * (disable). This can also be modified runtime by writing to
1701 * /sys/module/block/events_dfl_poll_msecs.
1702 */
1703static int disk_events_set_dfl_poll_msecs(const char *val,
1704 const struct kernel_param *kp)
1705{
1706 struct disk_events *ev;
1707 int ret;
1708
1709 ret = param_set_ulong(val, kp);
1710 if (ret < 0)
1711 return ret;
1712
1713 mutex_lock(&disk_events_mutex);
1714
1715 list_for_each_entry(ev, &disk_events, node)
1716 disk_check_events(ev->disk);
1717
1718 mutex_unlock(&disk_events_mutex);
1719
1720 return 0;
1721}
1722
1723static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = {
1724 .set = disk_events_set_dfl_poll_msecs,
1725 .get = param_get_ulong,
1726};
1727
1728#undef MODULE_PARAM_PREFIX
1729#define MODULE_PARAM_PREFIX "block."
1730
1731module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops,
1732 &disk_events_dfl_poll_msecs, 0644);
1733
1734/*
1735 * disk_{add|del|release}_events - initialize and destroy disk_events.
1736 */
1737static void disk_add_events(struct gendisk *disk)
1738{
1739 struct disk_events *ev;
1740
1741 if (!disk->fops->check_events)
1742 return;
1743
1744 ev = kzalloc(sizeof(*ev), GFP_KERNEL);
1745 if (!ev) {
1746 pr_warn("%s: failed to initialize events\n", disk->disk_name);
1747 return;
1748 }
1749
1750 if (sysfs_create_files(&disk_to_dev(disk)->kobj,
1751 disk_events_attrs) < 0) {
1752 pr_warn("%s: failed to create sysfs files for events\n",
1753 disk->disk_name);
1754 kfree(ev);
1755 return;
1756 }
1757
1758 disk->ev = ev;
1759
1760 INIT_LIST_HEAD(&ev->node);
1761 ev->disk = disk;
1762 spin_lock_init(&ev->lock);
1763 mutex_init(&ev->block_mutex);
1764 ev->block = 1;
1765 ev->poll_msecs = -1;
1766 INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn);
1767
1768 mutex_lock(&disk_events_mutex);
1769 list_add_tail(&ev->node, &disk_events);
1770 mutex_unlock(&disk_events_mutex);
1771
1772 /*
1773 * Block count is initialized to 1 and the following initial
1774 * unblock kicks it into action.
1775 */
1776 __disk_unblock_events(disk, true);
1777}
1778
1779static void disk_del_events(struct gendisk *disk)
1780{
1781 if (!disk->ev)
1782 return;
1783
1784 disk_block_events(disk);
1785
1786 mutex_lock(&disk_events_mutex);
1787 list_del_init(&disk->ev->node);
1788 mutex_unlock(&disk_events_mutex);
1789
1790 sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs);
1791}
1792
1793static void disk_release_events(struct gendisk *disk)
1794{
1795 /* the block count should be 1 from disk_del_events() */
1796 WARN_ON_ONCE(disk->ev && disk->ev->block != 1);
1797 kfree(disk->ev);
1798}
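The event machinery above only runs for disks whose block_device_operations provide a check_events() hook and whose gendisk advertises which events it can report. A hedged sketch of how a removable-media driver might wire that up; my_cd_media_changed() and the other my_cd_* names are assumptions, not part of this diff:

#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/genhd.h>

/* Assumed hardware helper, not a real API. */
static bool my_cd_media_changed(struct gendisk *disk);

/* Hook consumed by disk_events_workfn()/disk_clear_events() above. */
static unsigned int my_cd_check_events(struct gendisk *disk,
				       unsigned int clearing)
{
	return my_cd_media_changed(disk) ? DISK_EVENT_MEDIA_CHANGE : 0;
}

static const struct block_device_operations my_cd_fops = {
	.owner		= THIS_MODULE,
	.check_events	= my_cd_check_events,
	/* .open, .release, .ioctl, ... as before */
};

static void my_cd_setup_events(struct gendisk *disk)
{
	disk->fops = &my_cd_fops;
	/* events we can report; none are async, so the poller is used */
	disk->events = DISK_EVENT_MEDIA_CHANGE;
	disk->async_events = 0;
}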
diff --git a/block/ioctl.c b/block/ioctl.c
index d8052f0dabd3..1124cd297263 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -5,7 +5,6 @@
5#include <linux/hdreg.h> 5#include <linux/hdreg.h>
6#include <linux/backing-dev.h> 6#include <linux/backing-dev.h>
7#include <linux/buffer_head.h> 7#include <linux/buffer_head.h>
8#include <linux/smp_lock.h>
9#include <linux/blktrace_api.h> 8#include <linux/blktrace_api.h>
10#include <asm/uaccess.h> 9#include <asm/uaccess.h>
11 10
@@ -62,7 +61,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
62 61
63 /* all seems OK */ 62 /* all seems OK */
64 part = add_partition(disk, partno, start, length, 63 part = add_partition(disk, partno, start, length,
65 ADDPART_FLAG_NONE); 64 ADDPART_FLAG_NONE, NULL);
66 mutex_unlock(&bdev->bd_mutex); 65 mutex_unlock(&bdev->bd_mutex);
67 return IS_ERR(part) ? PTR_ERR(part) : 0; 66 return IS_ERR(part) ? PTR_ERR(part) : 0;
68 case BLKPG_DEL_PARTITION: 67 case BLKPG_DEL_PARTITION:
@@ -116,7 +115,7 @@ static int blkdev_reread_part(struct block_device *bdev)
116static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, 115static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
117 uint64_t len, int secure) 116 uint64_t len, int secure)
118{ 117{
119 unsigned long flags = BLKDEV_IFL_WAIT; 118 unsigned long flags = 0;
120 119
121 if (start & 511) 120 if (start & 511)
122 return -EINVAL; 121 return -EINVAL;
@@ -125,10 +124,10 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
125 start >>= 9; 124 start >>= 9;
126 len >>= 9; 125 len >>= 9;
127 126
128 if (start + len > (bdev->bd_inode->i_size >> 9)) 127 if (start + len > (i_size_read(bdev->bd_inode) >> 9))
129 return -EINVAL; 128 return -EINVAL;
130 if (secure) 129 if (secure)
131 flags |= BLKDEV_IFL_SECURE; 130 flags |= BLKDEV_DISCARD_SECURE;
132 return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags); 131 return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags);
133} 132}
134 133
@@ -242,6 +241,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
242 * We need to set the startsect first, the driver may 241 * We need to set the startsect first, the driver may
243 * want to override it. 242 * want to override it.
244 */ 243 */
244 memset(&geo, 0, sizeof(geo));
245 geo.start = get_start_sect(bdev); 245 geo.start = get_start_sect(bdev);
246 ret = disk->fops->getgeo(bdev, &geo); 246 ret = disk->fops->getgeo(bdev, &geo);
247 if (ret) 247 if (ret)
@@ -294,11 +294,14 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
294 return -EINVAL; 294 return -EINVAL;
295 if (get_user(n, (int __user *) arg)) 295 if (get_user(n, (int __user *) arg))
296 return -EFAULT; 296 return -EFAULT;
297 if (!(mode & FMODE_EXCL) && bd_claim(bdev, &bdev) < 0) 297 if (!(mode & FMODE_EXCL)) {
298 return -EBUSY; 298 bdgrab(bdev);
299 if (blkdev_get(bdev, mode | FMODE_EXCL, &bdev) < 0)
300 return -EBUSY;
301 }
299 ret = set_blocksize(bdev, n); 302 ret = set_blocksize(bdev, n);
300 if (!(mode & FMODE_EXCL)) 303 if (!(mode & FMODE_EXCL))
301 bd_release(bdev); 304 blkdev_put(bdev, mode | FMODE_EXCL);
302 return ret; 305 return ret;
303 case BLKPG: 306 case BLKPG:
304 ret = blkpg_ioctl(bdev, (struct blkpg_ioctl_arg __user *) arg); 307 ret = blkpg_ioctl(bdev, (struct blkpg_ioctl_arg __user *) arg);
@@ -307,12 +310,12 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
307 ret = blkdev_reread_part(bdev); 310 ret = blkdev_reread_part(bdev);
308 break; 311 break;
309 case BLKGETSIZE: 312 case BLKGETSIZE:
310 size = bdev->bd_inode->i_size; 313 size = i_size_read(bdev->bd_inode);
311 if ((size >> 9) > ~0UL) 314 if ((size >> 9) > ~0UL)
312 return -EFBIG; 315 return -EFBIG;
313 return put_ulong(arg, size >> 9); 316 return put_ulong(arg, size >> 9);
314 case BLKGETSIZE64: 317 case BLKGETSIZE64:
315 return put_u64(arg, bdev->bd_inode->i_size); 318 return put_u64(arg, i_size_read(bdev->bd_inode));
316 case BLKTRACESTART: 319 case BLKTRACESTART:
317 case BLKTRACESTOP: 320 case BLKTRACESTOP:
318 case BLKTRACESETUP: 321 case BLKTRACESETUP:
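The BLKGETSIZE64 branch above now reads the size through i_size_read(); from userspace it remains the usual way to query a device's byte size. A tiny hedged sketch with an assumed device node:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* BLKGETSIZE64 */

int main(void)
{
	uint64_t bytes;
	int fd = open("/dev/sdX", O_RDONLY);	/* assumed device node */

	if (fd < 0 || ioctl(fd, BLKGETSIZE64, &bytes))
		return 1;
	printf("%llu bytes\n", (unsigned long long)bytes);
	return 0;
}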
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
index 232c4b38cd37..06389e9ef96d 100644
--- a/block/noop-iosched.c
+++ b/block/noop-iosched.c
@@ -39,13 +39,6 @@ static void noop_add_request(struct request_queue *q, struct request *rq)
39 list_add_tail(&rq->queuelist, &nd->queue); 39 list_add_tail(&rq->queuelist, &nd->queue);
40} 40}
41 41
42static int noop_queue_empty(struct request_queue *q)
43{
44 struct noop_data *nd = q->elevator->elevator_data;
45
46 return list_empty(&nd->queue);
47}
48
49static struct request * 42static struct request *
50noop_former_request(struct request_queue *q, struct request *rq) 43noop_former_request(struct request_queue *q, struct request *rq)
51{ 44{
@@ -90,7 +83,6 @@ static struct elevator_type elevator_noop = {
90 .elevator_merge_req_fn = noop_merged_requests, 83 .elevator_merge_req_fn = noop_merged_requests,
91 .elevator_dispatch_fn = noop_dispatch, 84 .elevator_dispatch_fn = noop_dispatch,
92 .elevator_add_req_fn = noop_add_request, 85 .elevator_add_req_fn = noop_add_request,
93 .elevator_queue_empty_fn = noop_queue_empty,
94 .elevator_former_req_fn = noop_former_request, 86 .elevator_former_req_fn = noop_former_request,
95 .elevator_latter_req_fn = noop_latter_request, 87 .elevator_latter_req_fn = noop_latter_request,
96 .elevator_init_fn = noop_init_queue, 88 .elevator_init_fn = noop_init_queue,
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index a8b5a10eb5b0..4f4230b79bb6 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -321,33 +321,47 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
321 if (hdr->iovec_count) { 321 if (hdr->iovec_count) {
322 const int size = sizeof(struct sg_iovec) * hdr->iovec_count; 322 const int size = sizeof(struct sg_iovec) * hdr->iovec_count;
323 size_t iov_data_len; 323 size_t iov_data_len;
324 struct sg_iovec *iov; 324 struct sg_iovec *sg_iov;
325 struct iovec *iov;
326 int i;
325 327
326 iov = kmalloc(size, GFP_KERNEL); 328 sg_iov = kmalloc(size, GFP_KERNEL);
327 if (!iov) { 329 if (!sg_iov) {
328 ret = -ENOMEM; 330 ret = -ENOMEM;
329 goto out; 331 goto out;
330 } 332 }
331 333
332 if (copy_from_user(iov, hdr->dxferp, size)) { 334 if (copy_from_user(sg_iov, hdr->dxferp, size)) {
333 kfree(iov); 335 kfree(sg_iov);
334 ret = -EFAULT; 336 ret = -EFAULT;
335 goto out; 337 goto out;
336 } 338 }
337 339
340 /*
341 * Sum up the vecs, making sure they don't overflow
342 */
343 iov = (struct iovec *) sg_iov;
344 iov_data_len = 0;
345 for (i = 0; i < hdr->iovec_count; i++) {
346 if (iov_data_len + iov[i].iov_len < iov_data_len) {
347 kfree(sg_iov);
348 ret = -EINVAL;
349 goto out;
350 }
351 iov_data_len += iov[i].iov_len;
352 }
353
338 /* SG_IO howto says that the shorter of the two wins */ 354 /* SG_IO howto says that the shorter of the two wins */
339 iov_data_len = iov_length((struct iovec *)iov,
340 hdr->iovec_count);
341 if (hdr->dxfer_len < iov_data_len) { 355 if (hdr->dxfer_len < iov_data_len) {
342 hdr->iovec_count = iov_shorten((struct iovec *)iov, 356 hdr->iovec_count = iov_shorten(iov,
343 hdr->iovec_count, 357 hdr->iovec_count,
344 hdr->dxfer_len); 358 hdr->dxfer_len);
345 iov_data_len = hdr->dxfer_len; 359 iov_data_len = hdr->dxfer_len;
346 } 360 }
347 361
348 ret = blk_rq_map_user_iov(q, rq, NULL, iov, hdr->iovec_count, 362 ret = blk_rq_map_user_iov(q, rq, NULL, sg_iov, hdr->iovec_count,
349 iov_data_len, GFP_KERNEL); 363 iov_data_len, GFP_KERNEL);
350 kfree(iov); 364 kfree(sg_iov);
351 } else if (hdr->dxfer_len) 365 } else if (hdr->dxfer_len)
352 ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len, 366 ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len,
353 GFP_KERNEL); 367 GFP_KERNEL);
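The loop added above guards the SG_IO iovec path against unsigned wraparound while summing the per-segment lengths. The same pattern in isolation, as a hedged userspace-style helper; sum_iov_len() is an illustration, not a kernel function:

#include <stddef.h>
#include <sys/uio.h>

/* Return the total length of an iovec array, or (size_t)-1 if the sum
 * would wrap around -- the same check sg_io() now performs above. */
size_t sum_iov_len(const struct iovec *iov, int count)
{
	size_t total = 0;
	int i;

	for (i = 0; i < count; i++) {
		if (total + iov[i].iov_len < total)	/* overflow */
			return (size_t)-1;
		total += iov[i].iov_len;
	}
	return total;
}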