author	Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
committer	Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
commit	c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree	ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /block
parent	ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent	6a00f206debf8a5c8899055726ad127dbeeed098 (diff)
Merge branch 'mpi-master' into wip-k-fmlp
Conflicts:
litmus/sched_cedf.c
Diffstat (limited to 'block')
-rw-r--r--	block/Kconfig	14
-rw-r--r--	block/Makefile	3
-rw-r--r--	block/blk-barrier.c	350
-rw-r--r--	block/blk-cgroup.c	997
-rw-r--r--	block/blk-cgroup.h	138
-rw-r--r--	block/blk-core.c	829
-rw-r--r--	block/blk-exec.c	15
-rw-r--r--	block/blk-flush.c	443
-rw-r--r--	block/blk-integrity.c	106
-rw-r--r--	block/blk-ioc.c	26
-rw-r--r--	block/blk-lib.c	134
-rw-r--r--	block/blk-map.c	8
-rw-r--r--	block/blk-merge.c	38
-rw-r--r--	block/blk-settings.c	107
-rw-r--r--	block/blk-sysfs.c	27
-rw-r--r--	block/blk-throttle.c	1312
-rw-r--r--	block/blk.h	45
-rw-r--r--	block/bsg.c	12
-rw-r--r--	block/cfq-iosched.c	624
-rw-r--r--	block/cfq.h	8
-rw-r--r--	block/compat_ioctl.c	5
-rw-r--r--	block/deadline-iosched.c	9
-rw-r--r--	block/elevator.c	202
-rw-r--r--	block/genhd.c	608
-rw-r--r--	block/ioctl.c	23
-rw-r--r--	block/noop-iosched.c	8
-rw-r--r--	block/scsi_ioctl.c	34
27 files changed, 4553 insertions, 1572 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 9be0b56eaee1..60be1e0455da 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -2,7 +2,7 @@ | |||
2 | # Block layer core configuration | 2 | # Block layer core configuration |
3 | # | 3 | # |
4 | menuconfig BLOCK | 4 | menuconfig BLOCK |
5 | bool "Enable the block layer" if EMBEDDED | 5 | bool "Enable the block layer" if EXPERT |
6 | default y | 6 | default y |
7 | help | 7 | help |
8 | Provide block layer support for the kernel. | 8 | Provide block layer support for the kernel. |
@@ -77,6 +77,18 @@ config BLK_DEV_INTEGRITY | |||
77 | T10/SCSI Data Integrity Field or the T13/ATA External Path | 77 | T10/SCSI Data Integrity Field or the T13/ATA External Path |
78 | Protection. If in doubt, say N. | 78 | Protection. If in doubt, say N. |
79 | 79 | ||
80 | config BLK_DEV_THROTTLING | ||
81 | bool "Block layer bio throttling support" | ||
82 | depends on BLK_CGROUP=y && EXPERIMENTAL | ||
83 | default n | ||
84 | ---help--- | ||
85 | Block layer bio throttling support. It can be used to limit | ||
86 | the IO rate to a device. IO rate policies are per cgroup and | ||
87 | one needs to mount and use blkio cgroup controller for creating | ||
88 | cgroups and specifying per device IO rate policies. | ||
89 | |||
90 | See Documentation/cgroups/blkio-controller.txt for more information. | ||
91 | |||
80 | endif # BLOCK | 92 | endif # BLOCK |
81 | 93 | ||
82 | config BLOCK_COMPAT | 94 | config BLOCK_COMPAT |
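(Illustration, not part of the commit.) The BLK_DEV_THROTTLING help text above says IO limits are configured per cgroup through the blkio controller. A minimal userspace sketch of that workflow follows: it writes a 1 MB/s read limit for a hypothetical 8:16 device in the "<major>:<minor> <value>" format that blkio_policy_parse_and_set() in blk-cgroup.c (further down) parses. The /sys/fs/cgroup/blkio mount point and the device numbers are assumptions.

/* Illustrative only -- not from this commit. Assumes the blkio
 * controller is mounted at /sys/fs/cgroup/blkio and that 8:16 is the
 * block device to throttle. */
#include <stdio.h>

int main(void)
{
	const char *path =
		"/sys/fs/cgroup/blkio/blkio.throttle.read_bps_device";
	FILE *f = fopen(path, "w");

	if (!f)
		return 1;
	/* "<major>:<minor> <bytes per second>": cap reads at 1 MB/s */
	fprintf(f, "8:16 1048576\n");
	return fclose(f) ? 1 : 0;
}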
diff --git a/block/Makefile b/block/Makefile
index 0bb499a739cd..0fec4b3fab51 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -3,12 +3,13 @@ | |||
3 | # | 3 | # |
4 | 4 | ||
5 | obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ | 5 | obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ |
6 | blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \ | 6 | blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ |
7 | blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ | 7 | blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ |
8 | blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o | 8 | blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o |
9 | 9 | ||
10 | obj-$(CONFIG_BLK_DEV_BSG) += bsg.o | 10 | obj-$(CONFIG_BLK_DEV_BSG) += bsg.o |
11 | obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o | 11 | obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o |
12 | obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o | ||
12 | obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o | 13 | obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o |
13 | obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o | 14 | obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o |
14 | obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o | 15 | obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o |
diff --git a/block/blk-barrier.c b/block/blk-barrier.c
deleted file mode 100644
index f0faefca032f..000000000000
--- a/block/blk-barrier.c
+++ /dev/null
@@ -1,350 +0,0 @@ | |||
1 | /* | ||
2 | * Functions related to barrier IO handling | ||
3 | */ | ||
4 | #include <linux/kernel.h> | ||
5 | #include <linux/module.h> | ||
6 | #include <linux/bio.h> | ||
7 | #include <linux/blkdev.h> | ||
8 | #include <linux/gfp.h> | ||
9 | |||
10 | #include "blk.h" | ||
11 | |||
12 | /** | ||
13 | * blk_queue_ordered - does this queue support ordered writes | ||
14 | * @q: the request queue | ||
15 | * @ordered: one of QUEUE_ORDERED_* | ||
16 | * | ||
17 | * Description: | ||
18 | * For journalled file systems, doing ordered writes on a commit | ||
19 | * block instead of explicitly doing wait_on_buffer (which is bad | ||
20 | * for performance) can be a big win. Block drivers supporting this | ||
21 | * feature should call this function and indicate so. | ||
22 | * | ||
23 | **/ | ||
24 | int blk_queue_ordered(struct request_queue *q, unsigned ordered) | ||
25 | { | ||
26 | if (ordered != QUEUE_ORDERED_NONE && | ||
27 | ordered != QUEUE_ORDERED_DRAIN && | ||
28 | ordered != QUEUE_ORDERED_DRAIN_FLUSH && | ||
29 | ordered != QUEUE_ORDERED_DRAIN_FUA && | ||
30 | ordered != QUEUE_ORDERED_TAG && | ||
31 | ordered != QUEUE_ORDERED_TAG_FLUSH && | ||
32 | ordered != QUEUE_ORDERED_TAG_FUA) { | ||
33 | printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered); | ||
34 | return -EINVAL; | ||
35 | } | ||
36 | |||
37 | q->ordered = ordered; | ||
38 | q->next_ordered = ordered; | ||
39 | |||
40 | return 0; | ||
41 | } | ||
42 | EXPORT_SYMBOL(blk_queue_ordered); | ||
43 | |||
44 | /* | ||
45 | * Cache flushing for ordered writes handling | ||
46 | */ | ||
47 | unsigned blk_ordered_cur_seq(struct request_queue *q) | ||
48 | { | ||
49 | if (!q->ordseq) | ||
50 | return 0; | ||
51 | return 1 << ffz(q->ordseq); | ||
52 | } | ||
53 | |||
54 | unsigned blk_ordered_req_seq(struct request *rq) | ||
55 | { | ||
56 | struct request_queue *q = rq->q; | ||
57 | |||
58 | BUG_ON(q->ordseq == 0); | ||
59 | |||
60 | if (rq == &q->pre_flush_rq) | ||
61 | return QUEUE_ORDSEQ_PREFLUSH; | ||
62 | if (rq == &q->bar_rq) | ||
63 | return QUEUE_ORDSEQ_BAR; | ||
64 | if (rq == &q->post_flush_rq) | ||
65 | return QUEUE_ORDSEQ_POSTFLUSH; | ||
66 | |||
67 | /* | ||
68 | * !fs requests don't need to follow barrier ordering. Always | ||
69 | * put them at the front. This fixes the following deadlock. | ||
70 | * | ||
71 | * http://thread.gmane.org/gmane.linux.kernel/537473 | ||
72 | */ | ||
73 | if (rq->cmd_type != REQ_TYPE_FS) | ||
74 | return QUEUE_ORDSEQ_DRAIN; | ||
75 | |||
76 | if ((rq->cmd_flags & REQ_ORDERED_COLOR) == | ||
77 | (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR)) | ||
78 | return QUEUE_ORDSEQ_DRAIN; | ||
79 | else | ||
80 | return QUEUE_ORDSEQ_DONE; | ||
81 | } | ||
82 | |||
83 | bool blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error) | ||
84 | { | ||
85 | struct request *rq; | ||
86 | |||
87 | if (error && !q->orderr) | ||
88 | q->orderr = error; | ||
89 | |||
90 | BUG_ON(q->ordseq & seq); | ||
91 | q->ordseq |= seq; | ||
92 | |||
93 | if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) | ||
94 | return false; | ||
95 | |||
96 | /* | ||
97 | * Okay, sequence complete. | ||
98 | */ | ||
99 | q->ordseq = 0; | ||
100 | rq = q->orig_bar_rq; | ||
101 | __blk_end_request_all(rq, q->orderr); | ||
102 | return true; | ||
103 | } | ||
104 | |||
105 | static void pre_flush_end_io(struct request *rq, int error) | ||
106 | { | ||
107 | elv_completed_request(rq->q, rq); | ||
108 | blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error); | ||
109 | } | ||
110 | |||
111 | static void bar_end_io(struct request *rq, int error) | ||
112 | { | ||
113 | elv_completed_request(rq->q, rq); | ||
114 | blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error); | ||
115 | } | ||
116 | |||
117 | static void post_flush_end_io(struct request *rq, int error) | ||
118 | { | ||
119 | elv_completed_request(rq->q, rq); | ||
120 | blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error); | ||
121 | } | ||
122 | |||
123 | static void queue_flush(struct request_queue *q, unsigned which) | ||
124 | { | ||
125 | struct request *rq; | ||
126 | rq_end_io_fn *end_io; | ||
127 | |||
128 | if (which == QUEUE_ORDERED_DO_PREFLUSH) { | ||
129 | rq = &q->pre_flush_rq; | ||
130 | end_io = pre_flush_end_io; | ||
131 | } else { | ||
132 | rq = &q->post_flush_rq; | ||
133 | end_io = post_flush_end_io; | ||
134 | } | ||
135 | |||
136 | blk_rq_init(q, rq); | ||
137 | rq->cmd_type = REQ_TYPE_FS; | ||
138 | rq->cmd_flags = REQ_HARDBARRIER | REQ_FLUSH; | ||
139 | rq->rq_disk = q->orig_bar_rq->rq_disk; | ||
140 | rq->end_io = end_io; | ||
141 | |||
142 | elv_insert(q, rq, ELEVATOR_INSERT_FRONT); | ||
143 | } | ||
144 | |||
145 | static inline bool start_ordered(struct request_queue *q, struct request **rqp) | ||
146 | { | ||
147 | struct request *rq = *rqp; | ||
148 | unsigned skip = 0; | ||
149 | |||
150 | q->orderr = 0; | ||
151 | q->ordered = q->next_ordered; | ||
152 | q->ordseq |= QUEUE_ORDSEQ_STARTED; | ||
153 | |||
154 | /* | ||
155 | * For an empty barrier, there's no actual BAR request, which | ||
156 | * in turn makes POSTFLUSH unnecessary. Mask them off. | ||
157 | */ | ||
158 | if (!blk_rq_sectors(rq)) { | ||
159 | q->ordered &= ~(QUEUE_ORDERED_DO_BAR | | ||
160 | QUEUE_ORDERED_DO_POSTFLUSH); | ||
161 | /* | ||
162 | * Empty barrier on a write-through device w/ ordered | ||
163 | * tag has no command to issue and without any command | ||
164 | * to issue, ordering by tag can't be used. Drain | ||
165 | * instead. | ||
166 | */ | ||
167 | if ((q->ordered & QUEUE_ORDERED_BY_TAG) && | ||
168 | !(q->ordered & QUEUE_ORDERED_DO_PREFLUSH)) { | ||
169 | q->ordered &= ~QUEUE_ORDERED_BY_TAG; | ||
170 | q->ordered |= QUEUE_ORDERED_BY_DRAIN; | ||
171 | } | ||
172 | } | ||
173 | |||
174 | /* stash away the original request */ | ||
175 | blk_dequeue_request(rq); | ||
176 | q->orig_bar_rq = rq; | ||
177 | rq = NULL; | ||
178 | |||
179 | /* | ||
180 | * Queue ordered sequence. As we stack them at the head, we | ||
181 | * need to queue in reverse order. Note that we rely on that | ||
182 | * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs | ||
183 | * request gets inbetween ordered sequence. | ||
184 | */ | ||
185 | if (q->ordered & QUEUE_ORDERED_DO_POSTFLUSH) { | ||
186 | queue_flush(q, QUEUE_ORDERED_DO_POSTFLUSH); | ||
187 | rq = &q->post_flush_rq; | ||
188 | } else | ||
189 | skip |= QUEUE_ORDSEQ_POSTFLUSH; | ||
190 | |||
191 | if (q->ordered & QUEUE_ORDERED_DO_BAR) { | ||
192 | rq = &q->bar_rq; | ||
193 | |||
194 | /* initialize proxy request and queue it */ | ||
195 | blk_rq_init(q, rq); | ||
196 | if (bio_data_dir(q->orig_bar_rq->bio) == WRITE) | ||
197 | rq->cmd_flags |= REQ_WRITE; | ||
198 | if (q->ordered & QUEUE_ORDERED_DO_FUA) | ||
199 | rq->cmd_flags |= REQ_FUA; | ||
200 | init_request_from_bio(rq, q->orig_bar_rq->bio); | ||
201 | rq->end_io = bar_end_io; | ||
202 | |||
203 | elv_insert(q, rq, ELEVATOR_INSERT_FRONT); | ||
204 | } else | ||
205 | skip |= QUEUE_ORDSEQ_BAR; | ||
206 | |||
207 | if (q->ordered & QUEUE_ORDERED_DO_PREFLUSH) { | ||
208 | queue_flush(q, QUEUE_ORDERED_DO_PREFLUSH); | ||
209 | rq = &q->pre_flush_rq; | ||
210 | } else | ||
211 | skip |= QUEUE_ORDSEQ_PREFLUSH; | ||
212 | |||
213 | if ((q->ordered & QUEUE_ORDERED_BY_DRAIN) && queue_in_flight(q)) | ||
214 | rq = NULL; | ||
215 | else | ||
216 | skip |= QUEUE_ORDSEQ_DRAIN; | ||
217 | |||
218 | *rqp = rq; | ||
219 | |||
220 | /* | ||
221 | * Complete skipped sequences. If whole sequence is complete, | ||
222 | * return false to tell elevator that this request is gone. | ||
223 | */ | ||
224 | return !blk_ordered_complete_seq(q, skip, 0); | ||
225 | } | ||
226 | |||
227 | bool blk_do_ordered(struct request_queue *q, struct request **rqp) | ||
228 | { | ||
229 | struct request *rq = *rqp; | ||
230 | const int is_barrier = rq->cmd_type == REQ_TYPE_FS && | ||
231 | (rq->cmd_flags & REQ_HARDBARRIER); | ||
232 | |||
233 | if (!q->ordseq) { | ||
234 | if (!is_barrier) | ||
235 | return true; | ||
236 | |||
237 | if (q->next_ordered != QUEUE_ORDERED_NONE) | ||
238 | return start_ordered(q, rqp); | ||
239 | else { | ||
240 | /* | ||
241 | * Queue ordering not supported. Terminate | ||
242 | * with prejudice. | ||
243 | */ | ||
244 | blk_dequeue_request(rq); | ||
245 | __blk_end_request_all(rq, -EOPNOTSUPP); | ||
246 | *rqp = NULL; | ||
247 | return false; | ||
248 | } | ||
249 | } | ||
250 | |||
251 | /* | ||
252 | * Ordered sequence in progress | ||
253 | */ | ||
254 | |||
255 | /* Special requests are not subject to ordering rules. */ | ||
256 | if (rq->cmd_type != REQ_TYPE_FS && | ||
257 | rq != &q->pre_flush_rq && rq != &q->post_flush_rq) | ||
258 | return true; | ||
259 | |||
260 | if (q->ordered & QUEUE_ORDERED_BY_TAG) { | ||
261 | /* Ordered by tag. Blocking the next barrier is enough. */ | ||
262 | if (is_barrier && rq != &q->bar_rq) | ||
263 | *rqp = NULL; | ||
264 | } else { | ||
265 | /* Ordered by draining. Wait for turn. */ | ||
266 | WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q)); | ||
267 | if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q)) | ||
268 | *rqp = NULL; | ||
269 | } | ||
270 | |||
271 | return true; | ||
272 | } | ||
273 | |||
274 | static void bio_end_empty_barrier(struct bio *bio, int err) | ||
275 | { | ||
276 | if (err) { | ||
277 | if (err == -EOPNOTSUPP) | ||
278 | set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); | ||
279 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | ||
280 | } | ||
281 | if (bio->bi_private) | ||
282 | complete(bio->bi_private); | ||
283 | bio_put(bio); | ||
284 | } | ||
285 | |||
286 | /** | ||
287 | * blkdev_issue_flush - queue a flush | ||
288 | * @bdev: blockdev to issue flush for | ||
289 | * @gfp_mask: memory allocation flags (for bio_alloc) | ||
290 | * @error_sector: error sector | ||
291 | * @flags: BLKDEV_IFL_* flags to control behaviour | ||
292 | * | ||
293 | * Description: | ||
294 | * Issue a flush for the block device in question. Caller can supply | ||
295 | * room for storing the error offset in case of a flush error, if they | ||
296 | * wish to. If WAIT flag is not passed then caller may check only what | ||
297 | * request was pushed in some internal queue for later handling. | ||
298 | */ | ||
299 | int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, | ||
300 | sector_t *error_sector, unsigned long flags) | ||
301 | { | ||
302 | DECLARE_COMPLETION_ONSTACK(wait); | ||
303 | struct request_queue *q; | ||
304 | struct bio *bio; | ||
305 | int ret = 0; | ||
306 | |||
307 | if (bdev->bd_disk == NULL) | ||
308 | return -ENXIO; | ||
309 | |||
310 | q = bdev_get_queue(bdev); | ||
311 | if (!q) | ||
312 | return -ENXIO; | ||
313 | |||
314 | /* | ||
315 | * some block devices may not have their queue correctly set up here | ||
316 | * (e.g. loop device without a backing file) and so issuing a flush | ||
317 | * here will panic. Ensure there is a request function before issuing | ||
318 | * the barrier. | ||
319 | */ | ||
320 | if (!q->make_request_fn) | ||
321 | return -ENXIO; | ||
322 | |||
323 | bio = bio_alloc(gfp_mask, 0); | ||
324 | bio->bi_end_io = bio_end_empty_barrier; | ||
325 | bio->bi_bdev = bdev; | ||
326 | if (test_bit(BLKDEV_WAIT, &flags)) | ||
327 | bio->bi_private = &wait; | ||
328 | |||
329 | bio_get(bio); | ||
330 | submit_bio(WRITE_BARRIER, bio); | ||
331 | if (test_bit(BLKDEV_WAIT, &flags)) { | ||
332 | wait_for_completion(&wait); | ||
333 | /* | ||
334 | * The driver must store the error location in ->bi_sector, if | ||
335 | * it supports it. For non-stacked drivers, this should be | ||
336 | * copied from blk_rq_pos(rq). | ||
337 | */ | ||
338 | if (error_sector) | ||
339 | *error_sector = bio->bi_sector; | ||
340 | } | ||
341 | |||
342 | if (bio_flagged(bio, BIO_EOPNOTSUPP)) | ||
343 | ret = -EOPNOTSUPP; | ||
344 | else if (!bio_flagged(bio, BIO_UPTODATE)) | ||
345 | ret = -EIO; | ||
346 | |||
347 | bio_put(bio); | ||
348 | return ret; | ||
349 | } | ||
350 | EXPORT_SYMBOL(blkdev_issue_flush); | ||
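(Illustration, not part of the commit.) blk-barrier.c is deleted outright; per the Makefile change and the diffstat above, the flush handling moves to the new blk-flush.c. For reference, here is a hedged sketch of a caller of the blkdev_issue_flush() interface that the removed file exported, using only the signature and the BLKDEV_WAIT bit visible above; the wrapper function itself is hypothetical.

/* Hedged sketch, not from this commit: issue a cache flush and wait
 * for it, via the blkdev_issue_flush() signature shown in the removed
 * file above. example_sync_device() is a hypothetical caller. */
#include <linux/bitops.h>
#include <linux/blkdev.h>

static int example_sync_device(struct block_device *bdev)
{
	sector_t error_sector = 0;
	unsigned long flags = 0;

	__set_bit(BLKDEV_WAIT, &flags);	/* request a blocking flush */
	return blkdev_issue_flush(bdev, GFP_KERNEL, &error_sector, flags);
}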
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 2fef1ef931a0..bcaf16ee6ad1 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -30,18 +30,22 @@ EXPORT_SYMBOL_GPL(blkio_root_cgroup); | |||
30 | 30 | ||
31 | static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *, | 31 | static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *, |
32 | struct cgroup *); | 32 | struct cgroup *); |
33 | static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *, | 33 | static int blkiocg_can_attach_task(struct cgroup *, struct task_struct *); |
34 | struct task_struct *, bool); | 34 | static void blkiocg_attach_task(struct cgroup *, struct task_struct *); |
35 | static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *, | ||
36 | struct cgroup *, struct task_struct *, bool); | ||
37 | static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *); | 35 | static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *); |
38 | static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *); | 36 | static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *); |
39 | 37 | ||
38 | /* for encoding cft->private value on file */ | ||
39 | #define BLKIOFILE_PRIVATE(x, val) (((x) << 16) | (val)) | ||
40 | /* What policy owns the file, proportional or throttle */ | ||
41 | #define BLKIOFILE_POLICY(val) (((val) >> 16) & 0xffff) | ||
42 | #define BLKIOFILE_ATTR(val) ((val) & 0xffff) | ||
43 | |||
40 | struct cgroup_subsys blkio_subsys = { | 44 | struct cgroup_subsys blkio_subsys = { |
41 | .name = "blkio", | 45 | .name = "blkio", |
42 | .create = blkiocg_create, | 46 | .create = blkiocg_create, |
43 | .can_attach = blkiocg_can_attach, | 47 | .can_attach_task = blkiocg_can_attach_task, |
44 | .attach = blkiocg_attach, | 48 | .attach_task = blkiocg_attach_task, |
45 | .destroy = blkiocg_destroy, | 49 | .destroy = blkiocg_destroy, |
46 | .populate = blkiocg_populate, | 50 | .populate = blkiocg_populate, |
47 | #ifdef CONFIG_BLK_CGROUP | 51 | #ifdef CONFIG_BLK_CGROUP |
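(Illustration, not part of the commit.) The BLKIOFILE_PRIVATE()/BLKIOFILE_POLICY()/BLKIOFILE_ATTR() macros added above pack a policy id and a file id into cft->private so that one read/write handler can serve many cgroup files. A hedged sketch of how a throttle file entry could be declared with them follows; the real cftype definitions live later in blk-cgroup.c, outside this excerpt, so the field choices here are assumptions.

/* Hedged sketch, not from this commit: a cftype entry that encodes its
 * owning policy and attribute with BLKIOFILE_PRIVATE(), to be decoded
 * by the common handlers via BLKIOFILE_POLICY()/BLKIOFILE_ATTR(). */
static struct cftype example_throttle_cftype = {
	.name		= "throttle.read_bps_device",
	.private	= BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
					    BLKIO_THROTL_read_bps_device),
	.read_seq_string = blkiocg_file_read,
	.write_string	= blkiocg_file_write,
};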
@@ -59,6 +63,27 @@ static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg, | |||
59 | list_add(&pn->node, &blkcg->policy_list); | 63 | list_add(&pn->node, &blkcg->policy_list); |
60 | } | 64 | } |
61 | 65 | ||
66 | static inline bool cftype_blkg_same_policy(struct cftype *cft, | ||
67 | struct blkio_group *blkg) | ||
68 | { | ||
69 | enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); | ||
70 | |||
71 | if (blkg->plid == plid) | ||
72 | return 1; | ||
73 | |||
74 | return 0; | ||
75 | } | ||
76 | |||
77 | /* Determines if policy node matches cgroup file being accessed */ | ||
78 | static inline bool pn_matches_cftype(struct cftype *cft, | ||
79 | struct blkio_policy_node *pn) | ||
80 | { | ||
81 | enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); | ||
82 | int fileid = BLKIOFILE_ATTR(cft->private); | ||
83 | |||
84 | return (plid == pn->plid && fileid == pn->fileid); | ||
85 | } | ||
86 | |||
62 | /* Must be called with blkcg->lock held */ | 87 | /* Must be called with blkcg->lock held */ |
63 | static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) | 88 | static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) |
64 | { | 89 | { |
@@ -67,12 +92,13 @@ static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) | |||
67 | 92 | ||
68 | /* Must be called with blkcg->lock held */ | 93 | /* Must be called with blkcg->lock held */ |
69 | static struct blkio_policy_node * | 94 | static struct blkio_policy_node * |
70 | blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev) | 95 | blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev, |
96 | enum blkio_policy_id plid, int fileid) | ||
71 | { | 97 | { |
72 | struct blkio_policy_node *pn; | 98 | struct blkio_policy_node *pn; |
73 | 99 | ||
74 | list_for_each_entry(pn, &blkcg->policy_list, node) { | 100 | list_for_each_entry(pn, &blkcg->policy_list, node) { |
75 | if (pn->dev == dev) | 101 | if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid) |
76 | return pn; | 102 | return pn; |
77 | } | 103 | } |
78 | 104 | ||
@@ -86,6 +112,74 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) | |||
86 | } | 112 | } |
87 | EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); | 113 | EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); |
88 | 114 | ||
115 | struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk) | ||
116 | { | ||
117 | return container_of(task_subsys_state(tsk, blkio_subsys_id), | ||
118 | struct blkio_cgroup, css); | ||
119 | } | ||
120 | EXPORT_SYMBOL_GPL(task_blkio_cgroup); | ||
121 | |||
122 | static inline void | ||
123 | blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight) | ||
124 | { | ||
125 | struct blkio_policy_type *blkiop; | ||
126 | |||
127 | list_for_each_entry(blkiop, &blkio_list, list) { | ||
128 | /* If this policy does not own the blkg, do not send updates */ | ||
129 | if (blkiop->plid != blkg->plid) | ||
130 | continue; | ||
131 | if (blkiop->ops.blkio_update_group_weight_fn) | ||
132 | blkiop->ops.blkio_update_group_weight_fn(blkg->key, | ||
133 | blkg, weight); | ||
134 | } | ||
135 | } | ||
136 | |||
137 | static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps, | ||
138 | int fileid) | ||
139 | { | ||
140 | struct blkio_policy_type *blkiop; | ||
141 | |||
142 | list_for_each_entry(blkiop, &blkio_list, list) { | ||
143 | |||
144 | /* If this policy does not own the blkg, do not send updates */ | ||
145 | if (blkiop->plid != blkg->plid) | ||
146 | continue; | ||
147 | |||
148 | if (fileid == BLKIO_THROTL_read_bps_device | ||
149 | && blkiop->ops.blkio_update_group_read_bps_fn) | ||
150 | blkiop->ops.blkio_update_group_read_bps_fn(blkg->key, | ||
151 | blkg, bps); | ||
152 | |||
153 | if (fileid == BLKIO_THROTL_write_bps_device | ||
154 | && blkiop->ops.blkio_update_group_write_bps_fn) | ||
155 | blkiop->ops.blkio_update_group_write_bps_fn(blkg->key, | ||
156 | blkg, bps); | ||
157 | } | ||
158 | } | ||
159 | |||
160 | static inline void blkio_update_group_iops(struct blkio_group *blkg, | ||
161 | unsigned int iops, int fileid) | ||
162 | { | ||
163 | struct blkio_policy_type *blkiop; | ||
164 | |||
165 | list_for_each_entry(blkiop, &blkio_list, list) { | ||
166 | |||
167 | /* If this policy does not own the blkg, do not send updates */ | ||
168 | if (blkiop->plid != blkg->plid) | ||
169 | continue; | ||
170 | |||
171 | if (fileid == BLKIO_THROTL_read_iops_device | ||
172 | && blkiop->ops.blkio_update_group_read_iops_fn) | ||
173 | blkiop->ops.blkio_update_group_read_iops_fn(blkg->key, | ||
174 | blkg, iops); | ||
175 | |||
176 | if (fileid == BLKIO_THROTL_write_iops_device | ||
177 | && blkiop->ops.blkio_update_group_write_iops_fn) | ||
178 | blkiop->ops.blkio_update_group_write_iops_fn(blkg->key, | ||
179 | blkg,iops); | ||
180 | } | ||
181 | } | ||
182 | |||
89 | /* | 183 | /* |
90 | * Add to the appropriate stat variable depending on the request type. | 184 | * Add to the appropriate stat variable depending on the request type. |
91 | * This should be called with the blkg->stats_lock held. | 185 | * This should be called with the blkg->stats_lock held. |
@@ -282,30 +376,47 @@ void blkiocg_update_io_remove_stats(struct blkio_group *blkg, | |||
282 | } | 376 | } |
283 | EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats); | 377 | EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats); |
284 | 378 | ||
285 | void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time) | 379 | void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time, |
380 | unsigned long unaccounted_time) | ||
286 | { | 381 | { |
287 | unsigned long flags; | 382 | unsigned long flags; |
288 | 383 | ||
289 | spin_lock_irqsave(&blkg->stats_lock, flags); | 384 | spin_lock_irqsave(&blkg->stats_lock, flags); |
290 | blkg->stats.time += time; | 385 | blkg->stats.time += time; |
386 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
387 | blkg->stats.unaccounted_time += unaccounted_time; | ||
388 | #endif | ||
291 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | 389 | spin_unlock_irqrestore(&blkg->stats_lock, flags); |
292 | } | 390 | } |
293 | EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); | 391 | EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); |
294 | 392 | ||
393 | /* | ||
394 | * should be called under rcu read lock or queue lock to make sure blkg pointer | ||
395 | * is valid. | ||
396 | */ | ||
295 | void blkiocg_update_dispatch_stats(struct blkio_group *blkg, | 397 | void blkiocg_update_dispatch_stats(struct blkio_group *blkg, |
296 | uint64_t bytes, bool direction, bool sync) | 398 | uint64_t bytes, bool direction, bool sync) |
297 | { | 399 | { |
298 | struct blkio_group_stats *stats; | 400 | struct blkio_group_stats_cpu *stats_cpu; |
299 | unsigned long flags; | 401 | unsigned long flags; |
300 | 402 | ||
301 | spin_lock_irqsave(&blkg->stats_lock, flags); | 403 | /* |
302 | stats = &blkg->stats; | 404 | * Disabling interrupts to provide mutual exclusion between two |
303 | stats->sectors += bytes >> 9; | 405 | * writes on same cpu. It probably is not needed for 64bit. Not |
304 | blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction, | 406 | * optimizing that case yet. |
305 | sync); | 407 | */ |
306 | blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes, | 408 | local_irq_save(flags); |
307 | direction, sync); | 409 | |
308 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | 410 | stats_cpu = this_cpu_ptr(blkg->stats_cpu); |
411 | |||
412 | u64_stats_update_begin(&stats_cpu->syncp); | ||
413 | stats_cpu->sectors += bytes >> 9; | ||
414 | blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED], | ||
415 | 1, direction, sync); | ||
416 | blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES], | ||
417 | bytes, direction, sync); | ||
418 | u64_stats_update_end(&stats_cpu->syncp); | ||
419 | local_irq_restore(flags); | ||
309 | } | 420 | } |
310 | EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats); | 421 | EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats); |
311 | 422 | ||
@@ -328,20 +439,47 @@ void blkiocg_update_completion_stats(struct blkio_group *blkg, | |||
328 | } | 439 | } |
329 | EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats); | 440 | EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats); |
330 | 441 | ||
442 | /* Merged stats are per cpu. */ | ||
331 | void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, | 443 | void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, |
332 | bool sync) | 444 | bool sync) |
333 | { | 445 | { |
446 | struct blkio_group_stats_cpu *stats_cpu; | ||
334 | unsigned long flags; | 447 | unsigned long flags; |
335 | 448 | ||
336 | spin_lock_irqsave(&blkg->stats_lock, flags); | 449 | /* |
337 | blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction, | 450 | * Disabling interrupts to provide mutual exclusion between two |
338 | sync); | 451 | * writes on same cpu. It probably is not needed for 64bit. Not |
339 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | 452 | * optimizing that case yet. |
453 | */ | ||
454 | local_irq_save(flags); | ||
455 | |||
456 | stats_cpu = this_cpu_ptr(blkg->stats_cpu); | ||
457 | |||
458 | u64_stats_update_begin(&stats_cpu->syncp); | ||
459 | blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1, | ||
460 | direction, sync); | ||
461 | u64_stats_update_end(&stats_cpu->syncp); | ||
462 | local_irq_restore(flags); | ||
340 | } | 463 | } |
341 | EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); | 464 | EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); |
342 | 465 | ||
466 | /* | ||
467 | * This function allocates the per cpu stats for blkio_group. Should be called | ||
468 | * from sleepable context as alloc_per_cpu() requires that. | ||
469 | */ | ||
470 | int blkio_alloc_blkg_stats(struct blkio_group *blkg) | ||
471 | { | ||
472 | /* Allocate memory for per cpu stats */ | ||
473 | blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu); | ||
474 | if (!blkg->stats_cpu) | ||
475 | return -ENOMEM; | ||
476 | return 0; | ||
477 | } | ||
478 | EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats); | ||
479 | |||
343 | void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, | 480 | void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, |
344 | struct blkio_group *blkg, void *key, dev_t dev) | 481 | struct blkio_group *blkg, void *key, dev_t dev, |
482 | enum blkio_policy_id plid) | ||
345 | { | 483 | { |
346 | unsigned long flags; | 484 | unsigned long flags; |
347 | 485 | ||
@@ -350,6 +488,7 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, | |||
350 | rcu_assign_pointer(blkg->key, key); | 488 | rcu_assign_pointer(blkg->key, key); |
351 | blkg->blkcg_id = css_id(&blkcg->css); | 489 | blkg->blkcg_id = css_id(&blkcg->css); |
352 | hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); | 490 | hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); |
491 | blkg->plid = plid; | ||
353 | spin_unlock_irqrestore(&blkcg->lock, flags); | 492 | spin_unlock_irqrestore(&blkcg->lock, flags); |
354 | /* Need to take css reference ? */ | 493 | /* Need to take css reference ? */ |
355 | cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); | 494 | cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); |
@@ -408,49 +547,28 @@ struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) | |||
408 | } | 547 | } |
409 | EXPORT_SYMBOL_GPL(blkiocg_lookup_group); | 548 | EXPORT_SYMBOL_GPL(blkiocg_lookup_group); |
410 | 549 | ||
411 | #define SHOW_FUNCTION(__VAR) \ | 550 | static void blkio_reset_stats_cpu(struct blkio_group *blkg) |
412 | static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup, \ | ||
413 | struct cftype *cftype) \ | ||
414 | { \ | ||
415 | struct blkio_cgroup *blkcg; \ | ||
416 | \ | ||
417 | blkcg = cgroup_to_blkio_cgroup(cgroup); \ | ||
418 | return (u64)blkcg->__VAR; \ | ||
419 | } | ||
420 | |||
421 | SHOW_FUNCTION(weight); | ||
422 | #undef SHOW_FUNCTION | ||
423 | |||
424 | static int | ||
425 | blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) | ||
426 | { | 551 | { |
427 | struct blkio_cgroup *blkcg; | 552 | struct blkio_group_stats_cpu *stats_cpu; |
428 | struct blkio_group *blkg; | 553 | int i, j, k; |
429 | struct hlist_node *n; | 554 | /* |
430 | struct blkio_policy_type *blkiop; | 555 | * Note: On 64 bit arch this should not be an issue. This has the |
431 | struct blkio_policy_node *pn; | 556 | * possibility of returning some inconsistent value on 32bit arch |
432 | 557 | * as 64bit update on 32bit is non atomic. Taking care of this | |
433 | if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) | 558 | * corner case makes code very complicated, like sending IPIs to |
434 | return -EINVAL; | 559 | * cpus, taking care of stats of offline cpus etc. |
435 | 560 | * | |
436 | blkcg = cgroup_to_blkio_cgroup(cgroup); | 561 | * reset stats is anyway more of a debug feature and this sounds a |
437 | spin_lock(&blkio_list_lock); | 562 | * corner case. So I am not complicating the code yet until and |
438 | spin_lock_irq(&blkcg->lock); | 563 | * unless this becomes a real issue. |
439 | blkcg->weight = (unsigned int)val; | 564 | */ |
440 | 565 | for_each_possible_cpu(i) { | |
441 | hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { | 566 | stats_cpu = per_cpu_ptr(blkg->stats_cpu, i); |
442 | pn = blkio_policy_search_node(blkcg, blkg->dev); | 567 | stats_cpu->sectors = 0; |
443 | 568 | for(j = 0; j < BLKIO_STAT_CPU_NR; j++) | |
444 | if (pn) | 569 | for (k = 0; k < BLKIO_STAT_TOTAL; k++) |
445 | continue; | 570 | stats_cpu->stat_arr_cpu[j][k] = 0; |
446 | |||
447 | list_for_each_entry(blkiop, &blkio_list, list) | ||
448 | blkiop->ops.blkio_update_group_weight_fn(blkg, | ||
449 | blkcg->weight); | ||
450 | } | 571 | } |
451 | spin_unlock_irq(&blkcg->lock); | ||
452 | spin_unlock(&blkio_list_lock); | ||
453 | return 0; | ||
454 | } | 572 | } |
455 | 573 | ||
456 | static int | 574 | static int |
@@ -497,7 +615,11 @@ blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) | |||
497 | } | 615 | } |
498 | #endif | 616 | #endif |
499 | spin_unlock(&blkg->stats_lock); | 617 | spin_unlock(&blkg->stats_lock); |
618 | |||
619 | /* Reset Per cpu stats which don't take blkg->stats_lock */ | ||
620 | blkio_reset_stats_cpu(blkg); | ||
500 | } | 621 | } |
622 | |||
501 | spin_unlock_irq(&blkcg->lock); | 623 | spin_unlock_irq(&blkcg->lock); |
502 | return 0; | 624 | return 0; |
503 | } | 625 | } |
@@ -543,6 +665,59 @@ static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val, | |||
543 | return val; | 665 | return val; |
544 | } | 666 | } |
545 | 667 | ||
668 | |||
669 | static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg, | ||
670 | enum stat_type_cpu type, enum stat_sub_type sub_type) | ||
671 | { | ||
672 | int cpu; | ||
673 | struct blkio_group_stats_cpu *stats_cpu; | ||
674 | u64 val = 0, tval; | ||
675 | |||
676 | for_each_possible_cpu(cpu) { | ||
677 | unsigned int start; | ||
678 | stats_cpu = per_cpu_ptr(blkg->stats_cpu, cpu); | ||
679 | |||
680 | do { | ||
681 | start = u64_stats_fetch_begin(&stats_cpu->syncp); | ||
682 | if (type == BLKIO_STAT_CPU_SECTORS) | ||
683 | tval = stats_cpu->sectors; | ||
684 | else | ||
685 | tval = stats_cpu->stat_arr_cpu[type][sub_type]; | ||
686 | } while(u64_stats_fetch_retry(&stats_cpu->syncp, start)); | ||
687 | |||
688 | val += tval; | ||
689 | } | ||
690 | |||
691 | return val; | ||
692 | } | ||
693 | |||
694 | static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg, | ||
695 | struct cgroup_map_cb *cb, dev_t dev, enum stat_type_cpu type) | ||
696 | { | ||
697 | uint64_t disk_total, val; | ||
698 | char key_str[MAX_KEY_LEN]; | ||
699 | enum stat_sub_type sub_type; | ||
700 | |||
701 | if (type == BLKIO_STAT_CPU_SECTORS) { | ||
702 | val = blkio_read_stat_cpu(blkg, type, 0); | ||
703 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb, dev); | ||
704 | } | ||
705 | |||
706 | for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL; | ||
707 | sub_type++) { | ||
708 | blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false); | ||
709 | val = blkio_read_stat_cpu(blkg, type, sub_type); | ||
710 | cb->fill(cb, key_str, val); | ||
711 | } | ||
712 | |||
713 | disk_total = blkio_read_stat_cpu(blkg, type, BLKIO_STAT_READ) + | ||
714 | blkio_read_stat_cpu(blkg, type, BLKIO_STAT_WRITE); | ||
715 | |||
716 | blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false); | ||
717 | cb->fill(cb, key_str, disk_total); | ||
718 | return disk_total; | ||
719 | } | ||
720 | |||
546 | /* This should be called with blkg->stats_lock held */ | 721 | /* This should be called with blkg->stats_lock held */ |
547 | static uint64_t blkio_get_stat(struct blkio_group *blkg, | 722 | static uint64_t blkio_get_stat(struct blkio_group *blkg, |
548 | struct cgroup_map_cb *cb, dev_t dev, enum stat_type type) | 723 | struct cgroup_map_cb *cb, dev_t dev, enum stat_type type) |
@@ -554,10 +729,10 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg, | |||
554 | if (type == BLKIO_STAT_TIME) | 729 | if (type == BLKIO_STAT_TIME) |
555 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, | 730 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, |
556 | blkg->stats.time, cb, dev); | 731 | blkg->stats.time, cb, dev); |
557 | if (type == BLKIO_STAT_SECTORS) | ||
558 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, | ||
559 | blkg->stats.sectors, cb, dev); | ||
560 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 732 | #ifdef CONFIG_DEBUG_BLK_CGROUP |
733 | if (type == BLKIO_STAT_UNACCOUNTED_TIME) | ||
734 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, | ||
735 | blkg->stats.unaccounted_time, cb, dev); | ||
561 | if (type == BLKIO_STAT_AVG_QUEUE_SIZE) { | 736 | if (type == BLKIO_STAT_AVG_QUEUE_SIZE) { |
562 | uint64_t sum = blkg->stats.avg_queue_size_sum; | 737 | uint64_t sum = blkg->stats.avg_queue_size_sum; |
563 | uint64_t samples = blkg->stats.avg_queue_size_samples; | 738 | uint64_t samples = blkg->stats.avg_queue_size_samples; |
@@ -593,52 +768,6 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg, | |||
593 | return disk_total; | 768 | return disk_total; |
594 | } | 769 | } |
595 | 770 | ||
596 | #define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total) \ | ||
597 | static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \ | ||
598 | struct cftype *cftype, struct cgroup_map_cb *cb) \ | ||
599 | { \ | ||
600 | struct blkio_cgroup *blkcg; \ | ||
601 | struct blkio_group *blkg; \ | ||
602 | struct hlist_node *n; \ | ||
603 | uint64_t cgroup_total = 0; \ | ||
604 | \ | ||
605 | if (!cgroup_lock_live_group(cgroup)) \ | ||
606 | return -ENODEV; \ | ||
607 | \ | ||
608 | blkcg = cgroup_to_blkio_cgroup(cgroup); \ | ||
609 | rcu_read_lock(); \ | ||
610 | hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\ | ||
611 | if (blkg->dev) { \ | ||
612 | spin_lock_irq(&blkg->stats_lock); \ | ||
613 | cgroup_total += blkio_get_stat(blkg, cb, \ | ||
614 | blkg->dev, type); \ | ||
615 | spin_unlock_irq(&blkg->stats_lock); \ | ||
616 | } \ | ||
617 | } \ | ||
618 | if (show_total) \ | ||
619 | cb->fill(cb, "Total", cgroup_total); \ | ||
620 | rcu_read_unlock(); \ | ||
621 | cgroup_unlock(); \ | ||
622 | return 0; \ | ||
623 | } | ||
624 | |||
625 | SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0); | ||
626 | SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0); | ||
627 | SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1); | ||
628 | SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1); | ||
629 | SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1); | ||
630 | SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1); | ||
631 | SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1); | ||
632 | SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1); | ||
633 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
634 | SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0); | ||
635 | SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0); | ||
636 | SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0); | ||
637 | SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0); | ||
638 | SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0); | ||
639 | #endif | ||
640 | #undef SHOW_FUNCTION_PER_GROUP | ||
641 | |||
642 | static int blkio_check_dev_num(dev_t dev) | 771 | static int blkio_check_dev_num(dev_t dev) |
643 | { | 772 | { |
644 | int part = 0; | 773 | int part = 0; |
@@ -652,13 +781,14 @@ static int blkio_check_dev_num(dev_t dev) | |||
652 | } | 781 | } |
653 | 782 | ||
654 | static int blkio_policy_parse_and_set(char *buf, | 783 | static int blkio_policy_parse_and_set(char *buf, |
655 | struct blkio_policy_node *newpn) | 784 | struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid) |
656 | { | 785 | { |
657 | char *s[4], *p, *major_s = NULL, *minor_s = NULL; | 786 | char *s[4], *p, *major_s = NULL, *minor_s = NULL; |
658 | int ret; | 787 | int ret; |
659 | unsigned long major, minor, temp; | 788 | unsigned long major, minor, temp; |
660 | int i = 0; | 789 | int i = 0; |
661 | dev_t dev; | 790 | dev_t dev; |
791 | u64 bps, iops; | ||
662 | 792 | ||
663 | memset(s, 0, sizeof(s)); | 793 | memset(s, 0, sizeof(s)); |
664 | 794 | ||
@@ -705,12 +835,47 @@ static int blkio_policy_parse_and_set(char *buf, | |||
705 | if (s[1] == NULL) | 835 | if (s[1] == NULL) |
706 | return -EINVAL; | 836 | return -EINVAL; |
707 | 837 | ||
708 | ret = strict_strtoul(s[1], 10, &temp); | 838 | switch (plid) { |
709 | if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) || | 839 | case BLKIO_POLICY_PROP: |
710 | temp > BLKIO_WEIGHT_MAX) | 840 | ret = strict_strtoul(s[1], 10, &temp); |
711 | return -EINVAL; | 841 | if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) || |
842 | temp > BLKIO_WEIGHT_MAX) | ||
843 | return -EINVAL; | ||
712 | 844 | ||
713 | newpn->weight = temp; | 845 | newpn->plid = plid; |
846 | newpn->fileid = fileid; | ||
847 | newpn->val.weight = temp; | ||
848 | break; | ||
849 | case BLKIO_POLICY_THROTL: | ||
850 | switch(fileid) { | ||
851 | case BLKIO_THROTL_read_bps_device: | ||
852 | case BLKIO_THROTL_write_bps_device: | ||
853 | ret = strict_strtoull(s[1], 10, &bps); | ||
854 | if (ret) | ||
855 | return -EINVAL; | ||
856 | |||
857 | newpn->plid = plid; | ||
858 | newpn->fileid = fileid; | ||
859 | newpn->val.bps = bps; | ||
860 | break; | ||
861 | case BLKIO_THROTL_read_iops_device: | ||
862 | case BLKIO_THROTL_write_iops_device: | ||
863 | ret = strict_strtoull(s[1], 10, &iops); | ||
864 | if (ret) | ||
865 | return -EINVAL; | ||
866 | |||
867 | if (iops > THROTL_IOPS_MAX) | ||
868 | return -EINVAL; | ||
869 | |||
870 | newpn->plid = plid; | ||
871 | newpn->fileid = fileid; | ||
872 | newpn->val.iops = (unsigned int)iops; | ||
873 | break; | ||
874 | } | ||
875 | break; | ||
876 | default: | ||
877 | BUG(); | ||
878 | } | ||
714 | 879 | ||
715 | return 0; | 880 | return 0; |
716 | } | 881 | } |
@@ -720,26 +885,180 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, | |||
720 | { | 885 | { |
721 | struct blkio_policy_node *pn; | 886 | struct blkio_policy_node *pn; |
722 | 887 | ||
723 | pn = blkio_policy_search_node(blkcg, dev); | 888 | pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP, |
889 | BLKIO_PROP_weight_device); | ||
724 | if (pn) | 890 | if (pn) |
725 | return pn->weight; | 891 | return pn->val.weight; |
726 | else | 892 | else |
727 | return blkcg->weight; | 893 | return blkcg->weight; |
728 | } | 894 | } |
729 | EXPORT_SYMBOL_GPL(blkcg_get_weight); | 895 | EXPORT_SYMBOL_GPL(blkcg_get_weight); |
730 | 896 | ||
897 | uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev) | ||
898 | { | ||
899 | struct blkio_policy_node *pn; | ||
900 | |||
901 | pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, | ||
902 | BLKIO_THROTL_read_bps_device); | ||
903 | if (pn) | ||
904 | return pn->val.bps; | ||
905 | else | ||
906 | return -1; | ||
907 | } | ||
908 | |||
909 | uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev) | ||
910 | { | ||
911 | struct blkio_policy_node *pn; | ||
912 | pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, | ||
913 | BLKIO_THROTL_write_bps_device); | ||
914 | if (pn) | ||
915 | return pn->val.bps; | ||
916 | else | ||
917 | return -1; | ||
918 | } | ||
919 | |||
920 | unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev) | ||
921 | { | ||
922 | struct blkio_policy_node *pn; | ||
923 | |||
924 | pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, | ||
925 | BLKIO_THROTL_read_iops_device); | ||
926 | if (pn) | ||
927 | return pn->val.iops; | ||
928 | else | ||
929 | return -1; | ||
930 | } | ||
931 | |||
932 | unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev) | ||
933 | { | ||
934 | struct blkio_policy_node *pn; | ||
935 | pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, | ||
936 | BLKIO_THROTL_write_iops_device); | ||
937 | if (pn) | ||
938 | return pn->val.iops; | ||
939 | else | ||
940 | return -1; | ||
941 | } | ||
942 | |||
943 | /* Checks whether user asked for deleting a policy rule */ | ||
944 | static bool blkio_delete_rule_command(struct blkio_policy_node *pn) | ||
945 | { | ||
946 | switch(pn->plid) { | ||
947 | case BLKIO_POLICY_PROP: | ||
948 | if (pn->val.weight == 0) | ||
949 | return 1; | ||
950 | break; | ||
951 | case BLKIO_POLICY_THROTL: | ||
952 | switch(pn->fileid) { | ||
953 | case BLKIO_THROTL_read_bps_device: | ||
954 | case BLKIO_THROTL_write_bps_device: | ||
955 | if (pn->val.bps == 0) | ||
956 | return 1; | ||
957 | break; | ||
958 | case BLKIO_THROTL_read_iops_device: | ||
959 | case BLKIO_THROTL_write_iops_device: | ||
960 | if (pn->val.iops == 0) | ||
961 | return 1; | ||
962 | } | ||
963 | break; | ||
964 | default: | ||
965 | BUG(); | ||
966 | } | ||
967 | |||
968 | return 0; | ||
969 | } | ||
970 | |||
971 | static void blkio_update_policy_rule(struct blkio_policy_node *oldpn, | ||
972 | struct blkio_policy_node *newpn) | ||
973 | { | ||
974 | switch(oldpn->plid) { | ||
975 | case BLKIO_POLICY_PROP: | ||
976 | oldpn->val.weight = newpn->val.weight; | ||
977 | break; | ||
978 | case BLKIO_POLICY_THROTL: | ||
979 | switch(newpn->fileid) { | ||
980 | case BLKIO_THROTL_read_bps_device: | ||
981 | case BLKIO_THROTL_write_bps_device: | ||
982 | oldpn->val.bps = newpn->val.bps; | ||
983 | break; | ||
984 | case BLKIO_THROTL_read_iops_device: | ||
985 | case BLKIO_THROTL_write_iops_device: | ||
986 | oldpn->val.iops = newpn->val.iops; | ||
987 | } | ||
988 | break; | ||
989 | default: | ||
990 | BUG(); | ||
991 | } | ||
992 | } | ||
993 | |||
994 | /* | ||
995 | * Some rules/values in blkg have changed. Propagate those to respective | ||
996 | * policies. | ||
997 | */ | ||
998 | static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg, | ||
999 | struct blkio_group *blkg, struct blkio_policy_node *pn) | ||
1000 | { | ||
1001 | unsigned int weight, iops; | ||
1002 | u64 bps; | ||
1003 | |||
1004 | switch(pn->plid) { | ||
1005 | case BLKIO_POLICY_PROP: | ||
1006 | weight = pn->val.weight ? pn->val.weight : | ||
1007 | blkcg->weight; | ||
1008 | blkio_update_group_weight(blkg, weight); | ||
1009 | break; | ||
1010 | case BLKIO_POLICY_THROTL: | ||
1011 | switch(pn->fileid) { | ||
1012 | case BLKIO_THROTL_read_bps_device: | ||
1013 | case BLKIO_THROTL_write_bps_device: | ||
1014 | bps = pn->val.bps ? pn->val.bps : (-1); | ||
1015 | blkio_update_group_bps(blkg, bps, pn->fileid); | ||
1016 | break; | ||
1017 | case BLKIO_THROTL_read_iops_device: | ||
1018 | case BLKIO_THROTL_write_iops_device: | ||
1019 | iops = pn->val.iops ? pn->val.iops : (-1); | ||
1020 | blkio_update_group_iops(blkg, iops, pn->fileid); | ||
1021 | break; | ||
1022 | } | ||
1023 | break; | ||
1024 | default: | ||
1025 | BUG(); | ||
1026 | } | ||
1027 | } | ||
731 | 1028 | ||
732 | static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, | 1029 | /* |
733 | const char *buffer) | 1030 | * A policy node rule has been updated. Propagate this update to all the |
1031 | * block groups which might be affected by this update. | ||
1032 | */ | ||
1033 | static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg, | ||
1034 | struct blkio_policy_node *pn) | ||
1035 | { | ||
1036 | struct blkio_group *blkg; | ||
1037 | struct hlist_node *n; | ||
1038 | |||
1039 | spin_lock(&blkio_list_lock); | ||
1040 | spin_lock_irq(&blkcg->lock); | ||
1041 | |||
1042 | hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { | ||
1043 | if (pn->dev != blkg->dev || pn->plid != blkg->plid) | ||
1044 | continue; | ||
1045 | blkio_update_blkg_policy(blkcg, blkg, pn); | ||
1046 | } | ||
1047 | |||
1048 | spin_unlock_irq(&blkcg->lock); | ||
1049 | spin_unlock(&blkio_list_lock); | ||
1050 | } | ||
1051 | |||
1052 | static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft, | ||
1053 | const char *buffer) | ||
734 | { | 1054 | { |
735 | int ret = 0; | 1055 | int ret = 0; |
736 | char *buf; | 1056 | char *buf; |
737 | struct blkio_policy_node *newpn, *pn; | 1057 | struct blkio_policy_node *newpn, *pn; |
738 | struct blkio_cgroup *blkcg; | 1058 | struct blkio_cgroup *blkcg; |
739 | struct blkio_group *blkg; | ||
740 | int keep_newpn = 0; | 1059 | int keep_newpn = 0; |
741 | struct hlist_node *n; | 1060 | enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); |
742 | struct blkio_policy_type *blkiop; | 1061 | int fileid = BLKIOFILE_ATTR(cft->private); |
743 | 1062 | ||
744 | buf = kstrdup(buffer, GFP_KERNEL); | 1063 | buf = kstrdup(buffer, GFP_KERNEL); |
745 | if (!buf) | 1064 | if (!buf) |
@@ -751,7 +1070,7 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, | |||
751 | goto free_buf; | 1070 | goto free_buf; |
752 | } | 1071 | } |
753 | 1072 | ||
754 | ret = blkio_policy_parse_and_set(buf, newpn); | 1073 | ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid); |
755 | if (ret) | 1074 | if (ret) |
756 | goto free_newpn; | 1075 | goto free_newpn; |
757 | 1076 | ||
@@ -759,9 +1078,9 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, | |||
759 | 1078 | ||
760 | spin_lock_irq(&blkcg->lock); | 1079 | spin_lock_irq(&blkcg->lock); |
761 | 1080 | ||
762 | pn = blkio_policy_search_node(blkcg, newpn->dev); | 1081 | pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid); |
763 | if (!pn) { | 1082 | if (!pn) { |
764 | if (newpn->weight != 0) { | 1083 | if (!blkio_delete_rule_command(newpn)) { |
765 | blkio_policy_insert_node(blkcg, newpn); | 1084 | blkio_policy_insert_node(blkcg, newpn); |
766 | keep_newpn = 1; | 1085 | keep_newpn = 1; |
767 | } | 1086 | } |
@@ -769,33 +1088,17 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, | |||
769 | goto update_io_group; | 1088 | goto update_io_group; |
770 | } | 1089 | } |
771 | 1090 | ||
772 | if (newpn->weight == 0) { | 1091 | if (blkio_delete_rule_command(newpn)) { |
773 | /* weight == 0 means deleteing a specific weight */ | ||
774 | blkio_policy_delete_node(pn); | 1092 | blkio_policy_delete_node(pn); |
775 | spin_unlock_irq(&blkcg->lock); | 1093 | spin_unlock_irq(&blkcg->lock); |
776 | goto update_io_group; | 1094 | goto update_io_group; |
777 | } | 1095 | } |
778 | spin_unlock_irq(&blkcg->lock); | 1096 | spin_unlock_irq(&blkcg->lock); |
779 | 1097 | ||
780 | pn->weight = newpn->weight; | 1098 | blkio_update_policy_rule(pn, newpn); |
781 | 1099 | ||
782 | update_io_group: | 1100 | update_io_group: |
783 | /* update weight for each cfqg */ | 1101 | blkio_update_policy_node_blkg(blkcg, newpn); |
784 | spin_lock(&blkio_list_lock); | ||
785 | spin_lock_irq(&blkcg->lock); | ||
786 | |||
787 | hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { | ||
788 | if (newpn->dev == blkg->dev) { | ||
789 | list_for_each_entry(blkiop, &blkio_list, list) | ||
790 | blkiop->ops.blkio_update_group_weight_fn(blkg, | ||
791 | newpn->weight ? | ||
792 | newpn->weight : | ||
793 | blkcg->weight); | ||
794 | } | ||
795 | } | ||
796 | |||
797 | spin_unlock_irq(&blkcg->lock); | ||
798 | spin_unlock(&blkio_list_lock); | ||
799 | 1102 | ||
800 | free_newpn: | 1103 | free_newpn: |
801 | if (!keep_newpn) | 1104 | if (!keep_newpn) |
@@ -805,23 +1108,264 @@ free_buf: | |||
805 | return ret; | 1108 | return ret; |
806 | } | 1109 | } |
807 | 1110 | ||
808 | static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft, | 1111 | static void |
809 | struct seq_file *m) | 1112 | blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn) |
810 | { | 1113 | { |
811 | struct blkio_cgroup *blkcg; | 1114 | switch(pn->plid) { |
812 | struct blkio_policy_node *pn; | 1115 | case BLKIO_POLICY_PROP: |
1116 | if (pn->fileid == BLKIO_PROP_weight_device) | ||
1117 | seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), | ||
1118 | MINOR(pn->dev), pn->val.weight); | ||
1119 | break; | ||
1120 | case BLKIO_POLICY_THROTL: | ||
1121 | switch(pn->fileid) { | ||
1122 | case BLKIO_THROTL_read_bps_device: | ||
1123 | case BLKIO_THROTL_write_bps_device: | ||
1124 | seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev), | ||
1125 | MINOR(pn->dev), pn->val.bps); | ||
1126 | break; | ||
1127 | case BLKIO_THROTL_read_iops_device: | ||
1128 | case BLKIO_THROTL_write_iops_device: | ||
1129 | seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), | ||
1130 | MINOR(pn->dev), pn->val.iops); | ||
1131 | break; | ||
1132 | } | ||
1133 | break; | ||
1134 | default: | ||
1135 | BUG(); | ||
1136 | } | ||
1137 | } | ||
813 | 1138 | ||
814 | seq_printf(m, "dev\tweight\n"); | 1139 | /* cgroup files which read their data from policy nodes end up here */ |
1140 | static void blkio_read_policy_node_files(struct cftype *cft, | ||
1141 | struct blkio_cgroup *blkcg, struct seq_file *m) | ||
1142 | { | ||
1143 | struct blkio_policy_node *pn; | ||
815 | 1144 | ||
816 | blkcg = cgroup_to_blkio_cgroup(cgrp); | ||
817 | if (!list_empty(&blkcg->policy_list)) { | 1145 | if (!list_empty(&blkcg->policy_list)) { |
818 | spin_lock_irq(&blkcg->lock); | 1146 | spin_lock_irq(&blkcg->lock); |
819 | list_for_each_entry(pn, &blkcg->policy_list, node) { | 1147 | list_for_each_entry(pn, &blkcg->policy_list, node) { |
820 | seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), | 1148 | if (!pn_matches_cftype(cft, pn)) |
821 | MINOR(pn->dev), pn->weight); | 1149 | continue; |
1150 | blkio_print_policy_node(m, pn); | ||
822 | } | 1151 | } |
823 | spin_unlock_irq(&blkcg->lock); | 1152 | spin_unlock_irq(&blkcg->lock); |
824 | } | 1153 | } |
1154 | } | ||
1155 | |||
1156 | static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft, | ||
1157 | struct seq_file *m) | ||
1158 | { | ||
1159 | struct blkio_cgroup *blkcg; | ||
1160 | enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); | ||
1161 | int name = BLKIOFILE_ATTR(cft->private); | ||
1162 | |||
1163 | blkcg = cgroup_to_blkio_cgroup(cgrp); | ||
1164 | |||
1165 | switch(plid) { | ||
1166 | case BLKIO_POLICY_PROP: | ||
1167 | switch(name) { | ||
1168 | case BLKIO_PROP_weight_device: | ||
1169 | blkio_read_policy_node_files(cft, blkcg, m); | ||
1170 | return 0; | ||
1171 | default: | ||
1172 | BUG(); | ||
1173 | } | ||
1174 | break; | ||
1175 | case BLKIO_POLICY_THROTL: | ||
1176 | switch(name){ | ||
1177 | case BLKIO_THROTL_read_bps_device: | ||
1178 | case BLKIO_THROTL_write_bps_device: | ||
1179 | case BLKIO_THROTL_read_iops_device: | ||
1180 | case BLKIO_THROTL_write_iops_device: | ||
1181 | blkio_read_policy_node_files(cft, blkcg, m); | ||
1182 | return 0; | ||
1183 | default: | ||
1184 | BUG(); | ||
1185 | } | ||
1186 | break; | ||
1187 | default: | ||
1188 | BUG(); | ||
1189 | } | ||
1190 | |||
1191 | return 0; | ||
1192 | } | ||
1193 | |||
1194 | static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg, | ||
1195 | struct cftype *cft, struct cgroup_map_cb *cb, | ||
1196 | enum stat_type type, bool show_total, bool pcpu) | ||
1197 | { | ||
1198 | struct blkio_group *blkg; | ||
1199 | struct hlist_node *n; | ||
1200 | uint64_t cgroup_total = 0; | ||
1201 | |||
1202 | rcu_read_lock(); | ||
1203 | hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { | ||
1204 | if (blkg->dev) { | ||
1205 | if (!cftype_blkg_same_policy(cft, blkg)) | ||
1206 | continue; | ||
1207 | if (pcpu) | ||
1208 | cgroup_total += blkio_get_stat_cpu(blkg, cb, | ||
1209 | blkg->dev, type); | ||
1210 | else { | ||
1211 | spin_lock_irq(&blkg->stats_lock); | ||
1212 | cgroup_total += blkio_get_stat(blkg, cb, | ||
1213 | blkg->dev, type); | ||
1214 | spin_unlock_irq(&blkg->stats_lock); | ||
1215 | } | ||
1216 | } | ||
1217 | } | ||
1218 | if (show_total) | ||
1219 | cb->fill(cb, "Total", cgroup_total); | ||
1220 | rcu_read_unlock(); | ||
1221 | return 0; | ||
1222 | } | ||
1223 | |||
1224 | /* All map kind of cgroup file get serviced by this function */ | ||
1225 | static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft, | ||
1226 | struct cgroup_map_cb *cb) | ||
1227 | { | ||
1228 | struct blkio_cgroup *blkcg; | ||
1229 | enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); | ||
1230 | int name = BLKIOFILE_ATTR(cft->private); | ||
1231 | |||
1232 | blkcg = cgroup_to_blkio_cgroup(cgrp); | ||
1233 | |||
1234 | switch(plid) { | ||
1235 | case BLKIO_POLICY_PROP: | ||
1236 | switch(name) { | ||
1237 | case BLKIO_PROP_time: | ||
1238 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1239 | BLKIO_STAT_TIME, 0, 0); | ||
1240 | case BLKIO_PROP_sectors: | ||
1241 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1242 | BLKIO_STAT_CPU_SECTORS, 0, 1); | ||
1243 | case BLKIO_PROP_io_service_bytes: | ||
1244 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1245 | BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1); | ||
1246 | case BLKIO_PROP_io_serviced: | ||
1247 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1248 | BLKIO_STAT_CPU_SERVICED, 1, 1); | ||
1249 | case BLKIO_PROP_io_service_time: | ||
1250 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1251 | BLKIO_STAT_SERVICE_TIME, 1, 0); | ||
1252 | case BLKIO_PROP_io_wait_time: | ||
1253 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1254 | BLKIO_STAT_WAIT_TIME, 1, 0); | ||
1255 | case BLKIO_PROP_io_merged: | ||
1256 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1257 | BLKIO_STAT_CPU_MERGED, 1, 1); | ||
1258 | case BLKIO_PROP_io_queued: | ||
1259 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1260 | BLKIO_STAT_QUEUED, 1, 0); | ||
1261 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
1262 | case BLKIO_PROP_unaccounted_time: | ||
1263 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1264 | BLKIO_STAT_UNACCOUNTED_TIME, 0, 0); | ||
1265 | case BLKIO_PROP_dequeue: | ||
1266 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1267 | BLKIO_STAT_DEQUEUE, 0, 0); | ||
1268 | case BLKIO_PROP_avg_queue_size: | ||
1269 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1270 | BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0); | ||
1271 | case BLKIO_PROP_group_wait_time: | ||
1272 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1273 | BLKIO_STAT_GROUP_WAIT_TIME, 0, 0); | ||
1274 | case BLKIO_PROP_idle_time: | ||
1275 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1276 | BLKIO_STAT_IDLE_TIME, 0, 0); | ||
1277 | case BLKIO_PROP_empty_time: | ||
1278 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1279 | BLKIO_STAT_EMPTY_TIME, 0, 0); | ||
1280 | #endif | ||
1281 | default: | ||
1282 | BUG(); | ||
1283 | } | ||
1284 | break; | ||
1285 | case BLKIO_POLICY_THROTL: | ||
1286 | switch(name){ | ||
1287 | case BLKIO_THROTL_io_service_bytes: | ||
1288 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1289 | BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1); | ||
1290 | case BLKIO_THROTL_io_serviced: | ||
1291 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1292 | BLKIO_STAT_CPU_SERVICED, 1, 1); | ||
1293 | default: | ||
1294 | BUG(); | ||
1295 | } | ||
1296 | break; | ||
1297 | default: | ||
1298 | BUG(); | ||
1299 | } | ||
1300 | |||
1301 | return 0; | ||
1302 | } | ||
1303 | |||
1304 | static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val) | ||
1305 | { | ||
1306 | struct blkio_group *blkg; | ||
1307 | struct hlist_node *n; | ||
1308 | struct blkio_policy_node *pn; | ||
1309 | |||
1310 | if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) | ||
1311 | return -EINVAL; | ||
1312 | |||
1313 | spin_lock(&blkio_list_lock); | ||
1314 | spin_lock_irq(&blkcg->lock); | ||
1315 | blkcg->weight = (unsigned int)val; | ||
1316 | |||
1317 | hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { | ||
1318 | pn = blkio_policy_search_node(blkcg, blkg->dev, | ||
1319 | BLKIO_POLICY_PROP, BLKIO_PROP_weight_device); | ||
1320 | if (pn) | ||
1321 | continue; | ||
1322 | |||
1323 | blkio_update_group_weight(blkg, blkcg->weight); | ||
1324 | } | ||
1325 | spin_unlock_irq(&blkcg->lock); | ||
1326 | spin_unlock(&blkio_list_lock); | ||
1327 | return 0; | ||
1328 | } | ||
1329 | |||
1330 | static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) { | ||
1331 | struct blkio_cgroup *blkcg; | ||
1332 | enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); | ||
1333 | int name = BLKIOFILE_ATTR(cft->private); | ||
1334 | |||
1335 | blkcg = cgroup_to_blkio_cgroup(cgrp); | ||
1336 | |||
1337 | switch(plid) { | ||
1338 | case BLKIO_POLICY_PROP: | ||
1339 | switch(name) { | ||
1340 | case BLKIO_PROP_weight: | ||
1341 | return (u64)blkcg->weight; | ||
1342 | } | ||
1343 | break; | ||
1344 | default: | ||
1345 | BUG(); | ||
1346 | } | ||
1347 | return 0; | ||
1348 | } | ||
1349 | |||
1350 | static int | ||
1351 | blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) | ||
1352 | { | ||
1353 | struct blkio_cgroup *blkcg; | ||
1354 | enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); | ||
1355 | int name = BLKIOFILE_ATTR(cft->private); | ||
1356 | |||
1357 | blkcg = cgroup_to_blkio_cgroup(cgrp); | ||
1358 | |||
1359 | switch(plid) { | ||
1360 | case BLKIO_POLICY_PROP: | ||
1361 | switch(name) { | ||
1362 | case BLKIO_PROP_weight: | ||
1363 | return blkio_weight_write(blkcg, val); | ||
1364 | } | ||
1365 | break; | ||
1366 | default: | ||
1367 | BUG(); | ||
1368 | } | ||
825 | 1369 | ||
826 | return 0; | 1370 | return 0; |
827 | } | 1371 | } |
@@ -829,71 +1373,157 @@ static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft, | |||
829 | struct cftype blkio_files[] = { | 1373 | struct cftype blkio_files[] = { |
830 | { | 1374 | { |
831 | .name = "weight_device", | 1375 | .name = "weight_device", |
832 | .read_seq_string = blkiocg_weight_device_read, | 1376 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, |
833 | .write_string = blkiocg_weight_device_write, | 1377 | BLKIO_PROP_weight_device), |
1378 | .read_seq_string = blkiocg_file_read, | ||
1379 | .write_string = blkiocg_file_write, | ||
834 | .max_write_len = 256, | 1380 | .max_write_len = 256, |
835 | }, | 1381 | }, |
836 | { | 1382 | { |
837 | .name = "weight", | 1383 | .name = "weight", |
838 | .read_u64 = blkiocg_weight_read, | 1384 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, |
839 | .write_u64 = blkiocg_weight_write, | 1385 | BLKIO_PROP_weight), |
1386 | .read_u64 = blkiocg_file_read_u64, | ||
1387 | .write_u64 = blkiocg_file_write_u64, | ||
840 | }, | 1388 | }, |
841 | { | 1389 | { |
842 | .name = "time", | 1390 | .name = "time", |
843 | .read_map = blkiocg_time_read, | 1391 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, |
1392 | BLKIO_PROP_time), | ||
1393 | .read_map = blkiocg_file_read_map, | ||
844 | }, | 1394 | }, |
845 | { | 1395 | { |
846 | .name = "sectors", | 1396 | .name = "sectors", |
847 | .read_map = blkiocg_sectors_read, | 1397 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, |
1398 | BLKIO_PROP_sectors), | ||
1399 | .read_map = blkiocg_file_read_map, | ||
848 | }, | 1400 | }, |
849 | { | 1401 | { |
850 | .name = "io_service_bytes", | 1402 | .name = "io_service_bytes", |
851 | .read_map = blkiocg_io_service_bytes_read, | 1403 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, |
1404 | BLKIO_PROP_io_service_bytes), | ||
1405 | .read_map = blkiocg_file_read_map, | ||
852 | }, | 1406 | }, |
853 | { | 1407 | { |
854 | .name = "io_serviced", | 1408 | .name = "io_serviced", |
855 | .read_map = blkiocg_io_serviced_read, | 1409 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, |
1410 | BLKIO_PROP_io_serviced), | ||
1411 | .read_map = blkiocg_file_read_map, | ||
856 | }, | 1412 | }, |
857 | { | 1413 | { |
858 | .name = "io_service_time", | 1414 | .name = "io_service_time", |
859 | .read_map = blkiocg_io_service_time_read, | 1415 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, |
1416 | BLKIO_PROP_io_service_time), | ||
1417 | .read_map = blkiocg_file_read_map, | ||
860 | }, | 1418 | }, |
861 | { | 1419 | { |
862 | .name = "io_wait_time", | 1420 | .name = "io_wait_time", |
863 | .read_map = blkiocg_io_wait_time_read, | 1421 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, |
1422 | BLKIO_PROP_io_wait_time), | ||
1423 | .read_map = blkiocg_file_read_map, | ||
864 | }, | 1424 | }, |
865 | { | 1425 | { |
866 | .name = "io_merged", | 1426 | .name = "io_merged", |
867 | .read_map = blkiocg_io_merged_read, | 1427 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, |
1428 | BLKIO_PROP_io_merged), | ||
1429 | .read_map = blkiocg_file_read_map, | ||
868 | }, | 1430 | }, |
869 | { | 1431 | { |
870 | .name = "io_queued", | 1432 | .name = "io_queued", |
871 | .read_map = blkiocg_io_queued_read, | 1433 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, |
1434 | BLKIO_PROP_io_queued), | ||
1435 | .read_map = blkiocg_file_read_map, | ||
872 | }, | 1436 | }, |
873 | { | 1437 | { |
874 | .name = "reset_stats", | 1438 | .name = "reset_stats", |
875 | .write_u64 = blkiocg_reset_stats, | 1439 | .write_u64 = blkiocg_reset_stats, |
876 | }, | 1440 | }, |
1441 | #ifdef CONFIG_BLK_DEV_THROTTLING | ||
1442 | { | ||
1443 | .name = "throttle.read_bps_device", | ||
1444 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, | ||
1445 | BLKIO_THROTL_read_bps_device), | ||
1446 | .read_seq_string = blkiocg_file_read, | ||
1447 | .write_string = blkiocg_file_write, | ||
1448 | .max_write_len = 256, | ||
1449 | }, | ||
1450 | |||
1451 | { | ||
1452 | .name = "throttle.write_bps_device", | ||
1453 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, | ||
1454 | BLKIO_THROTL_write_bps_device), | ||
1455 | .read_seq_string = blkiocg_file_read, | ||
1456 | .write_string = blkiocg_file_write, | ||
1457 | .max_write_len = 256, | ||
1458 | }, | ||
1459 | |||
1460 | { | ||
1461 | .name = "throttle.read_iops_device", | ||
1462 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, | ||
1463 | BLKIO_THROTL_read_iops_device), | ||
1464 | .read_seq_string = blkiocg_file_read, | ||
1465 | .write_string = blkiocg_file_write, | ||
1466 | .max_write_len = 256, | ||
1467 | }, | ||
1468 | |||
1469 | { | ||
1470 | .name = "throttle.write_iops_device", | ||
1471 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, | ||
1472 | BLKIO_THROTL_write_iops_device), | ||
1473 | .read_seq_string = blkiocg_file_read, | ||
1474 | .write_string = blkiocg_file_write, | ||
1475 | .max_write_len = 256, | ||
1476 | }, | ||
1477 | { | ||
1478 | .name = "throttle.io_service_bytes", | ||
1479 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, | ||
1480 | BLKIO_THROTL_io_service_bytes), | ||
1481 | .read_map = blkiocg_file_read_map, | ||
1482 | }, | ||
1483 | { | ||
1484 | .name = "throttle.io_serviced", | ||
1485 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, | ||
1486 | BLKIO_THROTL_io_serviced), | ||
1487 | .read_map = blkiocg_file_read_map, | ||
1488 | }, | ||
1489 | #endif /* CONFIG_BLK_DEV_THROTTLING */ | ||
1490 | |||
877 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 1491 | #ifdef CONFIG_DEBUG_BLK_CGROUP |
878 | { | 1492 | { |
879 | .name = "avg_queue_size", | 1493 | .name = "avg_queue_size", |
880 | .read_map = blkiocg_avg_queue_size_read, | 1494 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, |
1495 | BLKIO_PROP_avg_queue_size), | ||
1496 | .read_map = blkiocg_file_read_map, | ||
881 | }, | 1497 | }, |
882 | { | 1498 | { |
883 | .name = "group_wait_time", | 1499 | .name = "group_wait_time", |
884 | .read_map = blkiocg_group_wait_time_read, | 1500 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, |
1501 | BLKIO_PROP_group_wait_time), | ||
1502 | .read_map = blkiocg_file_read_map, | ||
885 | }, | 1503 | }, |
886 | { | 1504 | { |
887 | .name = "idle_time", | 1505 | .name = "idle_time", |
888 | .read_map = blkiocg_idle_time_read, | 1506 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, |
1507 | BLKIO_PROP_idle_time), | ||
1508 | .read_map = blkiocg_file_read_map, | ||
889 | }, | 1509 | }, |
890 | { | 1510 | { |
891 | .name = "empty_time", | 1511 | .name = "empty_time", |
892 | .read_map = blkiocg_empty_time_read, | 1512 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, |
1513 | BLKIO_PROP_empty_time), | ||
1514 | .read_map = blkiocg_file_read_map, | ||
893 | }, | 1515 | }, |
894 | { | 1516 | { |
895 | .name = "dequeue", | 1517 | .name = "dequeue", |
896 | .read_map = blkiocg_dequeue_read, | 1518 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, |
1519 | BLKIO_PROP_dequeue), | ||
1520 | .read_map = blkiocg_file_read_map, | ||
1521 | }, | ||
1522 | { | ||
1523 | .name = "unaccounted_time", | ||
1524 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | ||
1525 | BLKIO_PROP_unaccounted_time), | ||
1526 | .read_map = blkiocg_file_read_map, | ||
897 | }, | 1527 | }, |
898 | #endif | 1528 | #endif |
899 | }; | 1529 | }; |
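The throttle.* entries registered above accept "major:minor value" strings of at most 256 bytes per write and list the installed rules on read, via blkiocg_file_read()/blkio_read_policy_node_files(). A hedged userspace sketch of installing a 1 MiB/s read limit; the cgroup mount point, group name and device numbers (8:16) are illustrative assumptions, not something this patch defines:

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            /* Hypothetical mount point, group and device; adjust to the setup. */
            const char *path =
                    "/sys/fs/cgroup/blkio/grp/blkio.throttle.read_bps_device";
            const char rule[] = "8:16 1048576\n";   /* 1 MiB/s on device 8:16 */
            int fd = open(path, O_WRONLY);

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            if (write(fd, rule, strlen(rule)) < 0)
                    perror("write");
            close(fd);
            return 0;
    }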
@@ -932,13 +1562,14 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) | |||
932 | /* | 1562 | /* |
933 | * This blkio_group is being unlinked as associated cgroup is | 1563 | * This blkio_group is being unlinked as associated cgroup is |
934 | * going away. Let all the IO controlling policies know about | 1564 | * going away. Let all the IO controlling policies know about |
935 | * this event. Currently this is static call to one io | 1565 | * this event. |
936 | * controlling policy. Once we have more policies in place, we | ||
937 | * need some dynamic registration of callback function. | ||
938 | */ | 1566 | */ |
939 | spin_lock(&blkio_list_lock); | 1567 | spin_lock(&blkio_list_lock); |
940 | list_for_each_entry(blkiop, &blkio_list, list) | 1568 | list_for_each_entry(blkiop, &blkio_list, list) { |
1569 | if (blkiop->plid != blkg->plid) | ||
1570 | continue; | ||
941 | blkiop->ops.blkio_unlink_group_fn(key, blkg); | 1571 | blkiop->ops.blkio_unlink_group_fn(key, blkg); |
1572 | } | ||
942 | spin_unlock(&blkio_list_lock); | 1573 | spin_unlock(&blkio_list_lock); |
943 | } while (1); | 1574 | } while (1); |
944 | 1575 | ||
@@ -964,10 +1595,6 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup) | |||
964 | goto done; | 1595 | goto done; |
965 | } | 1596 | } |
966 | 1597 | ||
967 | /* Currently we do not support hierarchy deeper than two level (0,1) */ | ||
968 | if (parent != cgroup->top_cgroup) | ||
969 | return ERR_PTR(-EPERM); | ||
970 | |||
971 | blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); | 1598 | blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); |
972 | if (!blkcg) | 1599 | if (!blkcg) |
973 | return ERR_PTR(-ENOMEM); | 1600 | return ERR_PTR(-ENOMEM); |
@@ -987,9 +1614,7 @@ done: | |||
987 | * of the main cic data structures. For now we allow a task to change | 1614 | * of the main cic data structures. For now we allow a task to change |
988 | * its cgroup only if it's the only owner of its ioc. | 1615 | * its cgroup only if it's the only owner of its ioc. |
989 | */ | 1616 | */ |
990 | static int blkiocg_can_attach(struct cgroup_subsys *subsys, | 1617 | static int blkiocg_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) |
991 | struct cgroup *cgroup, struct task_struct *tsk, | ||
992 | bool threadgroup) | ||
993 | { | 1618 | { |
994 | struct io_context *ioc; | 1619 | struct io_context *ioc; |
995 | int ret = 0; | 1620 | int ret = 0; |
@@ -1004,9 +1629,7 @@ static int blkiocg_can_attach(struct cgroup_subsys *subsys, | |||
1004 | return ret; | 1629 | return ret; |
1005 | } | 1630 | } |
1006 | 1631 | ||
1007 | static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup, | 1632 | static void blkiocg_attach_task(struct cgroup *cgrp, struct task_struct *tsk) |
1008 | struct cgroup *prev, struct task_struct *tsk, | ||
1009 | bool threadgroup) | ||
1010 | { | 1633 | { |
1011 | struct io_context *ioc; | 1634 | struct io_context *ioc; |
1012 | 1635 | ||
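The last two hunks of this file track a cgroup core change in the same merge that splits the old can_attach/attach callbacks into per-task variants. A hedged sketch of how the new hooks slot into the subsystem descriptor inside blk-cgroup.c; the real blkio_subsys carries more fields than shown, so treat the initializer as illustrative:

    #include <linux/cgroup.h>

    /* Sketch only: wiring the per-task hooks converted above. */
    struct cgroup_subsys example_blkio_subsys = {
            .name            = "blkio",
            .create          = blkiocg_create,
            .destroy         = blkiocg_destroy,
            .can_attach_task = blkiocg_can_attach_task,
            .attach_task     = blkiocg_attach_task,
            .subsys_id       = blkio_subsys_id,
    };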
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 2b866ec1dcea..a71d2904ffb9 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h | |||
@@ -14,6 +14,15 @@ | |||
14 | */ | 14 | */ |
15 | 15 | ||
16 | #include <linux/cgroup.h> | 16 | #include <linux/cgroup.h> |
17 | #include <linux/u64_stats_sync.h> | ||
18 | |||
19 | enum blkio_policy_id { | ||
20 | BLKIO_POLICY_PROP = 0, /* Proportional Bandwidth division */ | ||
21 | BLKIO_POLICY_THROTL, /* Throttling */ | ||
22 | }; | ||
23 | |||
24 | /* Max limits for throttle policy */ | ||
25 | #define THROTL_IOPS_MAX UINT_MAX | ||
17 | 26 | ||
18 | #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) | 27 | #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) |
19 | 28 | ||
@@ -28,20 +37,15 @@ enum stat_type { | |||
28 | * request completion for IOs doen by this cgroup. This may not be | 37 | * request completion for IOs doen by this cgroup. This may not be |
29 | * accurate when NCQ is turned on. */ | 38 | * accurate when NCQ is turned on. */ |
30 | BLKIO_STAT_SERVICE_TIME = 0, | 39 | BLKIO_STAT_SERVICE_TIME = 0, |
31 | /* Total bytes transferred */ | ||
32 | BLKIO_STAT_SERVICE_BYTES, | ||
33 | /* Total IOs serviced, post merge */ | ||
34 | BLKIO_STAT_SERVICED, | ||
35 | /* Total time spent waiting in scheduler queue in ns */ | 40 | /* Total time spent waiting in scheduler queue in ns */ |
36 | BLKIO_STAT_WAIT_TIME, | 41 | BLKIO_STAT_WAIT_TIME, |
37 | /* Number of IOs merged */ | ||
38 | BLKIO_STAT_MERGED, | ||
39 | /* Number of IOs queued up */ | 42 | /* Number of IOs queued up */ |
40 | BLKIO_STAT_QUEUED, | 43 | BLKIO_STAT_QUEUED, |
41 | /* All the single valued stats go below this */ | 44 | /* All the single valued stats go below this */ |
42 | BLKIO_STAT_TIME, | 45 | BLKIO_STAT_TIME, |
43 | BLKIO_STAT_SECTORS, | ||
44 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 46 | #ifdef CONFIG_DEBUG_BLK_CGROUP |
47 | /* Time not charged to this cgroup */ | ||
48 | BLKIO_STAT_UNACCOUNTED_TIME, | ||
45 | BLKIO_STAT_AVG_QUEUE_SIZE, | 49 | BLKIO_STAT_AVG_QUEUE_SIZE, |
46 | BLKIO_STAT_IDLE_TIME, | 50 | BLKIO_STAT_IDLE_TIME, |
47 | BLKIO_STAT_EMPTY_TIME, | 51 | BLKIO_STAT_EMPTY_TIME, |
@@ -50,6 +54,18 @@ enum stat_type { | |||
50 | #endif | 54 | #endif |
51 | }; | 55 | }; |
52 | 56 | ||
57 | /* Per cpu stats */ | ||
58 | enum stat_type_cpu { | ||
59 | BLKIO_STAT_CPU_SECTORS, | ||
60 | /* Total bytes transferred */ | ||
61 | BLKIO_STAT_CPU_SERVICE_BYTES, | ||
62 | /* Total IOs serviced, post merge */ | ||
63 | BLKIO_STAT_CPU_SERVICED, | ||
64 | /* Number of IOs merged */ | ||
65 | BLKIO_STAT_CPU_MERGED, | ||
66 | BLKIO_STAT_CPU_NR | ||
67 | }; | ||
68 | |||
53 | enum stat_sub_type { | 69 | enum stat_sub_type { |
54 | BLKIO_STAT_READ = 0, | 70 | BLKIO_STAT_READ = 0, |
55 | BLKIO_STAT_WRITE, | 71 | BLKIO_STAT_WRITE, |
@@ -65,6 +81,36 @@ enum blkg_state_flags { | |||
65 | BLKG_empty, | 81 | BLKG_empty, |
66 | }; | 82 | }; |
67 | 83 | ||
84 | /* cgroup files owned by proportional weight policy */ | ||
85 | enum blkcg_file_name_prop { | ||
86 | BLKIO_PROP_weight = 1, | ||
87 | BLKIO_PROP_weight_device, | ||
88 | BLKIO_PROP_io_service_bytes, | ||
89 | BLKIO_PROP_io_serviced, | ||
90 | BLKIO_PROP_time, | ||
91 | BLKIO_PROP_sectors, | ||
92 | BLKIO_PROP_unaccounted_time, | ||
93 | BLKIO_PROP_io_service_time, | ||
94 | BLKIO_PROP_io_wait_time, | ||
95 | BLKIO_PROP_io_merged, | ||
96 | BLKIO_PROP_io_queued, | ||
97 | BLKIO_PROP_avg_queue_size, | ||
98 | BLKIO_PROP_group_wait_time, | ||
99 | BLKIO_PROP_idle_time, | ||
100 | BLKIO_PROP_empty_time, | ||
101 | BLKIO_PROP_dequeue, | ||
102 | }; | ||
103 | |||
104 | /* cgroup files owned by throttle policy */ | ||
105 | enum blkcg_file_name_throtl { | ||
106 | BLKIO_THROTL_read_bps_device, | ||
107 | BLKIO_THROTL_write_bps_device, | ||
108 | BLKIO_THROTL_read_iops_device, | ||
109 | BLKIO_THROTL_write_iops_device, | ||
110 | BLKIO_THROTL_io_service_bytes, | ||
111 | BLKIO_THROTL_io_serviced, | ||
112 | }; | ||
113 | |||
68 | struct blkio_cgroup { | 114 | struct blkio_cgroup { |
69 | struct cgroup_subsys_state css; | 115 | struct cgroup_subsys_state css; |
70 | unsigned int weight; | 116 | unsigned int weight; |
@@ -76,9 +122,11 @@ struct blkio_cgroup { | |||
76 | struct blkio_group_stats { | 122 | struct blkio_group_stats { |
77 | /* total disk time and nr sectors dispatched by this group */ | 123 | /* total disk time and nr sectors dispatched by this group */ |
78 | uint64_t time; | 124 | uint64_t time; |
79 | uint64_t sectors; | ||
80 | uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL]; | 125 | uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL]; |
81 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 126 | #ifdef CONFIG_DEBUG_BLK_CGROUP |
127 | /* Time not charged to this cgroup */ | ||
128 | uint64_t unaccounted_time; | ||
129 | |||
82 | /* Sum of number of IOs queued across all samples */ | 130 | /* Sum of number of IOs queued across all samples */ |
83 | uint64_t avg_queue_size_sum; | 131 | uint64_t avg_queue_size_sum; |
84 | /* Count of samples taken for average */ | 132 | /* Count of samples taken for average */ |
@@ -103,6 +151,13 @@ struct blkio_group_stats { | |||
103 | #endif | 151 | #endif |
104 | }; | 152 | }; |
105 | 153 | ||
154 | /* Per cpu blkio group stats */ | ||
155 | struct blkio_group_stats_cpu { | ||
156 | uint64_t sectors; | ||
157 | uint64_t stat_arr_cpu[BLKIO_STAT_CPU_NR][BLKIO_STAT_TOTAL]; | ||
158 | struct u64_stats_sync syncp; | ||
159 | }; | ||
160 | |||
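Moving the dispatch counters into blkio_group_stats_cpu makes the submission-path updates lock-free, so readers must instead sum across CPUs inside the u64_stats_sync fetch loop. A hedged kernel-side sketch of that read pattern; it mirrors what the blkio_get_stat_cpu() path does without being a verbatim copy of it:

    #include <linux/cpumask.h>
    #include <linux/percpu.h>
    #include <linux/u64_stats_sync.h>
    #include "blk-cgroup.h"

    /* Sum the per-cpu sector count of a group, tolerating concurrent updates. */
    static uint64_t example_blkg_sectors(struct blkio_group *blkg)
    {
            uint64_t total = 0;
            int cpu;

            for_each_possible_cpu(cpu) {
                    struct blkio_group_stats_cpu *sc =
                            per_cpu_ptr(blkg->stats_cpu, cpu);
                    unsigned int start;
                    uint64_t sectors;

                    do {
                            start = u64_stats_fetch_begin(&sc->syncp);
                            sectors = sc->sectors;
                    } while (u64_stats_fetch_retry(&sc->syncp, start));

                    total += sectors;
            }
            return total;
    }

On 32-bit SMP the syncp sequence counter is what keeps the 64-bit counters from being read torn; on 64-bit builds it compiles down to nothing.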
106 | struct blkio_group { | 161 | struct blkio_group { |
107 | /* An rcu protected unique identifier for the group */ | 162 | /* An rcu protected unique identifier for the group */ |
108 | void *key; | 163 | void *key; |
@@ -112,33 +167,73 @@ struct blkio_group { | |||
112 | char path[128]; | 167 | char path[128]; |
113 | /* The device MKDEV(major, minor), this group has been created for */ | 168 | /* The device MKDEV(major, minor), this group has been created for */ |
114 | dev_t dev; | 169 | dev_t dev; |
170 | /* policy which owns this blk group */ | ||
171 | enum blkio_policy_id plid; | ||
115 | 172 | ||
116 | /* Need to serialize the stats in the case of reset/update */ | 173 | /* Need to serialize the stats in the case of reset/update */ |
117 | spinlock_t stats_lock; | 174 | spinlock_t stats_lock; |
118 | struct blkio_group_stats stats; | 175 | struct blkio_group_stats stats; |
176 | /* Per cpu stats pointer */ | ||
177 | struct blkio_group_stats_cpu __percpu *stats_cpu; | ||
119 | }; | 178 | }; |
120 | 179 | ||
121 | struct blkio_policy_node { | 180 | struct blkio_policy_node { |
122 | struct list_head node; | 181 | struct list_head node; |
123 | dev_t dev; | 182 | dev_t dev; |
124 | unsigned int weight; | 183 | /* This node belongs to max bw policy or proportional weight policy */
184 | enum blkio_policy_id plid; | ||
185 | /* cgroup file to which this rule belongs to */ | ||
186 | int fileid; | ||
187 | |||
188 | union { | ||
189 | unsigned int weight; | ||
190 | /* | ||
191 | * Rate read/write in terms of bytes per second | ||
192 | * Whether this rate represents read or write is determined | ||
193 | * by file type "fileid". | ||
194 | */ | ||
195 | u64 bps; | ||
196 | unsigned int iops; | ||
197 | } val; | ||
125 | }; | 198 | }; |
126 | 199 | ||
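Every rule written through the cgroup files is kept as one blkio_policy_node, keyed by device, owning policy and originating file, with the union member selected by that file id. A small illustrative sketch of how a throttle read-bandwidth rule for a hypothetical device 8:16 would populate such a node:

    #include <linux/kdev_t.h>
    #include "blk-cgroup.h"

    /* Illustrative only: a 1 MiB/s read limit for device 8:16. */
    static void example_fill_rule(struct blkio_policy_node *pn)
    {
            pn->dev = MKDEV(8, 16);
            pn->plid = BLKIO_POLICY_THROTL;
            pn->fileid = BLKIO_THROTL_read_bps_device;
            pn->val.bps = 1048576;  /* read as bps because of the fileid */
    }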
127 | extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, | 200 | extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, |
128 | dev_t dev); | 201 | dev_t dev); |
202 | extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, | ||
203 | dev_t dev); | ||
204 | extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, | ||
205 | dev_t dev); | ||
206 | extern unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, | ||
207 | dev_t dev); | ||
208 | extern unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, | ||
209 | dev_t dev); | ||
129 | 210 | ||
130 | typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); | 211 | typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); |
131 | typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg, | 212 | |
132 | unsigned int weight); | 213 | typedef void (blkio_update_group_weight_fn) (void *key, |
214 | struct blkio_group *blkg, unsigned int weight); | ||
215 | typedef void (blkio_update_group_read_bps_fn) (void * key, | ||
216 | struct blkio_group *blkg, u64 read_bps); | ||
217 | typedef void (blkio_update_group_write_bps_fn) (void *key, | ||
218 | struct blkio_group *blkg, u64 write_bps); | ||
219 | typedef void (blkio_update_group_read_iops_fn) (void *key, | ||
220 | struct blkio_group *blkg, unsigned int read_iops); | ||
221 | typedef void (blkio_update_group_write_iops_fn) (void *key, | ||
222 | struct blkio_group *blkg, unsigned int write_iops); | ||
133 | 223 | ||
134 | struct blkio_policy_ops { | 224 | struct blkio_policy_ops { |
135 | blkio_unlink_group_fn *blkio_unlink_group_fn; | 225 | blkio_unlink_group_fn *blkio_unlink_group_fn; |
136 | blkio_update_group_weight_fn *blkio_update_group_weight_fn; | 226 | blkio_update_group_weight_fn *blkio_update_group_weight_fn; |
227 | blkio_update_group_read_bps_fn *blkio_update_group_read_bps_fn; | ||
228 | blkio_update_group_write_bps_fn *blkio_update_group_write_bps_fn; | ||
229 | blkio_update_group_read_iops_fn *blkio_update_group_read_iops_fn; | ||
230 | blkio_update_group_write_iops_fn *blkio_update_group_write_iops_fn; | ||
137 | }; | 231 | }; |
138 | 232 | ||
139 | struct blkio_policy_type { | 233 | struct blkio_policy_type { |
140 | struct list_head list; | 234 | struct list_head list; |
141 | struct blkio_policy_ops ops; | 235 | struct blkio_policy_ops ops; |
236 | enum blkio_policy_id plid; | ||
142 | }; | 237 | }; |
143 | 238 | ||
144 | /* Blkio controller policy registration */ | 239 | /* Blkio controller policy registration */ |
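A policy hooks into the controller by filling in a blkio_policy_type with its plid and the callbacks it cares about and registering it; the controller then routes group unlinks and rule updates only to policies whose plid matches, as the blkiocg_destroy() change above now does. A hedged sketch of what a throttling-style registration might look like, with placeholder callback bodies rather than the real blk-throttle ones:

    #include <linux/init.h>
    #include <linux/module.h>
    #include "blk-cgroup.h"

    static void example_unlink_group(void *key, struct blkio_group *blkg)
    {
            /* Drop the policy's reference to blkg; key identifies its queue. */
    }

    static void example_update_read_bps(void *key, struct blkio_group *blkg,
                                        u64 read_bps)
    {
            /* Apply the new per-device read limit to blkg. */
    }

    static struct blkio_policy_type example_policy = {
            .ops = {
                    .blkio_unlink_group_fn          = example_unlink_group,
                    .blkio_update_group_read_bps_fn = example_update_read_bps,
            },
            .plid = BLKIO_POLICY_THROTL,
    };

    static int __init example_policy_init(void)
    {
            blkio_policy_register(&example_policy);
            return 0;
    }
    module_init(example_policy_init);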
@@ -165,7 +260,7 @@ static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } | |||
165 | 260 | ||
166 | #endif | 261 | #endif |
167 | 262 | ||
168 | #define BLKIO_WEIGHT_MIN 100 | 263 | #define BLKIO_WEIGHT_MIN 10 |
169 | #define BLKIO_WEIGHT_MAX 1000 | 264 | #define BLKIO_WEIGHT_MAX 1000 |
170 | #define BLKIO_WEIGHT_DEFAULT 500 | 265 | #define BLKIO_WEIGHT_DEFAULT 500 |
171 | 266 | ||
@@ -211,13 +306,17 @@ static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {} | |||
211 | #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) | 306 | #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) |
212 | extern struct blkio_cgroup blkio_root_cgroup; | 307 | extern struct blkio_cgroup blkio_root_cgroup; |
213 | extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); | 308 | extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); |
309 | extern struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk); | ||
214 | extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, | 310 | extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, |
215 | struct blkio_group *blkg, void *key, dev_t dev); | 311 | struct blkio_group *blkg, void *key, dev_t dev, |
312 | enum blkio_policy_id plid); | ||
313 | extern int blkio_alloc_blkg_stats(struct blkio_group *blkg); | ||
216 | extern int blkiocg_del_blkio_group(struct blkio_group *blkg); | 314 | extern int blkiocg_del_blkio_group(struct blkio_group *blkg); |
217 | extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, | 315 | extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, |
218 | void *key); | 316 | void *key); |
219 | void blkiocg_update_timeslice_used(struct blkio_group *blkg, | 317 | void blkiocg_update_timeslice_used(struct blkio_group *blkg, |
220 | unsigned long time); | 318 | unsigned long time, |
319 | unsigned long unaccounted_time); | ||
221 | void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes, | 320 | void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes, |
222 | bool direction, bool sync); | 321 | bool direction, bool sync); |
223 | void blkiocg_update_completion_stats(struct blkio_group *blkg, | 322 | void blkiocg_update_completion_stats(struct blkio_group *blkg, |
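With the plid argument added, a policy identifies itself when it links a group into a cgroup, and the per-cpu statistics are now allocated separately through blkio_alloc_blkg_stats(). A hedged sketch of the create-side sequence a policy would follow; error handling is trimmed and the real CFQ and blk-throttle callers differ in detail:

    #include <linux/errno.h>
    #include "blk-cgroup.h"

    /*
     * Hypothetical helper: link a freshly set-up blkg for the proportional
     * weight policy.  "key" is whatever the policy later passes to
     * blkiocg_lookup_group(), typically its request queue.
     */
    static int example_register_group(struct blkio_cgroup *blkcg,
                                      struct blkio_group *blkg,
                                      void *key, dev_t dev)
    {
            if (blkio_alloc_blkg_stats(blkg))       /* sets up blkg->stats_cpu */
                    return -ENOMEM;

            blkiocg_add_blkio_group(blkcg, blkg, key, dev, BLKIO_POLICY_PROP);
            return 0;
    }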
@@ -232,9 +331,14 @@ void blkiocg_update_io_remove_stats(struct blkio_group *blkg, | |||
232 | struct cgroup; | 331 | struct cgroup; |
233 | static inline struct blkio_cgroup * | 332 | static inline struct blkio_cgroup * |
234 | cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } | 333 | cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } |
334 | static inline struct blkio_cgroup * | ||
335 | task_blkio_cgroup(struct task_struct *tsk) { return NULL; } | ||
235 | 336 | ||
236 | static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, | 337 | static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, |
237 | struct blkio_group *blkg, void *key, dev_t dev) {} | 338 | struct blkio_group *blkg, void *key, dev_t dev, |
339 | enum blkio_policy_id plid) {} | ||
340 | |||
341 | static inline int blkio_alloc_blkg_stats(struct blkio_group *blkg) { return 0; } | ||
238 | 342 | ||
239 | static inline int | 343 | static inline int |
240 | blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } | 344 | blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } |
@@ -242,7 +346,9 @@ blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } | |||
242 | static inline struct blkio_group * | 346 | static inline struct blkio_group * |
243 | blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } | 347 | blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } |
244 | static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg, | 348 | static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg, |
245 | unsigned long time) {} | 349 | unsigned long time, |
350 | unsigned long unaccounted_time) | ||
351 | {} | ||
246 | static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg, | 352 | static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg, |
247 | uint64_t bytes, bool direction, bool sync) {} | 353 | uint64_t bytes, bool direction, bool sync) {} |
248 | static inline void blkiocg_update_completion_stats(struct blkio_group *blkg, | 354 | static inline void blkiocg_update_completion_stats(struct blkio_group *blkg, |
diff --git a/block/blk-core.c b/block/blk-core.c index 32a1c123dfb3..d2f8f4049abd 100644 --- a/block/blk-core.c +++ b/block/blk-core.c | |||
@@ -27,13 +27,14 @@ | |||
27 | #include <linux/writeback.h> | 27 | #include <linux/writeback.h> |
28 | #include <linux/task_io_accounting_ops.h> | 28 | #include <linux/task_io_accounting_ops.h> |
29 | #include <linux/fault-inject.h> | 29 | #include <linux/fault-inject.h> |
30 | #include <linux/list_sort.h> | ||
30 | 31 | ||
31 | #define CREATE_TRACE_POINTS | 32 | #define CREATE_TRACE_POINTS |
32 | #include <trace/events/block.h> | 33 | #include <trace/events/block.h> |
33 | 34 | ||
34 | #include "blk.h" | 35 | #include "blk.h" |
35 | 36 | ||
36 | EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap); | 37 | EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); |
37 | EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); | 38 | EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); |
38 | EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); | 39 | EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); |
39 | 40 | ||
@@ -64,13 +65,27 @@ static void drive_stat_acct(struct request *rq, int new_io) | |||
64 | return; | 65 | return; |
65 | 66 | ||
66 | cpu = part_stat_lock(); | 67 | cpu = part_stat_lock(); |
67 | part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); | ||
68 | 68 | ||
69 | if (!new_io) | 69 | if (!new_io) { |
70 | part = rq->part; | ||
70 | part_stat_inc(cpu, part, merges[rw]); | 71 | part_stat_inc(cpu, part, merges[rw]); |
71 | else { | 72 | } else { |
73 | part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); | ||
74 | if (!hd_struct_try_get(part)) { | ||
75 | /* | ||
76 | * The partition is already being removed, | ||
77 | * the request will be accounted on the disk only | ||
78 | * | ||
79 | * We take a reference on disk->part0 although that | ||
80 | * partition will never be deleted, so we can treat | ||
81 | * it as any other partition. | ||
82 | */ | ||
83 | part = &rq->rq_disk->part0; | ||
84 | hd_struct_get(part); | ||
85 | } | ||
72 | part_round_stats(cpu, part); | 86 | part_round_stats(cpu, part); |
73 | part_inc_in_flight(part, rw); | 87 | part_inc_in_flight(part, rw); |
88 | rq->part = part; | ||
74 | } | 89 | } |
75 | 90 | ||
76 | part_stat_unlock(); | 91 | part_stat_unlock(); |
@@ -128,46 +143,36 @@ void blk_rq_init(struct request_queue *q, struct request *rq) | |||
128 | rq->ref_count = 1; | 143 | rq->ref_count = 1; |
129 | rq->start_time = jiffies; | 144 | rq->start_time = jiffies; |
130 | set_start_time_ns(rq); | 145 | set_start_time_ns(rq); |
146 | rq->part = NULL; | ||
131 | } | 147 | } |
132 | EXPORT_SYMBOL(blk_rq_init); | 148 | EXPORT_SYMBOL(blk_rq_init); |
133 | 149 | ||
134 | static void req_bio_endio(struct request *rq, struct bio *bio, | 150 | static void req_bio_endio(struct request *rq, struct bio *bio, |
135 | unsigned int nbytes, int error) | 151 | unsigned int nbytes, int error) |
136 | { | 152 | { |
137 | struct request_queue *q = rq->q; | 153 | if (error) |
138 | 154 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | |
139 | if (&q->bar_rq != rq) { | 155 | else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) |
140 | if (error) | 156 | error = -EIO; |
141 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | ||
142 | else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) | ||
143 | error = -EIO; | ||
144 | |||
145 | if (unlikely(nbytes > bio->bi_size)) { | ||
146 | printk(KERN_ERR "%s: want %u bytes done, %u left\n", | ||
147 | __func__, nbytes, bio->bi_size); | ||
148 | nbytes = bio->bi_size; | ||
149 | } | ||
150 | 157 | ||
151 | if (unlikely(rq->cmd_flags & REQ_QUIET)) | 158 | if (unlikely(nbytes > bio->bi_size)) { |
152 | set_bit(BIO_QUIET, &bio->bi_flags); | 159 | printk(KERN_ERR "%s: want %u bytes done, %u left\n", |
160 | __func__, nbytes, bio->bi_size); | ||
161 | nbytes = bio->bi_size; | ||
162 | } | ||
153 | 163 | ||
154 | bio->bi_size -= nbytes; | 164 | if (unlikely(rq->cmd_flags & REQ_QUIET)) |
155 | bio->bi_sector += (nbytes >> 9); | 165 | set_bit(BIO_QUIET, &bio->bi_flags); |
156 | 166 | ||
157 | if (bio_integrity(bio)) | 167 | bio->bi_size -= nbytes; |
158 | bio_integrity_advance(bio, nbytes); | 168 | bio->bi_sector += (nbytes >> 9); |
159 | 169 | ||
160 | if (bio->bi_size == 0) | 170 | if (bio_integrity(bio)) |
161 | bio_endio(bio, error); | 171 | bio_integrity_advance(bio, nbytes); |
162 | } else { | ||
163 | 172 | ||
164 | /* | 173 | /* don't actually finish bio if it's part of flush sequence */ |
165 | * Okay, this is the barrier request in progress, just | 174 | if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ)) |
166 | * record the error; | 175 | bio_endio(bio, error); |
167 | */ | ||
168 | if (error && !q->orderr) | ||
169 | q->orderr = error; | ||
170 | } | ||
171 | } | 176 | } |
172 | 177 | ||
173 | void blk_dump_rq_flags(struct request *rq, char *msg) | 178 | void blk_dump_rq_flags(struct request *rq, char *msg) |
@@ -193,136 +198,32 @@ void blk_dump_rq_flags(struct request *rq, char *msg) | |||
193 | } | 198 | } |
194 | EXPORT_SYMBOL(blk_dump_rq_flags); | 199 | EXPORT_SYMBOL(blk_dump_rq_flags); |
195 | 200 | ||
196 | /* | 201 | static void blk_delay_work(struct work_struct *work) |
197 | * "plug" the device if there are no outstanding requests: this will | ||
198 | * force the transfer to start only after we have put all the requests | ||
199 | * on the list. | ||
200 | * | ||
201 | * This is called with interrupts off and no requests on the queue and | ||
202 | * with the queue lock held. | ||
203 | */ | ||
204 | void blk_plug_device(struct request_queue *q) | ||
205 | { | 202 | { |
206 | WARN_ON(!irqs_disabled()); | 203 | struct request_queue *q; |
207 | |||
208 | /* | ||
209 | * don't plug a stopped queue, it must be paired with blk_start_queue() | ||
210 | * which will restart the queueing | ||
211 | */ | ||
212 | if (blk_queue_stopped(q)) | ||
213 | return; | ||
214 | 204 | ||
215 | if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q)) { | 205 | q = container_of(work, struct request_queue, delay_work.work); |
216 | mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); | 206 | spin_lock_irq(q->queue_lock); |
217 | trace_block_plug(q); | 207 | __blk_run_queue(q); |
218 | } | 208 | spin_unlock_irq(q->queue_lock); |
219 | } | 209 | } |
220 | EXPORT_SYMBOL(blk_plug_device); | ||
221 | 210 | ||
222 | /** | 211 | /** |
223 | * blk_plug_device_unlocked - plug a device without queue lock held | 212 | * blk_delay_queue - restart queueing after defined interval |
224 | * @q: The &struct request_queue to plug | 213 | * @q: The &struct request_queue in question |
214 | * @msecs: Delay in msecs | ||
225 | * | 215 | * |
226 | * Description: | 216 | * Description: |
227 | * Like @blk_plug_device(), but grabs the queue lock and disables | 217 | * Sometimes queueing needs to be postponed for a little while, to allow |
228 | * interrupts. | 218 | * resources to come back. This function will make sure that queueing is |
229 | **/ | 219 | * restarted around the specified time. |
230 | void blk_plug_device_unlocked(struct request_queue *q) | ||
231 | { | ||
232 | unsigned long flags; | ||
233 | |||
234 | spin_lock_irqsave(q->queue_lock, flags); | ||
235 | blk_plug_device(q); | ||
236 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
237 | } | ||
238 | EXPORT_SYMBOL(blk_plug_device_unlocked); | ||
239 | |||
240 | /* | ||
241 | * remove the queue from the plugged list, if present. called with | ||
242 | * queue lock held and interrupts disabled. | ||
243 | */ | ||
244 | int blk_remove_plug(struct request_queue *q) | ||
245 | { | ||
246 | WARN_ON(!irqs_disabled()); | ||
247 | |||
248 | if (!queue_flag_test_and_clear(QUEUE_FLAG_PLUGGED, q)) | ||
249 | return 0; | ||
250 | |||
251 | del_timer(&q->unplug_timer); | ||
252 | return 1; | ||
253 | } | ||
254 | EXPORT_SYMBOL(blk_remove_plug); | ||
255 | |||
256 | /* | ||
257 | * remove the plug and let it rip.. | ||
258 | */ | 220 | */ |
259 | void __generic_unplug_device(struct request_queue *q) | 221 | void blk_delay_queue(struct request_queue *q, unsigned long msecs) |
260 | { | 222 | { |
261 | if (unlikely(blk_queue_stopped(q))) | 223 | queue_delayed_work(kblockd_workqueue, &q->delay_work, |
262 | return; | 224 | msecs_to_jiffies(msecs)); |
263 | if (!blk_remove_plug(q) && !blk_queue_nonrot(q)) | ||
264 | return; | ||
265 | |||
266 | q->request_fn(q); | ||
267 | } | 225 | } |
268 | 226 | EXPORT_SYMBOL(blk_delay_queue); | |
269 | /** | ||
270 | * generic_unplug_device - fire a request queue | ||
271 | * @q: The &struct request_queue in question | ||
272 | * | ||
273 | * Description: | ||
274 | * Linux uses plugging to build bigger requests queues before letting | ||
275 | * the device have at them. If a queue is plugged, the I/O scheduler | ||
276 | * is still adding and merging requests on the queue. Once the queue | ||
277 | * gets unplugged, the request_fn defined for the queue is invoked and | ||
278 | * transfers started. | ||
279 | **/ | ||
280 | void generic_unplug_device(struct request_queue *q) | ||
281 | { | ||
282 | if (blk_queue_plugged(q)) { | ||
283 | spin_lock_irq(q->queue_lock); | ||
284 | __generic_unplug_device(q); | ||
285 | spin_unlock_irq(q->queue_lock); | ||
286 | } | ||
287 | } | ||
288 | EXPORT_SYMBOL(generic_unplug_device); | ||
289 | |||
290 | static void blk_backing_dev_unplug(struct backing_dev_info *bdi, | ||
291 | struct page *page) | ||
292 | { | ||
293 | struct request_queue *q = bdi->unplug_io_data; | ||
294 | |||
295 | blk_unplug(q); | ||
296 | } | ||
297 | |||
298 | void blk_unplug_work(struct work_struct *work) | ||
299 | { | ||
300 | struct request_queue *q = | ||
301 | container_of(work, struct request_queue, unplug_work); | ||
302 | |||
303 | trace_block_unplug_io(q); | ||
304 | q->unplug_fn(q); | ||
305 | } | ||
306 | |||
307 | void blk_unplug_timeout(unsigned long data) | ||
308 | { | ||
309 | struct request_queue *q = (struct request_queue *)data; | ||
310 | |||
311 | trace_block_unplug_timer(q); | ||
312 | kblockd_schedule_work(q, &q->unplug_work); | ||
313 | } | ||
314 | |||
315 | void blk_unplug(struct request_queue *q) | ||
316 | { | ||
317 | /* | ||
318 | * devices don't necessarily have an ->unplug_fn defined | ||
319 | */ | ||
320 | if (q->unplug_fn) { | ||
321 | trace_block_unplug_io(q); | ||
322 | q->unplug_fn(q); | ||
323 | } | ||
324 | } | ||
325 | EXPORT_SYMBOL(blk_unplug); | ||
326 | 227 | ||
327 | /** | 228 | /** |
328 | * blk_start_queue - restart a previously stopped queue | 229 | * blk_start_queue - restart a previously stopped queue |
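With the per-queue plug state and unplug timer gone, blk_delay_queue() is what a driver uses to back off when it temporarily cannot make progress and have queueing retried later. A hedged sketch of a request_fn relying on it; the busy test and the 3 ms delay are illustrative, not taken from any real driver:

    #include <linux/blkdev.h>

    static bool example_hw_busy(struct request_queue *q)
    {
            /* Placeholder for a real "controller out of tags/credits" test. */
            return false;
    }

    /* Called with q->queue_lock held, as ->request_fn always is. */
    static void example_request_fn(struct request_queue *q)
    {
            struct request *rq;

            while ((rq = blk_fetch_request(q)) != NULL) {
                    if (example_hw_busy(q)) {
                            /* Put it back and ask for a retry in ~3 ms. */
                            blk_requeue_request(q, rq);
                            blk_delay_queue(q, 3);
                            return;
                    }
                    /* ... hand rq to the hardware; completed inline here ... */
                    __blk_end_request_all(rq, 0);
            }
    }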
@@ -358,7 +259,7 @@ EXPORT_SYMBOL(blk_start_queue); | |||
358 | **/ | 259 | **/ |
359 | void blk_stop_queue(struct request_queue *q) | 260 | void blk_stop_queue(struct request_queue *q) |
360 | { | 261 | { |
361 | blk_remove_plug(q); | 262 | __cancel_delayed_work(&q->delay_work); |
362 | queue_flag_set(QUEUE_FLAG_STOPPED, q); | 263 | queue_flag_set(QUEUE_FLAG_STOPPED, q); |
363 | } | 264 | } |
364 | EXPORT_SYMBOL(blk_stop_queue); | 265 | EXPORT_SYMBOL(blk_stop_queue); |
@@ -376,12 +277,15 @@ EXPORT_SYMBOL(blk_stop_queue); | |||
376 | * that its ->make_request_fn will not re-add plugging prior to calling | 277 | * that its ->make_request_fn will not re-add plugging prior to calling |
377 | * this function. | 278 | * this function. |
378 | * | 279 | * |
280 | * This function does not cancel any asynchronous activity arising | ||
281 | * out of elevator or throttling code. That would require elevator_exit() | ||
282 | * and blk_throtl_exit() to be called with queue lock initialized. | ||
283 | * | ||
379 | */ | 284 | */ |
380 | void blk_sync_queue(struct request_queue *q) | 285 | void blk_sync_queue(struct request_queue *q) |
381 | { | 286 | { |
382 | del_timer_sync(&q->unplug_timer); | ||
383 | del_timer_sync(&q->timeout); | 287 | del_timer_sync(&q->timeout); |
384 | cancel_work_sync(&q->unplug_work); | 288 | cancel_delayed_work_sync(&q->delay_work); |
385 | } | 289 | } |
386 | EXPORT_SYMBOL(blk_sync_queue); | 290 | EXPORT_SYMBOL(blk_sync_queue); |
387 | 291 | ||
@@ -392,31 +296,32 @@ EXPORT_SYMBOL(blk_sync_queue); | |||
392 | * Description: | 296 | * Description: |
393 | * See @blk_run_queue. This variant must be called with the queue lock | 297 | * See @blk_run_queue. This variant must be called with the queue lock |
394 | * held and interrupts disabled. | 298 | * held and interrupts disabled. |
395 | * | ||
396 | */ | 299 | */ |
397 | void __blk_run_queue(struct request_queue *q) | 300 | void __blk_run_queue(struct request_queue *q) |
398 | { | 301 | { |
399 | blk_remove_plug(q); | ||
400 | |||
401 | if (unlikely(blk_queue_stopped(q))) | 302 | if (unlikely(blk_queue_stopped(q))) |
402 | return; | 303 | return; |
403 | 304 | ||
404 | if (elv_queue_empty(q)) | 305 | q->request_fn(q); |
405 | return; | 306 | } |
307 | EXPORT_SYMBOL(__blk_run_queue); | ||
406 | 308 | ||
407 | /* | 309 | /** |
408 | * Only recurse once to avoid overrunning the stack, let the unplug | 310 | * blk_run_queue_async - run a single device queue in workqueue context |
409 | * handling reinvoke the handler shortly if we already got there. | 311 | * @q: The queue to run |
410 | */ | 312 | * |
411 | if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { | 313 | * Description: |
412 | q->request_fn(q); | 314 | * Tells kblockd to perform the equivalent of @blk_run_queue on behalf |
413 | queue_flag_clear(QUEUE_FLAG_REENTER, q); | 315 | * of us. |
414 | } else { | 316 | */ |
415 | queue_flag_set(QUEUE_FLAG_PLUGGED, q); | 317 | void blk_run_queue_async(struct request_queue *q) |
416 | kblockd_schedule_work(q, &q->unplug_work); | 318 | { |
319 | if (likely(!blk_queue_stopped(q))) { | ||
320 | __cancel_delayed_work(&q->delay_work); | ||
321 | queue_delayed_work(kblockd_workqueue, &q->delay_work, 0); | ||
417 | } | 322 | } |
418 | } | 323 | } |
419 | EXPORT_SYMBOL(__blk_run_queue); | 324 | EXPORT_SYMBOL(blk_run_queue_async); |
420 | 325 | ||
421 | /** | 326 | /** |
422 | * blk_run_queue - run a single device queue | 327 | * blk_run_queue - run a single device queue |
@@ -440,7 +345,13 @@ void blk_put_queue(struct request_queue *q) | |||
440 | { | 345 | { |
441 | kobject_put(&q->kobj); | 346 | kobject_put(&q->kobj); |
442 | } | 347 | } |
348 | EXPORT_SYMBOL(blk_put_queue); | ||
443 | 349 | ||
350 | /* | ||
351 | * Note: If a driver supplied the queue lock, it should not zap that lock | ||
352 | * unexpectedly as some queue cleanup components like elevator_exit() and | ||
353 | * blk_throtl_exit() need queue lock. | ||
354 | */ | ||
444 | void blk_cleanup_queue(struct request_queue *q) | 355 | void blk_cleanup_queue(struct request_queue *q) |
445 | { | 356 | { |
446 | /* | 357 | /* |
@@ -459,6 +370,8 @@ void blk_cleanup_queue(struct request_queue *q) | |||
459 | if (q->elevator) | 370 | if (q->elevator) |
460 | elevator_exit(q->elevator); | 371 | elevator_exit(q->elevator); |
461 | 372 | ||
373 | blk_throtl_exit(q); | ||
374 | |||
462 | blk_put_queue(q); | 375 | blk_put_queue(q); |
463 | } | 376 | } |
464 | EXPORT_SYMBOL(blk_cleanup_queue); | 377 | EXPORT_SYMBOL(blk_cleanup_queue); |
@@ -501,8 +414,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) | |||
501 | if (!q) | 414 | if (!q) |
502 | return NULL; | 415 | return NULL; |
503 | 416 | ||
504 | q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug; | ||
505 | q->backing_dev_info.unplug_io_data = q; | ||
506 | q->backing_dev_info.ra_pages = | 417 | q->backing_dev_info.ra_pages = |
507 | (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; | 418 | (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; |
508 | q->backing_dev_info.state = 0; | 419 | q->backing_dev_info.state = 0; |
@@ -515,18 +426,31 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) | |||
515 | return NULL; | 426 | return NULL; |
516 | } | 427 | } |
517 | 428 | ||
429 | if (blk_throtl_init(q)) { | ||
430 | kmem_cache_free(blk_requestq_cachep, q); | ||
431 | return NULL; | ||
432 | } | ||
433 | |||
518 | setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, | 434 | setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, |
519 | laptop_mode_timer_fn, (unsigned long) q); | 435 | laptop_mode_timer_fn, (unsigned long) q); |
520 | init_timer(&q->unplug_timer); | ||
521 | setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); | 436 | setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); |
522 | INIT_LIST_HEAD(&q->timeout_list); | 437 | INIT_LIST_HEAD(&q->timeout_list); |
523 | INIT_WORK(&q->unplug_work, blk_unplug_work); | 438 | INIT_LIST_HEAD(&q->flush_queue[0]); |
439 | INIT_LIST_HEAD(&q->flush_queue[1]); | ||
440 | INIT_LIST_HEAD(&q->flush_data_in_flight); | ||
441 | INIT_DELAYED_WORK(&q->delay_work, blk_delay_work); | ||
524 | 442 | ||
525 | kobject_init(&q->kobj, &blk_queue_ktype); | 443 | kobject_init(&q->kobj, &blk_queue_ktype); |
526 | 444 | ||
527 | mutex_init(&q->sysfs_lock); | 445 | mutex_init(&q->sysfs_lock); |
528 | spin_lock_init(&q->__queue_lock); | 446 | spin_lock_init(&q->__queue_lock); |
529 | 447 | ||
448 | /* | ||
449 | * By default initialize queue_lock to internal lock and driver can | ||
450 | * override it later if need be. | ||
451 | */ | ||
452 | q->queue_lock = &q->__queue_lock; | ||
453 | |||
530 | return q; | 454 | return q; |
531 | } | 455 | } |
532 | EXPORT_SYMBOL(blk_alloc_queue_node); | 456 | EXPORT_SYMBOL(blk_alloc_queue_node); |
@@ -609,9 +533,11 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn, | |||
609 | q->request_fn = rfn; | 533 | q->request_fn = rfn; |
610 | q->prep_rq_fn = NULL; | 534 | q->prep_rq_fn = NULL; |
611 | q->unprep_rq_fn = NULL; | 535 | q->unprep_rq_fn = NULL; |
612 | q->unplug_fn = generic_unplug_device; | ||
613 | q->queue_flags = QUEUE_FLAG_DEFAULT; | 536 | q->queue_flags = QUEUE_FLAG_DEFAULT; |
614 | q->queue_lock = lock; | 537 | |
538 | /* Override internal queue lock with supplied lock pointer */ | ||
539 | if (lock) | ||
540 | q->queue_lock = lock; | ||
615 | 541 | ||
616 | /* | 542 | /* |
617 | * This also sets hw/phys segments, boundary and size | 543 | * This also sets hw/phys segments, boundary and size |
@@ -641,6 +567,7 @@ int blk_get_queue(struct request_queue *q) | |||
641 | 567 | ||
642 | return 1; | 568 | return 1; |
643 | } | 569 | } |
570 | EXPORT_SYMBOL(blk_get_queue); | ||
644 | 571 | ||
645 | static inline void blk_free_request(struct request_queue *q, struct request *rq) | 572 | static inline void blk_free_request(struct request_queue *q, struct request *rq) |
646 | { | 573 | { |
@@ -740,6 +667,25 @@ static void freed_request(struct request_queue *q, int sync, int priv) | |||
740 | } | 667 | } |
741 | 668 | ||
742 | /* | 669 | /* |
670 | * Determine if elevator data should be initialized when allocating the | ||
671 | * request associated with @bio. | ||
672 | */ | ||
673 | static bool blk_rq_should_init_elevator(struct bio *bio) | ||
674 | { | ||
675 | if (!bio) | ||
676 | return true; | ||
677 | |||
678 | /* | ||
679 | * Flush requests do not use the elevator so skip initialization. | ||
680 | * This allows a request to share the flush and elevator data. | ||
681 | */ | ||
682 | if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) | ||
683 | return false; | ||
684 | |||
685 | return true; | ||
686 | } | ||
687 | |||
688 | /* | ||
743 | * Get a free request, queue_lock must be held. | 689 | * Get a free request, queue_lock must be held. |
744 | * Returns NULL on failure, with queue_lock held. | 690 | * Returns NULL on failure, with queue_lock held. |
745 | * Returns !NULL on success, with queue_lock *not held*. | 691 | * Returns !NULL on success, with queue_lock *not held*. |
@@ -751,7 +697,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags, | |||
751 | struct request_list *rl = &q->rq; | 697 | struct request_list *rl = &q->rq; |
752 | struct io_context *ioc = NULL; | 698 | struct io_context *ioc = NULL; |
753 | const bool is_sync = rw_is_sync(rw_flags) != 0; | 699 | const bool is_sync = rw_is_sync(rw_flags) != 0; |
754 | int may_queue, priv; | 700 | int may_queue, priv = 0; |
755 | 701 | ||
756 | may_queue = elv_may_queue(q, rw_flags); | 702 | may_queue = elv_may_queue(q, rw_flags); |
757 | if (may_queue == ELV_MQUEUE_NO) | 703 | if (may_queue == ELV_MQUEUE_NO) |
@@ -795,9 +741,11 @@ static struct request *get_request(struct request_queue *q, int rw_flags, | |||
795 | rl->count[is_sync]++; | 741 | rl->count[is_sync]++; |
796 | rl->starved[is_sync] = 0; | 742 | rl->starved[is_sync] = 0; |
797 | 743 | ||
798 | priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); | 744 | if (blk_rq_should_init_elevator(bio)) { |
799 | if (priv) | 745 | priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); |
800 | rl->elvpriv++; | 746 | if (priv) |
747 | rl->elvpriv++; | ||
748 | } | ||
801 | 749 | ||
802 | if (blk_queue_io_stat(q)) | 750 | if (blk_queue_io_stat(q)) |
803 | rw_flags |= REQ_IO_STAT; | 751 | rw_flags |= REQ_IO_STAT; |
@@ -844,8 +792,8 @@ out: | |||
844 | } | 792 | } |
845 | 793 | ||
846 | /* | 794 | /* |
847 | * No available requests for this queue, unplug the device and wait for some | 795 | * No available requests for this queue, wait for some requests to become |
848 | * requests to become available. | 796 | * available. |
849 | * | 797 | * |
850 | * Called with q->queue_lock held, and returns with it unlocked. | 798 | * Called with q->queue_lock held, and returns with it unlocked. |
851 | */ | 799 | */ |
@@ -866,7 +814,6 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags, | |||
866 | 814 | ||
867 | trace_block_sleeprq(q, bio, rw_flags & 1); | 815 | trace_block_sleeprq(q, bio, rw_flags & 1); |
868 | 816 | ||
869 | __generic_unplug_device(q); | ||
870 | spin_unlock_irq(q->queue_lock); | 817 | spin_unlock_irq(q->queue_lock); |
871 | io_schedule(); | 818 | io_schedule(); |
872 | 819 | ||
@@ -988,6 +935,13 @@ void blk_requeue_request(struct request_queue *q, struct request *rq) | |||
988 | } | 935 | } |
989 | EXPORT_SYMBOL(blk_requeue_request); | 936 | EXPORT_SYMBOL(blk_requeue_request); |
990 | 937 | ||
938 | static void add_acct_request(struct request_queue *q, struct request *rq, | ||
939 | int where) | ||
940 | { | ||
941 | drive_stat_acct(rq, 1); | ||
942 | __elv_add_request(q, rq, where); | ||
943 | } | ||
944 | |||
991 | /** | 945 | /** |
992 | * blk_insert_request - insert a special request into a request queue | 946 | * blk_insert_request - insert a special request into a request queue |
993 | * @q: request queue where request should be inserted | 947 | * @q: request queue where request should be inserted |
@@ -1030,29 +984,12 @@ void blk_insert_request(struct request_queue *q, struct request *rq, | |||
1030 | if (blk_rq_tagged(rq)) | 984 | if (blk_rq_tagged(rq)) |
1031 | blk_queue_end_tag(q, rq); | 985 | blk_queue_end_tag(q, rq); |
1032 | 986 | ||
1033 | drive_stat_acct(rq, 1); | 987 | add_acct_request(q, rq, where); |
1034 | __elv_add_request(q, rq, where, 0); | ||
1035 | __blk_run_queue(q); | 988 | __blk_run_queue(q); |
1036 | spin_unlock_irqrestore(q->queue_lock, flags); | 989 | spin_unlock_irqrestore(q->queue_lock, flags); |
1037 | } | 990 | } |
1038 | EXPORT_SYMBOL(blk_insert_request); | 991 | EXPORT_SYMBOL(blk_insert_request); |
1039 | 992 | ||
1040 | /* | ||
1041 | * add-request adds a request to the linked list. | ||
1042 | * queue lock is held and interrupts disabled, as we muck with the | ||
1043 | * request queue list. | ||
1044 | */ | ||
1045 | static inline void add_request(struct request_queue *q, struct request *req) | ||
1046 | { | ||
1047 | drive_stat_acct(req, 1); | ||
1048 | |||
1049 | /* | ||
1050 | * elevator indicated where it wants this request to be | ||
1051 | * inserted at elevator_merge time | ||
1052 | */ | ||
1053 | __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0); | ||
1054 | } | ||
1055 | |||
1056 | static void part_round_stats_single(int cpu, struct hd_struct *part, | 993 | static void part_round_stats_single(int cpu, struct hd_struct *part, |
1057 | unsigned long now) | 994 | unsigned long now) |
1058 | { | 995 | { |
@@ -1168,6 +1105,96 @@ void blk_add_request_payload(struct request *rq, struct page *page, | |||
1168 | } | 1105 | } |
1169 | EXPORT_SYMBOL_GPL(blk_add_request_payload); | 1106 | EXPORT_SYMBOL_GPL(blk_add_request_payload); |
1170 | 1107 | ||
1108 | static bool bio_attempt_back_merge(struct request_queue *q, struct request *req, | ||
1109 | struct bio *bio) | ||
1110 | { | ||
1111 | const int ff = bio->bi_rw & REQ_FAILFAST_MASK; | ||
1112 | |||
1113 | if (!ll_back_merge_fn(q, req, bio)) | ||
1114 | return false; | ||
1115 | |||
1116 | trace_block_bio_backmerge(q, bio); | ||
1117 | |||
1118 | if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) | ||
1119 | blk_rq_set_mixed_merge(req); | ||
1120 | |||
1121 | req->biotail->bi_next = bio; | ||
1122 | req->biotail = bio; | ||
1123 | req->__data_len += bio->bi_size; | ||
1124 | req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); | ||
1125 | |||
1126 | drive_stat_acct(req, 0); | ||
1127 | elv_bio_merged(q, req, bio); | ||
1128 | return true; | ||
1129 | } | ||
1130 | |||
1131 | static bool bio_attempt_front_merge(struct request_queue *q, | ||
1132 | struct request *req, struct bio *bio) | ||
1133 | { | ||
1134 | const int ff = bio->bi_rw & REQ_FAILFAST_MASK; | ||
1135 | |||
1136 | if (!ll_front_merge_fn(q, req, bio)) | ||
1137 | return false; | ||
1138 | |||
1139 | trace_block_bio_frontmerge(q, bio); | ||
1140 | |||
1141 | if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) | ||
1142 | blk_rq_set_mixed_merge(req); | ||
1143 | |||
1144 | bio->bi_next = req->bio; | ||
1145 | req->bio = bio; | ||
1146 | |||
1147 | /* | ||
1148 | * may not be valid. if the low level driver said | ||
1149 | * it didn't need a bounce buffer then it better | ||
1150 | * not touch req->buffer either... | ||
1151 | */ | ||
1152 | req->buffer = bio_data(bio); | ||
1153 | req->__sector = bio->bi_sector; | ||
1154 | req->__data_len += bio->bi_size; | ||
1155 | req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); | ||
1156 | |||
1157 | drive_stat_acct(req, 0); | ||
1158 | elv_bio_merged(q, req, bio); | ||
1159 | return true; | ||
1160 | } | ||
1161 | |||
1162 | /* | ||
1163 | * Attempts to merge with the plugged list in the current process. Returns | ||
1164 | * true if merge was successful, otherwise false. | ||
1165 | */ | ||
1166 | static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q, | ||
1167 | struct bio *bio) | ||
1168 | { | ||
1169 | struct blk_plug *plug; | ||
1170 | struct request *rq; | ||
1171 | bool ret = false; | ||
1172 | |||
1173 | plug = tsk->plug; | ||
1174 | if (!plug) | ||
1175 | goto out; | ||
1176 | |||
1177 | list_for_each_entry_reverse(rq, &plug->list, queuelist) { | ||
1178 | int el_ret; | ||
1179 | |||
1180 | if (rq->q != q) | ||
1181 | continue; | ||
1182 | |||
1183 | el_ret = elv_try_merge(rq, bio); | ||
1184 | if (el_ret == ELEVATOR_BACK_MERGE) { | ||
1185 | ret = bio_attempt_back_merge(q, rq, bio); | ||
1186 | if (ret) | ||
1187 | break; | ||
1188 | } else if (el_ret == ELEVATOR_FRONT_MERGE) { | ||
1189 | ret = bio_attempt_front_merge(q, rq, bio); | ||
1190 | if (ret) | ||
1191 | break; | ||
1192 | } | ||
1193 | } | ||
1194 | out: | ||
1195 | return ret; | ||
1196 | } | ||
1197 | |||
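attempt_plug_merge() walks the submitter's on-stack plug list, so back and front merges can now happen before the queue lock is ever taken. That only helps when submitters batch their bios; a hedged sketch of the submitter side, using blk_start_plug()/blk_finish_plug(), which are introduced elsewhere in this series and are not visible in this hunk:

    #include <linux/bio.h>
    #include <linux/blkdev.h>
    #include <linux/fs.h>

    /* Submit a batch of prepared bios so they can merge on the plug list. */
    static void example_submit_batch(struct bio **bios, int nr, int rw)
    {
            struct blk_plug plug;
            int i;

            blk_start_plug(&plug);          /* current->plug = &plug */
            for (i = 0; i < nr; i++)
                    submit_bio(rw, bios[i]);
            blk_finish_plug(&plug);         /* flush plugged requests to queues */
    }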
1171 | void init_request_from_bio(struct request *req, struct bio *bio) | 1198 | void init_request_from_bio(struct request *req, struct bio *bio) |
1172 | { | 1199 | { |
1173 | req->cpu = bio->bi_comp_cpu; | 1200 | req->cpu = bio->bi_comp_cpu; |
@@ -1183,31 +1210,13 @@ void init_request_from_bio(struct request *req, struct bio *bio) | |||
1183 | blk_rq_bio_prep(req->q, req, bio); | 1210 | blk_rq_bio_prep(req->q, req, bio); |
1184 | } | 1211 | } |
1185 | 1212 | ||
1186 | /* | ||
1187 | * Only disabling plugging for non-rotational devices if it does tagging | ||
1188 | * as well, otherwise we do need the proper merging | ||
1189 | */ | ||
1190 | static inline bool queue_should_plug(struct request_queue *q) | ||
1191 | { | ||
1192 | return !(blk_queue_nonrot(q) && blk_queue_tagged(q)); | ||
1193 | } | ||
1194 | |||
1195 | static int __make_request(struct request_queue *q, struct bio *bio) | 1213 | static int __make_request(struct request_queue *q, struct bio *bio) |
1196 | { | 1214 | { |
1197 | struct request *req; | ||
1198 | int el_ret; | ||
1199 | unsigned int bytes = bio->bi_size; | ||
1200 | const unsigned short prio = bio_prio(bio); | ||
1201 | const bool sync = !!(bio->bi_rw & REQ_SYNC); | 1215 | const bool sync = !!(bio->bi_rw & REQ_SYNC); |
1202 | const bool unplug = !!(bio->bi_rw & REQ_UNPLUG); | 1216 | struct blk_plug *plug; |
1203 | const unsigned long ff = bio->bi_rw & REQ_FAILFAST_MASK; | 1217 | int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT; |
1204 | int rw_flags; | 1218 | struct request *req; |
1205 | 1219 | ||
1206 | if ((bio->bi_rw & REQ_HARDBARRIER) && | ||
1207 | (q->next_ordered == QUEUE_ORDERED_NONE)) { | ||
1208 | bio_endio(bio, -EOPNOTSUPP); | ||
1209 | return 0; | ||
1210 | } | ||
1211 | /* | 1220 | /* |
1212 | * low level driver can indicate that it wants pages above a | 1221 | * low level driver can indicate that it wants pages above a |
1213 | * certain limit bounced to low memory (ie for highmem, or even | 1222 | * certain limit bounced to low memory (ie for highmem, or even |
@@ -1215,73 +1224,34 @@ static int __make_request(struct request_queue *q, struct bio *bio) | |||
1215 | */ | 1224 | */ |
1216 | blk_queue_bounce(q, &bio); | 1225 | blk_queue_bounce(q, &bio); |
1217 | 1226 | ||
1218 | spin_lock_irq(q->queue_lock); | 1227 | if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { |
1219 | 1228 | spin_lock_irq(q->queue_lock); | |
1220 | if (unlikely((bio->bi_rw & REQ_HARDBARRIER)) || elv_queue_empty(q)) | 1229 | where = ELEVATOR_INSERT_FLUSH; |
1221 | goto get_rq; | 1230 | goto get_rq; |
1231 | } | ||
1222 | 1232 | ||
1223 | el_ret = elv_merge(q, &req, bio); | 1233 | /* |
1224 | switch (el_ret) { | 1234 | * Check if we can merge with the plugged list before grabbing |
1225 | case ELEVATOR_BACK_MERGE: | 1235 | * any locks. |
1226 | BUG_ON(!rq_mergeable(req)); | 1236 | */ |
1227 | 1237 | if (attempt_plug_merge(current, q, bio)) | |
1228 | if (!ll_back_merge_fn(q, req, bio)) | ||
1229 | break; | ||
1230 | |||
1231 | trace_block_bio_backmerge(q, bio); | ||
1232 | |||
1233 | if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) | ||
1234 | blk_rq_set_mixed_merge(req); | ||
1235 | |||
1236 | req->biotail->bi_next = bio; | ||
1237 | req->biotail = bio; | ||
1238 | req->__data_len += bytes; | ||
1239 | req->ioprio = ioprio_best(req->ioprio, prio); | ||
1240 | if (!blk_rq_cpu_valid(req)) | ||
1241 | req->cpu = bio->bi_comp_cpu; | ||
1242 | drive_stat_acct(req, 0); | ||
1243 | elv_bio_merged(q, req, bio); | ||
1244 | if (!attempt_back_merge(q, req)) | ||
1245 | elv_merged_request(q, req, el_ret); | ||
1246 | goto out; | 1238 | goto out; |
1247 | 1239 | ||
1248 | case ELEVATOR_FRONT_MERGE: | 1240 | spin_lock_irq(q->queue_lock); |
1249 | BUG_ON(!rq_mergeable(req)); | ||
1250 | |||
1251 | if (!ll_front_merge_fn(q, req, bio)) | ||
1252 | break; | ||
1253 | |||
1254 | trace_block_bio_frontmerge(q, bio); | ||
1255 | 1241 | ||
1256 | if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) { | 1242 | el_ret = elv_merge(q, &req, bio); |
1257 | blk_rq_set_mixed_merge(req); | 1243 | if (el_ret == ELEVATOR_BACK_MERGE) { |
1258 | req->cmd_flags &= ~REQ_FAILFAST_MASK; | 1244 | if (bio_attempt_back_merge(q, req, bio)) { |
1259 | req->cmd_flags |= ff; | 1245 | if (!attempt_back_merge(q, req)) |
1246 | elv_merged_request(q, req, el_ret); | ||
1247 | goto out_unlock; | ||
1248 | } | ||
1249 | } else if (el_ret == ELEVATOR_FRONT_MERGE) { | ||
1250 | if (bio_attempt_front_merge(q, req, bio)) { | ||
1251 | if (!attempt_front_merge(q, req)) | ||
1252 | elv_merged_request(q, req, el_ret); | ||
1253 | goto out_unlock; | ||
1260 | } | 1254 | } |
1261 | |||
1262 | bio->bi_next = req->bio; | ||
1263 | req->bio = bio; | ||
1264 | |||
1265 | /* | ||
1266 | * may not be valid. if the low level driver said | ||
1267 | * it didn't need a bounce buffer then it better | ||
1268 | * not touch req->buffer either... | ||
1269 | */ | ||
1270 | req->buffer = bio_data(bio); | ||
1271 | req->__sector = bio->bi_sector; | ||
1272 | req->__data_len += bytes; | ||
1273 | req->ioprio = ioprio_best(req->ioprio, prio); | ||
1274 | if (!blk_rq_cpu_valid(req)) | ||
1275 | req->cpu = bio->bi_comp_cpu; | ||
1276 | drive_stat_acct(req, 0); | ||
1277 | elv_bio_merged(q, req, bio); | ||
1278 | if (!attempt_front_merge(q, req)) | ||
1279 | elv_merged_request(q, req, el_ret); | ||
1280 | goto out; | ||
1281 | |||
1282 | /* ELV_NO_MERGE: elevator says don't/can't merge. */ | ||
1283 | default: | ||
1284 | ; | ||
1285 | } | 1255 | } |
1286 | 1256 | ||
1287 | get_rq: | 1257 | get_rq: |
@@ -1308,17 +1278,39 @@ get_rq: | |||
1308 | */ | 1278 | */ |
1309 | init_request_from_bio(req, bio); | 1279 | init_request_from_bio(req, bio); |
1310 | 1280 | ||
1311 | spin_lock_irq(q->queue_lock); | ||
1312 | if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || | 1281 | if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || |
1313 | bio_flagged(bio, BIO_CPU_AFFINE)) | 1282 | bio_flagged(bio, BIO_CPU_AFFINE)) { |
1314 | req->cpu = blk_cpu_to_group(smp_processor_id()); | 1283 | req->cpu = blk_cpu_to_group(get_cpu()); |
1315 | if (queue_should_plug(q) && elv_queue_empty(q)) | 1284 | put_cpu(); |
1316 | blk_plug_device(q); | 1285 | } |
1317 | add_request(q, req); | 1286 | |
1287 | plug = current->plug; | ||
1288 | if (plug) { | ||
1289 | /* | ||
1290 | * If this is the first request added after a plug, fire | ||
1291 | * off a plug trace. If others have been added before, check | ||
1292 | * if we have multiple devices in this plug. If so, make a | ||
1293 | * note to sort the list before dispatch. | ||
1294 | */ | ||
1295 | if (list_empty(&plug->list)) | ||
1296 | trace_block_plug(q); | ||
1297 | else if (!plug->should_sort) { | ||
1298 | struct request *__rq; | ||
1299 | |||
1300 | __rq = list_entry_rq(plug->list.prev); | ||
1301 | if (__rq->q != q) | ||
1302 | plug->should_sort = 1; | ||
1303 | } | ||
1304 | list_add_tail(&req->queuelist, &plug->list); | ||
1305 | drive_stat_acct(req, 1); | ||
1306 | } else { | ||
1307 | spin_lock_irq(q->queue_lock); | ||
1308 | add_acct_request(q, req, where); | ||
1309 | __blk_run_queue(q); | ||
1310 | out_unlock: | ||
1311 | spin_unlock_irq(q->queue_lock); | ||
1312 | } | ||
1318 | out: | 1313 | out: |
1319 | if (unplug || !queue_should_plug(q)) | ||
1320 | __generic_unplug_device(q); | ||
1321 | spin_unlock_irq(q->queue_lock); | ||
1322 | return 0; | 1314 | return 0; |
1323 | } | 1315 | } |
1324 | 1316 | ||
@@ -1335,9 +1327,9 @@ static inline void blk_partition_remap(struct bio *bio) | |||
1335 | bio->bi_sector += p->start_sect; | 1327 | bio->bi_sector += p->start_sect; |
1336 | bio->bi_bdev = bdev->bd_contains; | 1328 | bio->bi_bdev = bdev->bd_contains; |
1337 | 1329 | ||
1338 | trace_block_remap(bdev_get_queue(bio->bi_bdev), bio, | 1330 | trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), bio, |
1339 | bdev->bd_dev, | 1331 | bdev->bd_dev, |
1340 | bio->bi_sector - p->start_sect); | 1332 | bio->bi_sector - p->start_sect); |
1341 | } | 1333 | } |
1342 | } | 1334 | } |
1343 | 1335 | ||
@@ -1350,7 +1342,7 @@ static void handle_bad_sector(struct bio *bio) | |||
1350 | bdevname(bio->bi_bdev, b), | 1342 | bdevname(bio->bi_bdev, b), |
1351 | bio->bi_rw, | 1343 | bio->bi_rw, |
1352 | (unsigned long long)bio->bi_sector + bio_sectors(bio), | 1344 | (unsigned long long)bio->bi_sector + bio_sectors(bio), |
1353 | (long long)(bio->bi_bdev->bd_inode->i_size >> 9)); | 1345 | (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9)); |
1354 | 1346 | ||
1355 | set_bit(BIO_EOF, &bio->bi_flags); | 1347 | set_bit(BIO_EOF, &bio->bi_flags); |
1356 | } | 1348 | } |
@@ -1403,7 +1395,7 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors) | |||
1403 | return 0; | 1395 | return 0; |
1404 | 1396 | ||
1405 | /* Test device or partition size, when known. */ | 1397 | /* Test device or partition size, when known. */ |
1406 | maxsector = bio->bi_bdev->bd_inode->i_size >> 9; | 1398 | maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9; |
1407 | if (maxsector) { | 1399 | if (maxsector) { |
1408 | sector_t sector = bio->bi_sector; | 1400 | sector_t sector = bio->bi_sector; |
1409 | 1401 | ||
@@ -1506,7 +1498,7 @@ static inline void __generic_make_request(struct bio *bio) | |||
1506 | goto end_io; | 1498 | goto end_io; |
1507 | 1499 | ||
1508 | if (old_sector != -1) | 1500 | if (old_sector != -1) |
1509 | trace_block_remap(q, bio, old_dev, old_sector); | 1501 | trace_block_bio_remap(q, bio, old_dev, old_sector); |
1510 | 1502 | ||
1511 | old_sector = bio->bi_sector; | 1503 | old_sector = bio->bi_sector; |
1512 | old_dev = bio->bi_bdev->bd_dev; | 1504 | old_dev = bio->bi_bdev->bd_dev; |
@@ -1514,6 +1506,19 @@ static inline void __generic_make_request(struct bio *bio) | |||
1514 | if (bio_check_eod(bio, nr_sectors)) | 1506 | if (bio_check_eod(bio, nr_sectors)) |
1515 | goto end_io; | 1507 | goto end_io; |
1516 | 1508 | ||
1509 | /* | ||
1510 | * Filter flush bio's early so that make_request based | ||
1511 | * drivers without flush support don't have to worry | ||
1512 | * about them. | ||
1513 | */ | ||
1514 | if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) { | ||
1515 | bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA); | ||
1516 | if (!nr_sectors) { | ||
1517 | err = 0; | ||
1518 | goto end_io; | ||
1519 | } | ||
1520 | } | ||
1521 | |||
1517 | if ((bio->bi_rw & REQ_DISCARD) && | 1522 | if ((bio->bi_rw & REQ_DISCARD) && |
1518 | (!blk_queue_discard(q) || | 1523 | (!blk_queue_discard(q) || |
1519 | ((bio->bi_rw & REQ_SECURE) && | 1524 | ((bio->bi_rw & REQ_SECURE) && |
@@ -1522,6 +1527,16 @@ static inline void __generic_make_request(struct bio *bio) | |||
1522 | goto end_io; | 1527 | goto end_io; |
1523 | } | 1528 | } |
1524 | 1529 | ||
1530 | if (blk_throtl_bio(q, &bio)) | ||
1531 | goto end_io; | ||
1532 | |||
1533 | /* | ||
1534 | * If bio is NULL, the bio has been throttled and will be submitted | ||
1535 | * later. | ||
1536 | */ | ||
1537 | if (!bio) | ||
1538 | break; | ||
1539 | |||
1525 | trace_block_bio_queue(q, bio); | 1540 | trace_block_bio_queue(q, bio); |
1526 | 1541 | ||
1527 | ret = q->make_request_fn(q, bio); | 1542 | ret = q->make_request_fn(q, bio); |
@@ -1612,11 +1627,12 @@ void submit_bio(int rw, struct bio *bio) | |||
1612 | 1627 | ||
1613 | if (unlikely(block_dump)) { | 1628 | if (unlikely(block_dump)) { |
1614 | char b[BDEVNAME_SIZE]; | 1629 | char b[BDEVNAME_SIZE]; |
1615 | printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n", | 1630 | printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n", |
1616 | current->comm, task_pid_nr(current), | 1631 | current->comm, task_pid_nr(current), |
1617 | (rw & WRITE) ? "WRITE" : "READ", | 1632 | (rw & WRITE) ? "WRITE" : "READ", |
1618 | (unsigned long long)bio->bi_sector, | 1633 | (unsigned long long)bio->bi_sector, |
1619 | bdevname(bio->bi_bdev, b)); | 1634 | bdevname(bio->bi_bdev, b), |
1635 | count); | ||
1620 | } | 1636 | } |
1621 | } | 1637 | } |
1622 | 1638 | ||
@@ -1637,7 +1653,7 @@ EXPORT_SYMBOL(submit_bio); | |||
1637 | * the insertion using this generic function. | 1653 | * the insertion using this generic function. |
1638 | * | 1654 | * |
1639 | * This function should also be useful for request stacking drivers | 1655 | * This function should also be useful for request stacking drivers |
1640 | * in some cases below, so export this fuction. | 1656 | * in some cases below, so export this function. |
1641 | * Request stacking drivers like request-based dm may change the queue | 1657 | * Request stacking drivers like request-based dm may change the queue |
1642 | * limits while requests are in the queue (e.g. dm's table swapping). | 1658 | * limits while requests are in the queue (e.g. dm's table swapping). |
1643 | * Such request stacking drivers should check those requests agaist | 1659 | * Such request stacking drivers should check those requests agaist |
@@ -1698,9 +1714,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) | |||
1698 | */ | 1714 | */ |
1699 | BUG_ON(blk_queued_rq(rq)); | 1715 | BUG_ON(blk_queued_rq(rq)); |
1700 | 1716 | ||
1701 | drive_stat_acct(rq, 1); | 1717 | add_acct_request(q, rq, ELEVATOR_INSERT_BACK); |
1702 | __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0); | ||
1703 | |||
1704 | spin_unlock_irqrestore(q->queue_lock, flags); | 1718 | spin_unlock_irqrestore(q->queue_lock, flags); |
1705 | 1719 | ||
1706 | return 0; | 1720 | return 0; |
@@ -1759,7 +1773,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) | |||
1759 | int cpu; | 1773 | int cpu; |
1760 | 1774 | ||
1761 | cpu = part_stat_lock(); | 1775 | cpu = part_stat_lock(); |
1762 | part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); | 1776 | part = req->part; |
1763 | part_stat_add(cpu, part, sectors[rw], bytes >> 9); | 1777 | part_stat_add(cpu, part, sectors[rw], bytes >> 9); |
1764 | part_stat_unlock(); | 1778 | part_stat_unlock(); |
1765 | } | 1779 | } |
@@ -1768,24 +1782,25 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) | |||
1768 | static void blk_account_io_done(struct request *req) | 1782 | static void blk_account_io_done(struct request *req) |
1769 | { | 1783 | { |
1770 | /* | 1784 | /* |
1771 | * Account IO completion. bar_rq isn't accounted as a normal | 1785 | * Account IO completion. flush_rq isn't accounted as a |
1772 | * IO on queueing nor completion. Accounting the containing | 1786 | * normal IO on queueing nor completion. Accounting the |
1773 | * request is enough. | 1787 | * containing request is enough. |
1774 | */ | 1788 | */ |
1775 | if (blk_do_io_stat(req) && req != &req->q->bar_rq) { | 1789 | if (blk_do_io_stat(req) && !(req->cmd_flags & REQ_FLUSH_SEQ)) { |
1776 | unsigned long duration = jiffies - req->start_time; | 1790 | unsigned long duration = jiffies - req->start_time; |
1777 | const int rw = rq_data_dir(req); | 1791 | const int rw = rq_data_dir(req); |
1778 | struct hd_struct *part; | 1792 | struct hd_struct *part; |
1779 | int cpu; | 1793 | int cpu; |
1780 | 1794 | ||
1781 | cpu = part_stat_lock(); | 1795 | cpu = part_stat_lock(); |
1782 | part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); | 1796 | part = req->part; |
1783 | 1797 | ||
1784 | part_stat_inc(cpu, part, ios[rw]); | 1798 | part_stat_inc(cpu, part, ios[rw]); |
1785 | part_stat_add(cpu, part, ticks[rw], duration); | 1799 | part_stat_add(cpu, part, ticks[rw], duration); |
1786 | part_round_stats(cpu, part); | 1800 | part_round_stats(cpu, part); |
1787 | part_dec_in_flight(part, rw); | 1801 | part_dec_in_flight(part, rw); |
1788 | 1802 | ||
1803 | hd_struct_put(part); | ||
1789 | part_stat_unlock(); | 1804 | part_stat_unlock(); |
1790 | } | 1805 | } |
1791 | } | 1806 | } |
@@ -2011,9 +2026,26 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) | |||
2011 | 2026 | ||
2012 | if (error && req->cmd_type == REQ_TYPE_FS && | 2027 | if (error && req->cmd_type == REQ_TYPE_FS && |
2013 | !(req->cmd_flags & REQ_QUIET)) { | 2028 | !(req->cmd_flags & REQ_QUIET)) { |
2014 | printk(KERN_ERR "end_request: I/O error, dev %s, sector %llu\n", | 2029 | char *error_type; |
2015 | req->rq_disk ? req->rq_disk->disk_name : "?", | 2030 | |
2016 | (unsigned long long)blk_rq_pos(req)); | 2031 | switch (error) { |
2032 | case -ENOLINK: | ||
2033 | error_type = "recoverable transport"; | ||
2034 | break; | ||
2035 | case -EREMOTEIO: | ||
2036 | error_type = "critical target"; | ||
2037 | break; | ||
2038 | case -EBADE: | ||
2039 | error_type = "critical nexus"; | ||
2040 | break; | ||
2041 | case -EIO: | ||
2042 | default: | ||
2043 | error_type = "I/O"; | ||
2044 | break; | ||
2045 | } | ||
2046 | printk(KERN_ERR "end_request: %s error, dev %s, sector %llu\n", | ||
2047 | error_type, req->rq_disk ? req->rq_disk->disk_name : "?", | ||
2048 | (unsigned long long)blk_rq_pos(req)); | ||
2017 | } | 2049 | } |
2018 | 2050 | ||
2019 | blk_account_io_completion(req, nr_bytes); | 2051 | blk_account_io_completion(req, nr_bytes); |
@@ -2111,7 +2143,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) | |||
2111 | * size, something has gone terribly wrong. | 2143 | * size, something has gone terribly wrong. |
2112 | */ | 2144 | */ |
2113 | if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { | 2145 | if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { |
2114 | printk(KERN_ERR "blk: request botched\n"); | 2146 | blk_dump_rq_flags(req, "request botched"); |
2115 | req->__data_len = blk_rq_cur_bytes(req); | 2147 | req->__data_len = blk_rq_cur_bytes(req); |
2116 | } | 2148 | } |
2117 | 2149 | ||
@@ -2497,9 +2529,7 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); | |||
2497 | static void __blk_rq_prep_clone(struct request *dst, struct request *src) | 2529 | static void __blk_rq_prep_clone(struct request *dst, struct request *src) |
2498 | { | 2530 | { |
2499 | dst->cpu = src->cpu; | 2531 | dst->cpu = src->cpu; |
2500 | dst->cmd_flags = (rq_data_dir(src) | REQ_NOMERGE); | 2532 | dst->cmd_flags = (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE; |
2501 | if (src->cmd_flags & REQ_DISCARD) | ||
2502 | dst->cmd_flags |= REQ_DISCARD; | ||
2503 | dst->cmd_type = src->cmd_type; | 2533 | dst->cmd_type = src->cmd_type; |
2504 | dst->__sector = blk_rq_pos(src); | 2534 | dst->__sector = blk_rq_pos(src); |
2505 | dst->__data_len = blk_rq_bytes(src); | 2535 | dst->__data_len = blk_rq_bytes(src); |
@@ -2579,12 +2609,171 @@ int kblockd_schedule_work(struct request_queue *q, struct work_struct *work) | |||
2579 | } | 2609 | } |
2580 | EXPORT_SYMBOL(kblockd_schedule_work); | 2610 | EXPORT_SYMBOL(kblockd_schedule_work); |
2581 | 2611 | ||
2612 | int kblockd_schedule_delayed_work(struct request_queue *q, | ||
2613 | struct delayed_work *dwork, unsigned long delay) | ||
2614 | { | ||
2615 | return queue_delayed_work(kblockd_workqueue, dwork, delay); | ||
2616 | } | ||
2617 | EXPORT_SYMBOL(kblockd_schedule_delayed_work); | ||
2618 | |||
2619 | #define PLUG_MAGIC 0x91827364 | ||
2620 | |||
2621 | void blk_start_plug(struct blk_plug *plug) | ||
2622 | { | ||
2623 | struct task_struct *tsk = current; | ||
2624 | |||
2625 | plug->magic = PLUG_MAGIC; | ||
2626 | INIT_LIST_HEAD(&plug->list); | ||
2627 | INIT_LIST_HEAD(&plug->cb_list); | ||
2628 | plug->should_sort = 0; | ||
2629 | |||
2630 | /* | ||
2631 | * If this is a nested plug, don't actually assign it. It will be | ||
2632 | * flushed on its own. | ||
2633 | */ | ||
2634 | if (!tsk->plug) { | ||
2635 | /* | ||
2636 | * Store ordering should not be needed here, since a potential | ||
2637 | * preempt will imply a full memory barrier | ||
2638 | */ | ||
2639 | tsk->plug = plug; | ||
2640 | } | ||
2641 | } | ||
2642 | EXPORT_SYMBOL(blk_start_plug); | ||
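A note on the nesting rule in blk_start_plug(): only the outermost plug is installed in the task, so requests issued by nested code land on the outer plug's list and are dispatched when the outer plug finishes. A hedged sketch with illustrative function names:

/*
 * Sketch: nested plugs. inner()'s plug is never assigned to
 * current->plug, so its blk_finish_plug() finds an empty list and the
 * actual dispatch happens when the outer plug is finished.
 */
static void inner(void)
{
	struct blk_plug plug;

	blk_start_plug(&plug);		/* outer plug already installed; this one stays local */
	/* submit_bio() calls here queue onto the *outer* plug */
	blk_finish_plug(&plug);
}

static void outer(void)
{
	struct blk_plug plug;

	blk_start_plug(&plug);		/* current->plug = &plug */
	inner();
	blk_finish_plug(&plug);		/* plugged requests go out here */
}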
2643 | |||
2644 | static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b) | ||
2645 | { | ||
2646 | struct request *rqa = container_of(a, struct request, queuelist); | ||
2647 | struct request *rqb = container_of(b, struct request, queuelist); | ||
2648 | |||
2649 | return !(rqa->q <= rqb->q); | ||
2650 | } | ||
2651 | |||
2652 | /* | ||
2653 | * If 'from_schedule' is true, then postpone the dispatch of requests | ||
2654 | * until a safe kblockd context. We do this to avoid accidental extra | ||
2655 | * stack usage in driver dispatch, in places where the original | ||
2656 | * plugger did not intend it. | ||
2657 | */ | ||
2658 | static void queue_unplugged(struct request_queue *q, unsigned int depth, | ||
2659 | bool from_schedule) | ||
2660 | __releases(q->queue_lock) | ||
2661 | { | ||
2662 | trace_block_unplug(q, depth, !from_schedule); | ||
2663 | |||
2664 | /* | ||
2665 | * If we are punting this to kblockd, then we can safely drop | ||
2666 | * the queue_lock before waking kblockd (which needs to take | ||
2667 | * this lock). | ||
2668 | */ | ||
2669 | if (from_schedule) { | ||
2670 | spin_unlock(q->queue_lock); | ||
2671 | blk_run_queue_async(q); | ||
2672 | } else { | ||
2673 | __blk_run_queue(q); | ||
2674 | spin_unlock(q->queue_lock); | ||
2675 | } | ||
2676 | |||
2677 | } | ||
2678 | |||
2679 | static void flush_plug_callbacks(struct blk_plug *plug) | ||
2680 | { | ||
2681 | LIST_HEAD(callbacks); | ||
2682 | |||
2683 | if (list_empty(&plug->cb_list)) | ||
2684 | return; | ||
2685 | |||
2686 | list_splice_init(&plug->cb_list, &callbacks); | ||
2687 | |||
2688 | while (!list_empty(&callbacks)) { | ||
2689 | struct blk_plug_cb *cb = list_first_entry(&callbacks, | ||
2690 | struct blk_plug_cb, | ||
2691 | list); | ||
2692 | list_del(&cb->list); | ||
2693 | cb->callback(cb); | ||
2694 | } | ||
2695 | } | ||
2696 | |||
2697 | void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) | ||
2698 | { | ||
2699 | struct request_queue *q; | ||
2700 | unsigned long flags; | ||
2701 | struct request *rq; | ||
2702 | LIST_HEAD(list); | ||
2703 | unsigned int depth; | ||
2704 | |||
2705 | BUG_ON(plug->magic != PLUG_MAGIC); | ||
2706 | |||
2707 | flush_plug_callbacks(plug); | ||
2708 | if (list_empty(&plug->list)) | ||
2709 | return; | ||
2710 | |||
2711 | list_splice_init(&plug->list, &list); | ||
2712 | |||
2713 | if (plug->should_sort) { | ||
2714 | list_sort(NULL, &list, plug_rq_cmp); | ||
2715 | plug->should_sort = 0; | ||
2716 | } | ||
2717 | |||
2718 | q = NULL; | ||
2719 | depth = 0; | ||
2720 | |||
2721 | /* | ||
2722 | * Save and disable interrupts here, to avoid doing it for every | ||
2723 | * queue lock we have to take. | ||
2724 | */ | ||
2725 | local_irq_save(flags); | ||
2726 | while (!list_empty(&list)) { | ||
2727 | rq = list_entry_rq(list.next); | ||
2728 | list_del_init(&rq->queuelist); | ||
2729 | BUG_ON(!rq->q); | ||
2730 | if (rq->q != q) { | ||
2731 | /* | ||
2732 | * This drops the queue lock | ||
2733 | */ | ||
2734 | if (q) | ||
2735 | queue_unplugged(q, depth, from_schedule); | ||
2736 | q = rq->q; | ||
2737 | depth = 0; | ||
2738 | spin_lock(q->queue_lock); | ||
2739 | } | ||
2740 | /* | ||
2741 | * rq is already accounted, so use raw insert | ||
2742 | */ | ||
2743 | if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) | ||
2744 | __elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH); | ||
2745 | else | ||
2746 | __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE); | ||
2747 | |||
2748 | depth++; | ||
2749 | } | ||
2750 | |||
2751 | /* | ||
2752 | * This drops the queue lock | ||
2753 | */ | ||
2754 | if (q) | ||
2755 | queue_unplugged(q, depth, from_schedule); | ||
2756 | |||
2757 | local_irq_restore(flags); | ||
2758 | } | ||
2759 | |||
2760 | void blk_finish_plug(struct blk_plug *plug) | ||
2761 | { | ||
2762 | blk_flush_plug_list(plug, false); | ||
2763 | |||
2764 | if (plug == current->plug) | ||
2765 | current->plug = NULL; | ||
2766 | } | ||
2767 | EXPORT_SYMBOL(blk_finish_plug); | ||
2768 | |||
2582 | int __init blk_dev_init(void) | 2769 | int __init blk_dev_init(void) |
2583 | { | 2770 | { |
2584 | BUILD_BUG_ON(__REQ_NR_BITS > 8 * | 2771 | BUILD_BUG_ON(__REQ_NR_BITS > 8 * |
2585 | sizeof(((struct request *)0)->cmd_flags)); | 2772 | sizeof(((struct request *)0)->cmd_flags)); |
2586 | 2773 | ||
2587 | kblockd_workqueue = create_workqueue("kblockd"); | 2774 | /* used for unplugging and affects IO latency/throughput - HIGHPRI */ |
2775 | kblockd_workqueue = alloc_workqueue("kblockd", | ||
2776 | WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); | ||
2588 | if (!kblockd_workqueue) | 2777 | if (!kblockd_workqueue) |
2589 | panic("Failed to create kblockd\n"); | 2778 | panic("Failed to create kblockd\n"); |
2590 | 2779 | ||
diff --git a/block/blk-exec.c b/block/blk-exec.c index e1672f14840e..8a0e7ec056e7 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c | |||
@@ -54,9 +54,9 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, | |||
54 | rq->end_io = done; | 54 | rq->end_io = done; |
55 | WARN_ON(irqs_disabled()); | 55 | WARN_ON(irqs_disabled()); |
56 | spin_lock_irq(q->queue_lock); | 56 | spin_lock_irq(q->queue_lock); |
57 | __elv_add_request(q, rq, where, 1); | 57 | __elv_add_request(q, rq, where); |
58 | __generic_unplug_device(q); | 58 | __blk_run_queue(q); |
59 | /* the queue is stopped so it won't be plugged+unplugged */ | 59 | /* the queue is stopped so it won't be run */ |
60 | if (rq->cmd_type == REQ_TYPE_PM_RESUME) | 60 | if (rq->cmd_type == REQ_TYPE_PM_RESUME) |
61 | q->request_fn(q); | 61 | q->request_fn(q); |
62 | spin_unlock_irq(q->queue_lock); | 62 | spin_unlock_irq(q->queue_lock); |
@@ -80,6 +80,7 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, | |||
80 | DECLARE_COMPLETION_ONSTACK(wait); | 80 | DECLARE_COMPLETION_ONSTACK(wait); |
81 | char sense[SCSI_SENSE_BUFFERSIZE]; | 81 | char sense[SCSI_SENSE_BUFFERSIZE]; |
82 | int err = 0; | 82 | int err = 0; |
83 | unsigned long hang_check; | ||
83 | 84 | ||
84 | /* | 85 | /* |
85 | * we need an extra reference to the request, so we can look at | 86 | * we need an extra reference to the request, so we can look at |
@@ -95,7 +96,13 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, | |||
95 | 96 | ||
96 | rq->end_io_data = &wait; | 97 | rq->end_io_data = &wait; |
97 | blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); | 98 | blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); |
98 | wait_for_completion(&wait); | 99 | |
100 | /* Prevent hang_check timer from firing at us during very long I/O */ | ||
101 | hang_check = sysctl_hung_task_timeout_secs; | ||
102 | if (hang_check) | ||
103 | while (!wait_for_completion_timeout(&wait, hang_check * (HZ/2))); | ||
104 | else | ||
105 | wait_for_completion(&wait); | ||
99 | 106 | ||
100 | if (rq->errors) | 107 | if (rq->errors) |
101 | err = -EIO; | 108 | err = -EIO; |
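The chunked wait added above exists because a single long wait_for_completion() on a slow request can trip the hung-task watchdog. The same pattern in isolation, as a sketch with illustrative names:

#include <linux/completion.h>
#include <linux/jiffies.h>

/*
 * Sketch: wait without upsetting the hung-task detector.
 * wait_for_completion_timeout() returns 0 on timeout, so the loop just
 * re-arms the wait in chunks of half the watchdog period.
 */
static void wait_quietly(struct completion *done, unsigned long hang_secs)
{
	if (hang_secs)
		while (!wait_for_completion_timeout(done, hang_secs * (HZ / 2)))
			;
	else
		wait_for_completion(done);
}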
diff --git a/block/blk-flush.c b/block/blk-flush.c new file mode 100644 index 000000000000..bb21e4c36f70 --- /dev/null +++ b/block/blk-flush.c | |||
@@ -0,0 +1,443 @@ | |||
1 | /* | ||
2 | * Functions to sequence FLUSH and FUA writes. | ||
3 | * | ||
4 | * Copyright (C) 2011 Max Planck Institute for Gravitational Physics | ||
5 | * Copyright (C) 2011 Tejun Heo <tj@kernel.org> | ||
6 | * | ||
7 | * This file is released under the GPLv2. | ||
8 | * | ||
9 | * REQ_{FLUSH|FUA} requests are decomposed to sequences consisted of three | ||
10 | * optional steps - PREFLUSH, DATA and POSTFLUSH - according to the request | ||
11 | * properties and hardware capability. | ||
12 | * | ||
13 | * If a request doesn't have data, only REQ_FLUSH makes sense, which | ||
14 | * indicates a simple flush request. If there is data, REQ_FLUSH indicates | ||
15 | * that the device cache should be flushed before the data is executed, and | ||
16 | * REQ_FUA means that the data must be on non-volatile media on request | ||
17 | * completion. | ||
18 | * | ||
19 | * If the device doesn't have writeback cache, FLUSH and FUA don't make any | ||
20 | * difference. The requests are either completed immediately if there's no | ||
21 | * data or executed as normal requests otherwise. | ||
22 | * | ||
23 | * If the device has writeback cache and supports FUA, REQ_FLUSH is | ||
24 | * translated to PREFLUSH but REQ_FUA is passed down directly with DATA. | ||
25 | * | ||
26 | * If the device has writeback cache and doesn't support FUA, REQ_FLUSH is | ||
27 | * translated to PREFLUSH and REQ_FUA to POSTFLUSH. | ||
28 | * | ||
29 | * The actual execution of flush is double buffered. Whenever a request | ||
30 | * needs to execute PRE or POSTFLUSH, it queues at | ||
31 | * q->flush_queue[q->flush_pending_idx]. Once certain criteria are met, a | ||
32 | * flush is issued and the pending_idx is toggled. When the flush | ||
34 | * completes, all the requests which were pending proceed to the next | ||
34 | * step. This allows arbitrary merging of different types of FLUSH/FUA | ||
35 | * requests. | ||
36 | * | ||
37 | * Currently, the following conditions are used to determine when to issue | ||
38 | * flush. | ||
39 | * | ||
40 | * C1. At any given time, only one flush shall be in progress. This makes | ||
41 | * double buffering sufficient. | ||
42 | * | ||
43 | * C2. Flush is deferred if any request is executing DATA of its sequence. | ||
44 | * This avoids issuing separate POSTFLUSHes for requests which shared | ||
45 | * PREFLUSH. | ||
46 | * | ||
47 | * C3. The second condition is ignored if there is a request which has | ||
48 | * waited longer than FLUSH_PENDING_TIMEOUT. This is to avoid | ||
49 | * starvation in the unlikely case where there is a continuous stream of | ||
50 | * FUA (without FLUSH) requests. | ||
51 | * | ||
52 | * For devices which support FUA, it isn't clear whether C2 (and thus C3) | ||
53 | * is beneficial. | ||
54 | * | ||
55 | * Note that a sequenced FLUSH/FUA request with DATA is completed twice. | ||
56 | * Once while executing DATA and again after the whole sequence is | ||
57 | * complete. The first completion updates the contained bio but doesn't | ||
58 | * finish it so that the bio submitter is notified only after the whole | ||
59 | * sequence is complete. This is implemented by testing REQ_FLUSH_SEQ in | ||
60 | * req_bio_endio(). | ||
61 | * | ||
62 | * The above peculiarity requires that each FLUSH/FUA request has only one | ||
63 | * bio attached to it, which is guaranteed as they aren't allowed to be | ||
64 | * merged in the usual way. | ||
65 | */ | ||
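From a submitter's point of view the sequencing described above is driven entirely by bio flags; callers never see the PREFLUSH/DATA/POSTFLUSH steps. A hedged sketch of a typical user follows; the helper and its arguments are illustrative, not part of this patch.

#include <linux/bio.h>
#include <linux/fs.h>

/*
 * Sketch: a journalling filesystem writing a commit block that must be
 * on stable media before the write is reported complete. WRITE_FLUSH_FUA
 * carries REQ_FLUSH | REQ_FUA; the code below decomposes it according
 * to q->flush_flags.
 */
static void write_commit_block(struct block_device *bdev, struct page *page,
			       sector_t sector, bio_end_io_t *end_io)
{
	struct bio *bio = bio_alloc(GFP_NOIO, 1);

	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	bio->bi_end_io = end_io;	/* caller's completion; it should bio_put() */
	bio_add_page(bio, page, PAGE_SIZE, 0);
	submit_bio(WRITE_FLUSH_FUA, bio);
}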
66 | |||
67 | #include <linux/kernel.h> | ||
68 | #include <linux/module.h> | ||
69 | #include <linux/bio.h> | ||
70 | #include <linux/blkdev.h> | ||
71 | #include <linux/gfp.h> | ||
72 | |||
73 | #include "blk.h" | ||
74 | |||
75 | /* FLUSH/FUA sequences */ | ||
76 | enum { | ||
77 | REQ_FSEQ_PREFLUSH = (1 << 0), /* pre-flushing in progress */ | ||
78 | REQ_FSEQ_DATA = (1 << 1), /* data write in progress */ | ||
79 | REQ_FSEQ_POSTFLUSH = (1 << 2), /* post-flushing in progress */ | ||
80 | REQ_FSEQ_DONE = (1 << 3), | ||
81 | |||
82 | REQ_FSEQ_ACTIONS = REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA | | ||
83 | REQ_FSEQ_POSTFLUSH, | ||
84 | |||
85 | /* | ||
86 | * If flush has been pending longer than the following timeout, | ||
87 | * it's issued even if flush_data requests are still in flight. | ||
88 | */ | ||
89 | FLUSH_PENDING_TIMEOUT = 5 * HZ, | ||
90 | }; | ||
91 | |||
92 | static bool blk_kick_flush(struct request_queue *q); | ||
93 | |||
94 | static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq) | ||
95 | { | ||
96 | unsigned int policy = 0; | ||
97 | |||
98 | if (fflags & REQ_FLUSH) { | ||
99 | if (rq->cmd_flags & REQ_FLUSH) | ||
100 | policy |= REQ_FSEQ_PREFLUSH; | ||
101 | if (blk_rq_sectors(rq)) | ||
102 | policy |= REQ_FSEQ_DATA; | ||
103 | if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA)) | ||
104 | policy |= REQ_FSEQ_POSTFLUSH; | ||
105 | } | ||
106 | return policy; | ||
107 | } | ||
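A worked example of the mapping, assuming a drive that advertises a volatile write cache but no FUA (so q->flush_flags == REQ_FLUSH); this is commentary, not code added by the patch.

/*
 * Example: rq->cmd_flags = REQ_WRITE | REQ_FLUSH | REQ_FUA with data,
 * fflags = REQ_FLUSH (cache, no FUA):
 *
 *   REQ_FLUSH set                  -> REQ_FSEQ_PREFLUSH
 *   blk_rq_sectors(rq) > 0         -> REQ_FSEQ_DATA
 *   REQ_FUA set, hardware lacks it -> REQ_FSEQ_POSTFLUSH
 *
 * With fflags = REQ_FLUSH | REQ_FUA the POSTFLUSH step is dropped and
 * REQ_FUA is passed through to the driver. With fflags = 0 the flags
 * are already stripped from the bio in __generic_make_request(), so the
 * write is queued as a normal request.
 */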
108 | |||
109 | static unsigned int blk_flush_cur_seq(struct request *rq) | ||
110 | { | ||
111 | return 1 << ffz(rq->flush.seq); | ||
112 | } | ||
113 | |||
114 | static void blk_flush_restore_request(struct request *rq) | ||
115 | { | ||
116 | /* | ||
117 | * After flush data completion, @rq->bio is %NULL but we need to | ||
118 | * complete the bio again. @rq->biotail is guaranteed to equal the | ||
119 | * original @rq->bio. Restore it. | ||
120 | */ | ||
121 | rq->bio = rq->biotail; | ||
122 | |||
123 | /* make @rq a normal request */ | ||
124 | rq->cmd_flags &= ~REQ_FLUSH_SEQ; | ||
125 | rq->end_io = NULL; | ||
126 | } | ||
127 | |||
128 | /** | ||
129 | * blk_flush_complete_seq - complete flush sequence | ||
130 | * @rq: FLUSH/FUA request being sequenced | ||
131 | * @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero) | ||
132 | * @error: whether an error occurred | ||
133 | * | ||
134 | * @rq just completed @seq part of its flush sequence, record the | ||
135 | * completion and trigger the next step. | ||
136 | * | ||
137 | * CONTEXT: | ||
138 | * spin_lock_irq(q->queue_lock) | ||
139 | * | ||
140 | * RETURNS: | ||
141 | * %true if requests were added to the dispatch queue, %false otherwise. | ||
142 | */ | ||
143 | static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, | ||
144 | int error) | ||
145 | { | ||
146 | struct request_queue *q = rq->q; | ||
147 | struct list_head *pending = &q->flush_queue[q->flush_pending_idx]; | ||
148 | bool queued = false; | ||
149 | |||
150 | BUG_ON(rq->flush.seq & seq); | ||
151 | rq->flush.seq |= seq; | ||
152 | |||
153 | if (likely(!error)) | ||
154 | seq = blk_flush_cur_seq(rq); | ||
155 | else | ||
156 | seq = REQ_FSEQ_DONE; | ||
157 | |||
158 | switch (seq) { | ||
159 | case REQ_FSEQ_PREFLUSH: | ||
160 | case REQ_FSEQ_POSTFLUSH: | ||
161 | /* queue for flush */ | ||
162 | if (list_empty(pending)) | ||
163 | q->flush_pending_since = jiffies; | ||
164 | list_move_tail(&rq->flush.list, pending); | ||
165 | break; | ||
166 | |||
167 | case REQ_FSEQ_DATA: | ||
168 | list_move_tail(&rq->flush.list, &q->flush_data_in_flight); | ||
169 | list_add(&rq->queuelist, &q->queue_head); | ||
170 | queued = true; | ||
171 | break; | ||
172 | |||
173 | case REQ_FSEQ_DONE: | ||
174 | /* | ||
175 | * @rq was previously adjusted by blk_flush_issue() for | ||
176 | * flush sequencing and may already have gone through the | ||
177 | * flush data request completion path. Restore @rq for | ||
178 | * normal completion and end it. | ||
179 | */ | ||
180 | BUG_ON(!list_empty(&rq->queuelist)); | ||
181 | list_del_init(&rq->flush.list); | ||
182 | blk_flush_restore_request(rq); | ||
183 | __blk_end_request_all(rq, error); | ||
184 | break; | ||
185 | |||
186 | default: | ||
187 | BUG(); | ||
188 | } | ||
189 | |||
190 | return blk_kick_flush(q) | queued; | ||
191 | } | ||
192 | |||
193 | static void flush_end_io(struct request *flush_rq, int error) | ||
194 | { | ||
195 | struct request_queue *q = flush_rq->q; | ||
196 | struct list_head *running = &q->flush_queue[q->flush_running_idx]; | ||
197 | bool queued = false; | ||
198 | struct request *rq, *n; | ||
199 | |||
200 | BUG_ON(q->flush_pending_idx == q->flush_running_idx); | ||
201 | |||
202 | /* account completion of the flush request */ | ||
203 | q->flush_running_idx ^= 1; | ||
204 | elv_completed_request(q, flush_rq); | ||
205 | |||
206 | /* and push the waiting requests to the next stage */ | ||
207 | list_for_each_entry_safe(rq, n, running, flush.list) { | ||
208 | unsigned int seq = blk_flush_cur_seq(rq); | ||
209 | |||
210 | BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH); | ||
211 | queued |= blk_flush_complete_seq(rq, seq, error); | ||
212 | } | ||
213 | |||
214 | /* | ||
215 | * Kick the queue to avoid a stall in two cases: | ||
216 | * 1. Moving a request silently to an empty queue_head may stall the | ||
217 | * queue. | ||
218 | * 2. When a flush request is running in a non-queueable queue, the | ||
219 | * queue is held. Restart the queue after the flush request finishes | ||
220 | * to avoid a stall. | ||
221 | * This function is called from request completion path and calling | ||
222 | * directly into request_fn may confuse the driver. Always use | ||
223 | * kblockd. | ||
224 | */ | ||
225 | if (queued || q->flush_queue_delayed) | ||
226 | blk_run_queue_async(q); | ||
227 | q->flush_queue_delayed = 0; | ||
228 | } | ||
229 | |||
230 | /** | ||
231 | * blk_kick_flush - consider issuing flush request | ||
232 | * @q: request_queue being kicked | ||
233 | * | ||
234 | * Flush related states of @q have changed, consider issuing flush request. | ||
235 | * Please read the comment at the top of this file for more info. | ||
236 | * | ||
237 | * CONTEXT: | ||
238 | * spin_lock_irq(q->queue_lock) | ||
239 | * | ||
240 | * RETURNS: | ||
241 | * %true if flush was issued, %false otherwise. | ||
242 | */ | ||
243 | static bool blk_kick_flush(struct request_queue *q) | ||
244 | { | ||
245 | struct list_head *pending = &q->flush_queue[q->flush_pending_idx]; | ||
246 | struct request *first_rq = | ||
247 | list_first_entry(pending, struct request, flush.list); | ||
248 | |||
249 | /* C1 described at the top of this file */ | ||
250 | if (q->flush_pending_idx != q->flush_running_idx || list_empty(pending)) | ||
251 | return false; | ||
252 | |||
253 | /* C2 and C3 */ | ||
254 | if (!list_empty(&q->flush_data_in_flight) && | ||
255 | time_before(jiffies, | ||
256 | q->flush_pending_since + FLUSH_PENDING_TIMEOUT)) | ||
257 | return false; | ||
258 | |||
259 | /* | ||
260 | * Issue flush and toggle pending_idx. This makes pending_idx | ||
261 | * different from running_idx, which means flush is in flight. | ||
262 | */ | ||
263 | blk_rq_init(q, &q->flush_rq); | ||
264 | q->flush_rq.cmd_type = REQ_TYPE_FS; | ||
265 | q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; | ||
266 | q->flush_rq.rq_disk = first_rq->rq_disk; | ||
267 | q->flush_rq.end_io = flush_end_io; | ||
268 | |||
269 | q->flush_pending_idx ^= 1; | ||
270 | list_add_tail(&q->flush_rq.queuelist, &q->queue_head); | ||
271 | return true; | ||
272 | } | ||
273 | |||
274 | static void flush_data_end_io(struct request *rq, int error) | ||
275 | { | ||
276 | struct request_queue *q = rq->q; | ||
277 | |||
278 | /* | ||
279 | * After populating an empty queue, kick it to avoid stall. Read | ||
280 | * the comment in flush_end_io(). | ||
281 | */ | ||
282 | if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error)) | ||
283 | blk_run_queue_async(q); | ||
284 | } | ||
285 | |||
286 | /** | ||
287 | * blk_insert_flush - insert a new FLUSH/FUA request | ||
288 | * @rq: request to insert | ||
289 | * | ||
290 | * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions. | ||
291 | * @rq is being submitted. Analyze what needs to be done and put it on the | ||
292 | * right queue. | ||
293 | * | ||
294 | * CONTEXT: | ||
295 | * spin_lock_irq(q->queue_lock) | ||
296 | */ | ||
297 | void blk_insert_flush(struct request *rq) | ||
298 | { | ||
299 | struct request_queue *q = rq->q; | ||
300 | unsigned int fflags = q->flush_flags; /* may change, cache */ | ||
301 | unsigned int policy = blk_flush_policy(fflags, rq); | ||
302 | |||
303 | BUG_ON(rq->end_io); | ||
304 | BUG_ON(!rq->bio || rq->bio != rq->biotail); | ||
305 | |||
306 | /* | ||
307 | * @policy now records what operations need to be done. Adjust | ||
308 | * REQ_FLUSH and FUA for the driver. | ||
309 | */ | ||
310 | rq->cmd_flags &= ~REQ_FLUSH; | ||
311 | if (!(fflags & REQ_FUA)) | ||
312 | rq->cmd_flags &= ~REQ_FUA; | ||
313 | |||
314 | /* | ||
315 | * If there's data but flush is not necessary, the request can be | ||
316 | * processed directly without going through flush machinery. Queue | ||
317 | * for normal execution. | ||
318 | */ | ||
319 | if ((policy & REQ_FSEQ_DATA) && | ||
320 | !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { | ||
321 | list_add_tail(&rq->queuelist, &q->queue_head); | ||
322 | return; | ||
323 | } | ||
324 | |||
325 | /* | ||
326 | * @rq should go through flush machinery. Mark it part of flush | ||
327 | * sequence and submit for further processing. | ||
328 | */ | ||
329 | memset(&rq->flush, 0, sizeof(rq->flush)); | ||
330 | INIT_LIST_HEAD(&rq->flush.list); | ||
331 | rq->cmd_flags |= REQ_FLUSH_SEQ; | ||
332 | rq->end_io = flush_data_end_io; | ||
333 | |||
334 | blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); | ||
335 | } | ||
336 | |||
337 | /** | ||
338 | * blk_abort_flushes - @q is being aborted, abort flush requests | ||
339 | * @q: request_queue being aborted | ||
340 | * | ||
341 | * To be called from elv_abort_queue(). @q is being aborted. Prepare all | ||
342 | * FLUSH/FUA requests for abortion. | ||
343 | * | ||
344 | * CONTEXT: | ||
345 | * spin_lock_irq(q->queue_lock) | ||
346 | */ | ||
347 | void blk_abort_flushes(struct request_queue *q) | ||
348 | { | ||
349 | struct request *rq, *n; | ||
350 | int i; | ||
351 | |||
352 | /* | ||
353 | * Requests in flight for data are already owned by the dispatch | ||
354 | * queue or the device driver. Just restore for normal completion. | ||
355 | */ | ||
356 | list_for_each_entry_safe(rq, n, &q->flush_data_in_flight, flush.list) { | ||
357 | list_del_init(&rq->flush.list); | ||
358 | blk_flush_restore_request(rq); | ||
359 | } | ||
360 | |||
361 | /* | ||
362 | * We need to give away requests on flush queues. Restore for | ||
363 | * normal completion and put them on the dispatch queue. | ||
364 | */ | ||
365 | for (i = 0; i < ARRAY_SIZE(q->flush_queue); i++) { | ||
366 | list_for_each_entry_safe(rq, n, &q->flush_queue[i], | ||
367 | flush.list) { | ||
368 | list_del_init(&rq->flush.list); | ||
369 | blk_flush_restore_request(rq); | ||
370 | list_add_tail(&rq->queuelist, &q->queue_head); | ||
371 | } | ||
372 | } | ||
373 | } | ||
374 | |||
375 | static void bio_end_flush(struct bio *bio, int err) | ||
376 | { | ||
377 | if (err) | ||
378 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | ||
379 | if (bio->bi_private) | ||
380 | complete(bio->bi_private); | ||
381 | bio_put(bio); | ||
382 | } | ||
383 | |||
384 | /** | ||
385 | * blkdev_issue_flush - queue a flush | ||
386 | * @bdev: blockdev to issue flush for | ||
387 | * @gfp_mask: memory allocation flags (for bio_alloc) | ||
388 | * @error_sector: error sector | ||
389 | * | ||
390 | * Description: | ||
391 | * Issue a flush for the block device in question. Caller can supply | ||
392 | * room for storing the error offset in case of a flush error, if they | ||
393 | * wish to. If WAIT flag is not passed then caller may check only what | ||
394 | * request was pushed in some internal queue for later handling. | ||
395 | */ | ||
396 | int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, | ||
397 | sector_t *error_sector) | ||
398 | { | ||
399 | DECLARE_COMPLETION_ONSTACK(wait); | ||
400 | struct request_queue *q; | ||
401 | struct bio *bio; | ||
402 | int ret = 0; | ||
403 | |||
404 | if (bdev->bd_disk == NULL) | ||
405 | return -ENXIO; | ||
406 | |||
407 | q = bdev_get_queue(bdev); | ||
408 | if (!q) | ||
409 | return -ENXIO; | ||
410 | |||
411 | /* | ||
412 | * some block devices may not have their queue correctly set up here | ||
413 | * (e.g. loop device without a backing file) and so issuing a flush | ||
414 | * here will panic. Ensure there is a request function before issuing | ||
415 | * the flush. | ||
416 | */ | ||
417 | if (!q->make_request_fn) | ||
418 | return -ENXIO; | ||
419 | |||
420 | bio = bio_alloc(gfp_mask, 0); | ||
421 | bio->bi_end_io = bio_end_flush; | ||
422 | bio->bi_bdev = bdev; | ||
423 | bio->bi_private = &wait; | ||
424 | |||
425 | bio_get(bio); | ||
426 | submit_bio(WRITE_FLUSH, bio); | ||
427 | wait_for_completion(&wait); | ||
428 | |||
429 | /* | ||
430 | * The driver must store the error location in ->bi_sector, if | ||
431 | * it supports it. For non-stacked drivers, this should be | ||
432 | * copied from blk_rq_pos(rq). | ||
433 | */ | ||
434 | if (error_sector) | ||
435 | *error_sector = bio->bi_sector; | ||
436 | |||
437 | if (!bio_flagged(bio, BIO_UPTODATE)) | ||
438 | ret = -EIO; | ||
439 | |||
440 | bio_put(bio); | ||
441 | return ret; | ||
442 | } | ||
443 | EXPORT_SYMBOL(blkdev_issue_flush); | ||
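A minimal caller-side sketch; the wrapper is illustrative, but filesystem fsync paths use essentially this call:

#include <linux/blkdev.h>

/*
 * Sketch: flush a device's volatile write cache. The error-sector
 * pointer is optional; pass NULL when the offset is not needed.
 */
static int sync_device_cache(struct block_device *bdev)
{
	return blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
}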
diff --git a/block/blk-integrity.c b/block/blk-integrity.c index edce1ef7933d..129b9e209a3b 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c | |||
@@ -30,26 +30,41 @@ | |||
30 | 30 | ||
31 | static struct kmem_cache *integrity_cachep; | 31 | static struct kmem_cache *integrity_cachep; |
32 | 32 | ||
33 | static const char *bi_unsupported_name = "unsupported"; | ||
34 | |||
33 | /** | 35 | /** |
34 | * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements | 36 | * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements |
35 | * @rq: request with integrity metadata attached | 37 | * @q: request queue |
38 | * @bio: bio with integrity metadata attached | ||
36 | * | 39 | * |
37 | * Description: Returns the number of elements required in a | 40 | * Description: Returns the number of elements required in a |
38 | * scatterlist corresponding to the integrity metadata in a request. | 41 | * scatterlist corresponding to the integrity metadata in a bio. |
39 | */ | 42 | */ |
40 | int blk_rq_count_integrity_sg(struct request *rq) | 43 | int blk_rq_count_integrity_sg(struct request_queue *q, struct bio *bio) |
41 | { | 44 | { |
42 | struct bio_vec *iv, *ivprv; | 45 | struct bio_vec *iv, *ivprv = NULL; |
43 | struct req_iterator iter; | 46 | unsigned int segments = 0; |
44 | unsigned int segments; | 47 | unsigned int seg_size = 0; |
48 | unsigned int i = 0; | ||
49 | |||
50 | bio_for_each_integrity_vec(iv, bio, i) { | ||
51 | |||
52 | if (ivprv) { | ||
53 | if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv)) | ||
54 | goto new_segment; | ||
45 | 55 | ||
46 | ivprv = NULL; | 56 | if (!BIOVEC_SEG_BOUNDARY(q, ivprv, iv)) |
47 | segments = 0; | 57 | goto new_segment; |
48 | 58 | ||
49 | rq_for_each_integrity_segment(iv, rq, iter) { | 59 | if (seg_size + iv->bv_len > queue_max_segment_size(q)) |
60 | goto new_segment; | ||
50 | 61 | ||
51 | if (!ivprv || !BIOVEC_PHYS_MERGEABLE(ivprv, iv)) | 62 | seg_size += iv->bv_len; |
63 | } else { | ||
64 | new_segment: | ||
52 | segments++; | 65 | segments++; |
66 | seg_size = iv->bv_len; | ||
67 | } | ||
53 | 68 | ||
54 | ivprv = iv; | 69 | ivprv = iv; |
55 | } | 70 | } |
@@ -60,30 +75,34 @@ EXPORT_SYMBOL(blk_rq_count_integrity_sg); | |||
60 | 75 | ||
61 | /** | 76 | /** |
62 | * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist | 77 | * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist |
63 | * @rq: request with integrity metadata attached | 78 | * @q: request queue |
79 | * @bio: bio with integrity metadata attached | ||
64 | * @sglist: target scatterlist | 80 | * @sglist: target scatterlist |
65 | * | 81 | * |
66 | * Description: Map the integrity vectors in request into a | 82 | * Description: Map the integrity vectors in request into a |
67 | * scatterlist. The scatterlist must be big enough to hold all | 83 | * scatterlist. The scatterlist must be big enough to hold all |
68 | * elements. I.e. sized using blk_rq_count_integrity_sg(). | 84 | * elements. I.e. sized using blk_rq_count_integrity_sg(). |
69 | */ | 85 | */ |
70 | int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist) | 86 | int blk_rq_map_integrity_sg(struct request_queue *q, struct bio *bio, |
87 | struct scatterlist *sglist) | ||
71 | { | 88 | { |
72 | struct bio_vec *iv, *ivprv; | 89 | struct bio_vec *iv, *ivprv = NULL; |
73 | struct req_iterator iter; | 90 | struct scatterlist *sg = NULL; |
74 | struct scatterlist *sg; | 91 | unsigned int segments = 0; |
75 | unsigned int segments; | 92 | unsigned int i = 0; |
76 | |||
77 | ivprv = NULL; | ||
78 | sg = NULL; | ||
79 | segments = 0; | ||
80 | 93 | ||
81 | rq_for_each_integrity_segment(iv, rq, iter) { | 94 | bio_for_each_integrity_vec(iv, bio, i) { |
82 | 95 | ||
83 | if (ivprv) { | 96 | if (ivprv) { |
84 | if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv)) | 97 | if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv)) |
85 | goto new_segment; | 98 | goto new_segment; |
86 | 99 | ||
100 | if (!BIOVEC_SEG_BOUNDARY(q, ivprv, iv)) | ||
101 | goto new_segment; | ||
102 | |||
103 | if (sg->length + iv->bv_len > queue_max_segment_size(q)) | ||
104 | goto new_segment; | ||
105 | |||
87 | sg->length += iv->bv_len; | 106 | sg->length += iv->bv_len; |
88 | } else { | 107 | } else { |
89 | new_segment: | 108 | new_segment: |
@@ -162,6 +181,40 @@ int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2) | |||
162 | } | 181 | } |
163 | EXPORT_SYMBOL(blk_integrity_compare); | 182 | EXPORT_SYMBOL(blk_integrity_compare); |
164 | 183 | ||
184 | int blk_integrity_merge_rq(struct request_queue *q, struct request *req, | ||
185 | struct request *next) | ||
186 | { | ||
187 | if (blk_integrity_rq(req) != blk_integrity_rq(next)) | ||
188 | return -1; | ||
189 | |||
190 | if (req->nr_integrity_segments + next->nr_integrity_segments > | ||
191 | q->limits.max_integrity_segments) | ||
192 | return -1; | ||
193 | |||
194 | return 0; | ||
195 | } | ||
196 | EXPORT_SYMBOL(blk_integrity_merge_rq); | ||
197 | |||
198 | int blk_integrity_merge_bio(struct request_queue *q, struct request *req, | ||
199 | struct bio *bio) | ||
200 | { | ||
201 | int nr_integrity_segs; | ||
202 | struct bio *next = bio->bi_next; | ||
203 | |||
204 | bio->bi_next = NULL; | ||
205 | nr_integrity_segs = blk_rq_count_integrity_sg(q, bio); | ||
206 | bio->bi_next = next; | ||
207 | |||
208 | if (req->nr_integrity_segments + nr_integrity_segs > | ||
209 | q->limits.max_integrity_segments) | ||
210 | return -1; | ||
211 | |||
212 | req->nr_integrity_segments += nr_integrity_segs; | ||
213 | |||
214 | return 0; | ||
215 | } | ||
216 | EXPORT_SYMBOL(blk_integrity_merge_bio); | ||
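A hedged sketch of how a driver might use the new bio-based helpers when building a protection scatterlist; the wrapper and its error handling are illustrative, and the sg table is assumed to be preallocated by the caller.

#include <linux/blkdev.h>
#include <linux/scatterlist.h>

/*
 * Sketch: size and map the integrity metadata of one bio. Real drivers
 * check blk_integrity_rq() on the request first and allocate the sg
 * table from the segment count.
 */
static int map_bio_integrity(struct request_queue *q, struct bio *bio,
			     struct scatterlist *sgl)
{
	int nsegs = blk_rq_count_integrity_sg(q, bio);

	if (nsegs > q->limits.max_integrity_segments)
		return -EINVAL;

	return blk_rq_map_integrity_sg(q, bio, sgl);
}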
217 | |||
165 | struct integrity_sysfs_entry { | 218 | struct integrity_sysfs_entry { |
166 | struct attribute attr; | 219 | struct attribute attr; |
167 | ssize_t (*show)(struct blk_integrity *, char *); | 220 | ssize_t (*show)(struct blk_integrity *, char *); |
@@ -307,6 +360,14 @@ static struct kobj_type integrity_ktype = { | |||
307 | .release = blk_integrity_release, | 360 | .release = blk_integrity_release, |
308 | }; | 361 | }; |
309 | 362 | ||
363 | bool blk_integrity_is_initialized(struct gendisk *disk) | ||
364 | { | ||
365 | struct blk_integrity *bi = blk_get_integrity(disk); | ||
366 | |||
367 | return (bi && bi->name && strcmp(bi->name, bi_unsupported_name) != 0); | ||
368 | } | ||
369 | EXPORT_SYMBOL(blk_integrity_is_initialized); | ||
370 | |||
310 | /** | 371 | /** |
311 | * blk_integrity_register - Register a gendisk as being integrity-capable | 372 | * blk_integrity_register - Register a gendisk as being integrity-capable |
312 | * @disk: struct gendisk pointer to make integrity-aware | 373 | * @disk: struct gendisk pointer to make integrity-aware |
@@ -356,7 +417,7 @@ int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) | |||
356 | bi->get_tag_fn = template->get_tag_fn; | 417 | bi->get_tag_fn = template->get_tag_fn; |
357 | bi->tag_size = template->tag_size; | 418 | bi->tag_size = template->tag_size; |
358 | } else | 419 | } else |
359 | bi->name = "unsupported"; | 420 | bi->name = bi_unsupported_name; |
360 | 421 | ||
361 | return 0; | 422 | return 0; |
362 | } | 423 | } |
@@ -381,7 +442,6 @@ void blk_integrity_unregister(struct gendisk *disk) | |||
381 | kobject_uevent(&bi->kobj, KOBJ_REMOVE); | 442 | kobject_uevent(&bi->kobj, KOBJ_REMOVE); |
382 | kobject_del(&bi->kobj); | 443 | kobject_del(&bi->kobj); |
383 | kobject_put(&bi->kobj); | 444 | kobject_put(&bi->kobj); |
384 | kmem_cache_free(integrity_cachep, bi); | ||
385 | disk->integrity = NULL; | 445 | disk->integrity = NULL; |
386 | } | 446 | } |
387 | EXPORT_SYMBOL(blk_integrity_unregister); | 447 | EXPORT_SYMBOL(blk_integrity_unregister); |
diff --git a/block/blk-ioc.c b/block/blk-ioc.c index d22c4c55c406..342eae9b0d3c 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c | |||
@@ -21,7 +21,7 @@ static void cfq_dtor(struct io_context *ioc) | |||
21 | if (!hlist_empty(&ioc->cic_list)) { | 21 | if (!hlist_empty(&ioc->cic_list)) { |
22 | struct cfq_io_context *cic; | 22 | struct cfq_io_context *cic; |
23 | 23 | ||
24 | cic = list_entry(ioc->cic_list.first, struct cfq_io_context, | 24 | cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context, |
25 | cic_list); | 25 | cic_list); |
26 | cic->dtor(ioc); | 26 | cic->dtor(ioc); |
27 | } | 27 | } |
@@ -57,14 +57,14 @@ static void cfq_exit(struct io_context *ioc) | |||
57 | if (!hlist_empty(&ioc->cic_list)) { | 57 | if (!hlist_empty(&ioc->cic_list)) { |
58 | struct cfq_io_context *cic; | 58 | struct cfq_io_context *cic; |
59 | 59 | ||
60 | cic = list_entry(ioc->cic_list.first, struct cfq_io_context, | 60 | cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context, |
61 | cic_list); | 61 | cic_list); |
62 | cic->exit(ioc); | 62 | cic->exit(ioc); |
63 | } | 63 | } |
64 | rcu_read_unlock(); | 64 | rcu_read_unlock(); |
65 | } | 65 | } |
66 | 66 | ||
67 | /* Called by the exitting task */ | 67 | /* Called by the exiting task */ |
68 | void exit_io_context(struct task_struct *task) | 68 | void exit_io_context(struct task_struct *task) |
69 | { | 69 | { |
70 | struct io_context *ioc; | 70 | struct io_context *ioc; |
@@ -74,10 +74,9 @@ void exit_io_context(struct task_struct *task) | |||
74 | task->io_context = NULL; | 74 | task->io_context = NULL; |
75 | task_unlock(task); | 75 | task_unlock(task); |
76 | 76 | ||
77 | if (atomic_dec_and_test(&ioc->nr_tasks)) { | 77 | if (atomic_dec_and_test(&ioc->nr_tasks)) |
78 | cfq_exit(ioc); | 78 | cfq_exit(ioc); |
79 | 79 | ||
80 | } | ||
81 | put_io_context(ioc); | 80 | put_io_context(ioc); |
82 | } | 81 | } |
83 | 82 | ||
@@ -97,6 +96,9 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node) | |||
97 | INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); | 96 | INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); |
98 | INIT_HLIST_HEAD(&ret->cic_list); | 97 | INIT_HLIST_HEAD(&ret->cic_list); |
99 | ret->ioc_data = NULL; | 98 | ret->ioc_data = NULL; |
99 | #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) | ||
100 | ret->cgroup_changed = 0; | ||
101 | #endif | ||
100 | } | 102 | } |
101 | 103 | ||
102 | return ret; | 104 | return ret; |
@@ -153,20 +155,6 @@ struct io_context *get_io_context(gfp_t gfp_flags, int node) | |||
153 | } | 155 | } |
154 | EXPORT_SYMBOL(get_io_context); | 156 | EXPORT_SYMBOL(get_io_context); |
155 | 157 | ||
156 | void copy_io_context(struct io_context **pdst, struct io_context **psrc) | ||
157 | { | ||
158 | struct io_context *src = *psrc; | ||
159 | struct io_context *dst = *pdst; | ||
160 | |||
161 | if (src) { | ||
162 | BUG_ON(atomic_long_read(&src->refcount) == 0); | ||
163 | atomic_long_inc(&src->refcount); | ||
164 | put_io_context(dst); | ||
165 | *pdst = src; | ||
166 | } | ||
167 | } | ||
168 | EXPORT_SYMBOL(copy_io_context); | ||
169 | |||
170 | static int __init blk_ioc_init(void) | 158 | static int __init blk_ioc_init(void) |
171 | { | 159 | { |
172 | iocontext_cachep = kmem_cache_create("blkdev_ioc", | 160 | iocontext_cachep = kmem_cache_create("blkdev_ioc", |
diff --git a/block/blk-lib.c b/block/blk-lib.c index c392029a104e..78e627e2581d 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c | |||
@@ -9,17 +9,20 @@ | |||
9 | 9 | ||
10 | #include "blk.h" | 10 | #include "blk.h" |
11 | 11 | ||
12 | static void blkdev_discard_end_io(struct bio *bio, int err) | 12 | struct bio_batch { |
13 | { | 13 | atomic_t done; |
14 | if (err) { | 14 | unsigned long flags; |
15 | if (err == -EOPNOTSUPP) | 15 | struct completion *wait; |
16 | set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); | 16 | }; |
17 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | ||
18 | } | ||
19 | 17 | ||
20 | if (bio->bi_private) | 18 | static void bio_batch_end_io(struct bio *bio, int err) |
21 | complete(bio->bi_private); | 19 | { |
20 | struct bio_batch *bb = bio->bi_private; | ||
22 | 21 | ||
22 | if (err && (err != -EOPNOTSUPP)) | ||
23 | clear_bit(BIO_UPTODATE, &bb->flags); | ||
24 | if (atomic_dec_and_test(&bb->done)) | ||
25 | complete(bb->wait); | ||
23 | bio_put(bio); | 26 | bio_put(bio); |
24 | } | 27 | } |
25 | 28 | ||
@@ -39,9 +42,9 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, | |||
39 | { | 42 | { |
40 | DECLARE_COMPLETION_ONSTACK(wait); | 43 | DECLARE_COMPLETION_ONSTACK(wait); |
41 | struct request_queue *q = bdev_get_queue(bdev); | 44 | struct request_queue *q = bdev_get_queue(bdev); |
42 | int type = flags & BLKDEV_IFL_BARRIER ? | 45 | int type = REQ_WRITE | REQ_DISCARD; |
43 | DISCARD_BARRIER : DISCARD_NOBARRIER; | ||
44 | unsigned int max_discard_sectors; | 46 | unsigned int max_discard_sectors; |
47 | struct bio_batch bb; | ||
45 | struct bio *bio; | 48 | struct bio *bio; |
46 | int ret = 0; | 49 | int ret = 0; |
47 | 50 | ||
@@ -62,13 +65,17 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, | |||
62 | max_discard_sectors &= ~(disc_sects - 1); | 65 | max_discard_sectors &= ~(disc_sects - 1); |
63 | } | 66 | } |
64 | 67 | ||
65 | if (flags & BLKDEV_IFL_SECURE) { | 68 | if (flags & BLKDEV_DISCARD_SECURE) { |
66 | if (!blk_queue_secdiscard(q)) | 69 | if (!blk_queue_secdiscard(q)) |
67 | return -EOPNOTSUPP; | 70 | return -EOPNOTSUPP; |
68 | type |= DISCARD_SECURE; | 71 | type |= REQ_SECURE; |
69 | } | 72 | } |
70 | 73 | ||
71 | while (nr_sects && !ret) { | 74 | atomic_set(&bb.done, 1); |
75 | bb.flags = 1 << BIO_UPTODATE; | ||
76 | bb.wait = &wait; | ||
77 | |||
78 | while (nr_sects) { | ||
72 | bio = bio_alloc(gfp_mask, 1); | 79 | bio = bio_alloc(gfp_mask, 1); |
73 | if (!bio) { | 80 | if (!bio) { |
74 | ret = -ENOMEM; | 81 | ret = -ENOMEM; |
@@ -76,10 +83,9 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, | |||
76 | } | 83 | } |
77 | 84 | ||
78 | bio->bi_sector = sector; | 85 | bio->bi_sector = sector; |
79 | bio->bi_end_io = blkdev_discard_end_io; | 86 | bio->bi_end_io = bio_batch_end_io; |
80 | bio->bi_bdev = bdev; | 87 | bio->bi_bdev = bdev; |
81 | if (flags & BLKDEV_IFL_WAIT) | 88 | bio->bi_private = &bb; |
82 | bio->bi_private = &wait; | ||
83 | 89 | ||
84 | if (nr_sects > max_discard_sectors) { | 90 | if (nr_sects > max_discard_sectors) { |
85 | bio->bi_size = max_discard_sectors << 9; | 91 | bio->bi_size = max_discard_sectors << 9; |
@@ -90,85 +96,45 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, | |||
90 | nr_sects = 0; | 96 | nr_sects = 0; |
91 | } | 97 | } |
92 | 98 | ||
93 | bio_get(bio); | 99 | atomic_inc(&bb.done); |
94 | submit_bio(type, bio); | 100 | submit_bio(type, bio); |
101 | } | ||
95 | 102 | ||
96 | if (flags & BLKDEV_IFL_WAIT) | 103 | /* Wait for bios in-flight */ |
97 | wait_for_completion(&wait); | 104 | if (!atomic_dec_and_test(&bb.done)) |
105 | wait_for_completion(&wait); | ||
98 | 106 | ||
99 | if (bio_flagged(bio, BIO_EOPNOTSUPP)) | 107 | if (!test_bit(BIO_UPTODATE, &bb.flags)) |
100 | ret = -EOPNOTSUPP; | 108 | ret = -EIO; |
101 | else if (!bio_flagged(bio, BIO_UPTODATE)) | ||
102 | ret = -EIO; | ||
103 | bio_put(bio); | ||
104 | } | ||
105 | 109 | ||
106 | return ret; | 110 | return ret; |
107 | } | 111 | } |
108 | EXPORT_SYMBOL(blkdev_issue_discard); | 112 | EXPORT_SYMBOL(blkdev_issue_discard); |
109 | 113 | ||
110 | struct bio_batch | ||
111 | { | ||
112 | atomic_t done; | ||
113 | unsigned long flags; | ||
114 | struct completion *wait; | ||
115 | bio_end_io_t *end_io; | ||
116 | }; | ||
117 | |||
118 | static void bio_batch_end_io(struct bio *bio, int err) | ||
119 | { | ||
120 | struct bio_batch *bb = bio->bi_private; | ||
121 | |||
122 | if (err) { | ||
123 | if (err == -EOPNOTSUPP) | ||
124 | set_bit(BIO_EOPNOTSUPP, &bb->flags); | ||
125 | else | ||
126 | clear_bit(BIO_UPTODATE, &bb->flags); | ||
127 | } | ||
128 | if (bb) { | ||
129 | if (bb->end_io) | ||
130 | bb->end_io(bio, err); | ||
131 | atomic_inc(&bb->done); | ||
132 | complete(bb->wait); | ||
133 | } | ||
134 | bio_put(bio); | ||
135 | } | ||
136 | |||
137 | /** | 114 | /** |
138 | * blkdev_issue_zeroout generate number of zero-filled write bios | 115 | * blkdev_issue_zeroout - generate number of zero-filled write bios |
139 | * @bdev: blockdev to issue | 116 | * @bdev: blockdev to issue |
140 | * @sector: start sector | 117 | * @sector: start sector |
141 | * @nr_sects: number of sectors to write | 118 | * @nr_sects: number of sectors to write |
142 | * @gfp_mask: memory allocation flags (for bio_alloc) | 119 | * @gfp_mask: memory allocation flags (for bio_alloc) |
143 | * @flags: BLKDEV_IFL_* flags to control behaviour | ||
144 | * | 120 | * |
145 | * Description: | 121 | * Description: |
146 | * Generate and issue a number of bios with zero-filled pages. | 122 | * Generate and issue a number of bios with zero-filled pages. |
147 | * Send barrier at the beginning and at the end if requested. This guarantie | ||
148 | * correct request ordering. Empty barrier allow us to avoid post queue flush. | ||
149 | */ | 123 | */ |
150 | 124 | ||
151 | int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, | 125 | int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, |
152 | sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) | 126 | sector_t nr_sects, gfp_t gfp_mask) |
153 | { | 127 | { |
154 | int ret; | 128 | int ret; |
155 | struct bio *bio; | 129 | struct bio *bio; |
156 | struct bio_batch bb; | 130 | struct bio_batch bb; |
157 | unsigned int sz, issued = 0; | 131 | unsigned int sz; |
158 | DECLARE_COMPLETION_ONSTACK(wait); | 132 | DECLARE_COMPLETION_ONSTACK(wait); |
159 | 133 | ||
160 | atomic_set(&bb.done, 0); | 134 | atomic_set(&bb.done, 1); |
161 | bb.flags = 1 << BIO_UPTODATE; | 135 | bb.flags = 1 << BIO_UPTODATE; |
162 | bb.wait = &wait; | 136 | bb.wait = &wait; |
163 | bb.end_io = NULL; | ||
164 | 137 | ||
165 | if (flags & BLKDEV_IFL_BARRIER) { | ||
166 | /* issue async barrier before the data */ | ||
167 | ret = blkdev_issue_flush(bdev, gfp_mask, NULL, 0); | ||
168 | if (ret) | ||
169 | return ret; | ||
170 | } | ||
171 | submit: | ||
172 | ret = 0; | 138 | ret = 0; |
173 | while (nr_sects != 0) { | 139 | while (nr_sects != 0) { |
174 | bio = bio_alloc(gfp_mask, | 140 | bio = bio_alloc(gfp_mask, |
@@ -181,14 +147,10 @@ submit: | |||
181 | bio->bi_sector = sector; | 147 | bio->bi_sector = sector; |
182 | bio->bi_bdev = bdev; | 148 | bio->bi_bdev = bdev; |
183 | bio->bi_end_io = bio_batch_end_io; | 149 | bio->bi_end_io = bio_batch_end_io; |
184 | if (flags & BLKDEV_IFL_WAIT) | 150 | bio->bi_private = &bb; |
185 | bio->bi_private = &bb; | ||
186 | 151 | ||
187 | while (nr_sects != 0) { | 152 | while (nr_sects != 0) { |
188 | sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects); | 153 | sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects); |
189 | if (sz == 0) | ||
190 | /* bio has maximum size possible */ | ||
191 | break; | ||
192 | ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0); | 154 | ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0); |
193 | nr_sects -= ret >> 9; | 155 | nr_sects -= ret >> 9; |
194 | sector += ret >> 9; | 156 | sector += ret >> 9; |
@@ -196,36 +158,18 @@ submit: | |||
196 | break; | 158 | break; |
197 | } | 159 | } |
198 | ret = 0; | 160 | ret = 0; |
199 | issued++; | 161 | atomic_inc(&bb.done); |
200 | submit_bio(WRITE, bio); | 162 | submit_bio(WRITE, bio); |
201 | } | 163 | } |
202 | /* | ||
203 | * When all data bios are in flight. Send final barrier if requeted. | ||
204 | */ | ||
205 | if (nr_sects == 0 && flags & BLKDEV_IFL_BARRIER) | ||
206 | ret = blkdev_issue_flush(bdev, gfp_mask, NULL, | ||
207 | flags & BLKDEV_IFL_WAIT); | ||
208 | |||
209 | 164 | ||
210 | if (flags & BLKDEV_IFL_WAIT) | 165 | /* Wait for bios in-flight */ |
211 | /* Wait for bios in-flight */ | 166 | if (!atomic_dec_and_test(&bb.done)) |
212 | while ( issued != atomic_read(&bb.done)) | 167 | wait_for_completion(&wait); |
213 | wait_for_completion(&wait); | ||
214 | 168 | ||
215 | if (!test_bit(BIO_UPTODATE, &bb.flags)) | 169 | if (!test_bit(BIO_UPTODATE, &bb.flags)) |
216 | /* One of bios in the batch was completed with error.*/ | 170 | /* One of bios in the batch was completed with error.*/ |
217 | ret = -EIO; | 171 | ret = -EIO; |
218 | 172 | ||
219 | if (ret) | ||
220 | goto out; | ||
221 | |||
222 | if (test_bit(BIO_EOPNOTSUPP, &bb.flags)) { | ||
223 | ret = -EOPNOTSUPP; | ||
224 | goto out; | ||
225 | } | ||
226 | if (nr_sects != 0) | ||
227 | goto submit; | ||
228 | out: | ||
229 | return ret; | 173 | return ret; |
230 | } | 174 | } |
231 | EXPORT_SYMBOL(blkdev_issue_zeroout); | 175 | EXPORT_SYMBOL(blkdev_issue_zeroout); |
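
The bio_batch pattern introduced above replaces the old per-bio wait logic: bb.done starts at 1 (the submitter's own reference), every submitted bio adds one, and the submitter only sleeps if its final atomic_dec_and_test() does not drop the count to zero, so the completion cannot fire before the last bio ends. A minimal sketch of the same counting scheme, assuming the bio_batch struct and bio_batch_end_io() from blk-lib.c were visible to the caller and that the bios are already built (the helper name is invented purely for illustration):

	/* Submit a set of prepared bios and wait for all of them. */
	static int submit_batch_and_wait(struct bio **bios, int nr, int rw)
	{
		DECLARE_COMPLETION_ONSTACK(wait);
		struct bio_batch bb;
		int i;

		atomic_set(&bb.done, 1);		/* submitter's own reference */
		bb.flags = 1 << BIO_UPTODATE;
		bb.wait = &wait;

		for (i = 0; i < nr; i++) {
			bios[i]->bi_end_io = bio_batch_end_io;
			bios[i]->bi_private = &bb;
			atomic_inc(&bb.done);		/* one reference per in-flight bio */
			submit_bio(rw, bios[i]);
		}

		/* Drop our reference; sleep only if bios are still in flight. */
		if (!atomic_dec_and_test(&bb.done))
			wait_for_completion(&wait);

		return test_bit(BIO_UPTODATE, &bb.flags) ? 0 : -EIO;
	}
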
diff --git a/block/blk-map.c b/block/blk-map.c index ade0a08c9099..e663ac2d8e68 100644 --- a/block/blk-map.c +++ b/block/blk-map.c | |||
@@ -54,7 +54,7 @@ static int __blk_rq_map_user(struct request_queue *q, struct request *rq, | |||
54 | * direct dma. else, set up kernel bounce buffers | 54 | * direct dma. else, set up kernel bounce buffers |
55 | */ | 55 | */ |
56 | uaddr = (unsigned long) ubuf; | 56 | uaddr = (unsigned long) ubuf; |
57 | if (blk_rq_aligned(q, ubuf, len) && !map_data) | 57 | if (blk_rq_aligned(q, uaddr, len) && !map_data) |
58 | bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask); | 58 | bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask); |
59 | else | 59 | else |
60 | bio = bio_copy_user(q, map_data, uaddr, len, reading, gfp_mask); | 60 | bio = bio_copy_user(q, map_data, uaddr, len, reading, gfp_mask); |
@@ -201,6 +201,9 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, | |||
201 | for (i = 0; i < iov_count; i++) { | 201 | for (i = 0; i < iov_count; i++) { |
202 | unsigned long uaddr = (unsigned long)iov[i].iov_base; | 202 | unsigned long uaddr = (unsigned long)iov[i].iov_base; |
203 | 203 | ||
204 | if (!iov[i].iov_len) | ||
205 | return -EINVAL; | ||
206 | |||
204 | if (uaddr & queue_dma_alignment(q)) { | 207 | if (uaddr & queue_dma_alignment(q)) { |
205 | unaligned = 1; | 208 | unaligned = 1; |
206 | break; | 209 | break; |
@@ -288,6 +291,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, | |||
288 | unsigned int len, gfp_t gfp_mask) | 291 | unsigned int len, gfp_t gfp_mask) |
289 | { | 292 | { |
290 | int reading = rq_data_dir(rq) == READ; | 293 | int reading = rq_data_dir(rq) == READ; |
294 | unsigned long addr = (unsigned long) kbuf; | ||
291 | int do_copy = 0; | 295 | int do_copy = 0; |
292 | struct bio *bio; | 296 | struct bio *bio; |
293 | int ret; | 297 | int ret; |
@@ -297,7 +301,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, | |||
297 | if (!len || !kbuf) | 301 | if (!len || !kbuf) |
298 | return -EINVAL; | 302 | return -EINVAL; |
299 | 303 | ||
300 | do_copy = !blk_rq_aligned(q, kbuf, len) || object_is_on_stack(kbuf); | 304 | do_copy = !blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf); |
301 | if (do_copy) | 305 | if (do_copy) |
302 | bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading); | 306 | bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading); |
303 | else | 307 | else |
diff --git a/block/blk-merge.c b/block/blk-merge.c index eafc94f68d79..cfcc37cb222b 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c | |||
@@ -21,7 +21,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, | |||
21 | return 0; | 21 | return 0; |
22 | 22 | ||
23 | fbio = bio; | 23 | fbio = bio; |
24 | cluster = test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); | 24 | cluster = blk_queue_cluster(q); |
25 | seg_size = 0; | 25 | seg_size = 0; |
26 | nr_phys_segs = 0; | 26 | nr_phys_segs = 0; |
27 | for_each_bio(bio) { | 27 | for_each_bio(bio) { |
@@ -87,7 +87,7 @@ EXPORT_SYMBOL(blk_recount_segments); | |||
87 | static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, | 87 | static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, |
88 | struct bio *nxt) | 88 | struct bio *nxt) |
89 | { | 89 | { |
90 | if (!test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags)) | 90 | if (!blk_queue_cluster(q)) |
91 | return 0; | 91 | return 0; |
92 | 92 | ||
93 | if (bio->bi_seg_back_size + nxt->bi_seg_front_size > | 93 | if (bio->bi_seg_back_size + nxt->bi_seg_front_size > |
@@ -123,7 +123,7 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, | |||
123 | int nsegs, cluster; | 123 | int nsegs, cluster; |
124 | 124 | ||
125 | nsegs = 0; | 125 | nsegs = 0; |
126 | cluster = test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); | 126 | cluster = blk_queue_cluster(q); |
127 | 127 | ||
128 | /* | 128 | /* |
129 | * for each bio in rq | 129 | * for each bio in rq |
@@ -205,12 +205,11 @@ static inline int ll_new_hw_segment(struct request_queue *q, | |||
205 | { | 205 | { |
206 | int nr_phys_segs = bio_phys_segments(q, bio); | 206 | int nr_phys_segs = bio_phys_segments(q, bio); |
207 | 207 | ||
208 | if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) { | 208 | if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) |
209 | req->cmd_flags |= REQ_NOMERGE; | 209 | goto no_merge; |
210 | if (req == q->last_merge) | 210 | |
211 | q->last_merge = NULL; | 211 | if (bio_integrity(bio) && blk_integrity_merge_bio(q, req, bio)) |
212 | return 0; | 212 | goto no_merge; |
213 | } | ||
214 | 213 | ||
215 | /* | 214 | /* |
216 | * This will form the start of a new hw segment. Bump both | 215 | * This will form the start of a new hw segment. Bump both |
@@ -218,6 +217,12 @@ static inline int ll_new_hw_segment(struct request_queue *q, | |||
218 | */ | 217 | */ |
219 | req->nr_phys_segments += nr_phys_segs; | 218 | req->nr_phys_segments += nr_phys_segs; |
220 | return 1; | 219 | return 1; |
220 | |||
221 | no_merge: | ||
222 | req->cmd_flags |= REQ_NOMERGE; | ||
223 | if (req == q->last_merge) | ||
224 | q->last_merge = NULL; | ||
225 | return 0; | ||
221 | } | 226 | } |
222 | 227 | ||
223 | int ll_back_merge_fn(struct request_queue *q, struct request *req, | 228 | int ll_back_merge_fn(struct request_queue *q, struct request *req, |
@@ -301,6 +306,9 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, | |||
301 | if (total_phys_segments > queue_max_segments(q)) | 306 | if (total_phys_segments > queue_max_segments(q)) |
302 | return 0; | 307 | return 0; |
303 | 308 | ||
309 | if (blk_integrity_rq(req) && blk_integrity_merge_rq(q, req, next)) | ||
310 | return 0; | ||
311 | |||
304 | /* Merge is OK... */ | 312 | /* Merge is OK... */ |
305 | req->nr_phys_segments = total_phys_segments; | 313 | req->nr_phys_segments = total_phys_segments; |
306 | return 1; | 314 | return 1; |
@@ -343,11 +351,12 @@ static void blk_account_io_merge(struct request *req) | |||
343 | int cpu; | 351 | int cpu; |
344 | 352 | ||
345 | cpu = part_stat_lock(); | 353 | cpu = part_stat_lock(); |
346 | part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); | 354 | part = req->part; |
347 | 355 | ||
348 | part_round_stats(cpu, part); | 356 | part_round_stats(cpu, part); |
349 | part_dec_in_flight(part, rq_data_dir(req)); | 357 | part_dec_in_flight(part, rq_data_dir(req)); |
350 | 358 | ||
359 | hd_struct_put(part); | ||
351 | part_stat_unlock(); | 360 | part_stat_unlock(); |
352 | } | 361 | } |
353 | } | 362 | } |
@@ -384,9 +393,6 @@ static int attempt_merge(struct request_queue *q, struct request *req, | |||
384 | || next->special) | 393 | || next->special) |
385 | return 0; | 394 | return 0; |
386 | 395 | ||
387 | if (blk_integrity_rq(req) != blk_integrity_rq(next)) | ||
388 | return 0; | ||
389 | |||
390 | /* | 396 | /* |
391 | * If we are allowed to merge, then append bio list | 397 | * If we are allowed to merge, then append bio list |
392 | * from next to rq and release next. merge_requests_fn | 398 | * from next to rq and release next. merge_requests_fn |
@@ -459,3 +465,9 @@ int attempt_front_merge(struct request_queue *q, struct request *rq) | |||
459 | 465 | ||
460 | return 0; | 466 | return 0; |
461 | } | 467 | } |
468 | |||
469 | int blk_attempt_req_merge(struct request_queue *q, struct request *rq, | ||
470 | struct request *next) | ||
471 | { | ||
472 | return attempt_merge(q, rq, next); | ||
473 | } | ||
diff --git a/block/blk-settings.c b/block/blk-settings.c index a234f4bf1d6f..fa1eb0449a05 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c | |||
@@ -111,6 +111,7 @@ EXPORT_SYMBOL_GPL(blk_queue_lld_busy); | |||
111 | void blk_set_default_limits(struct queue_limits *lim) | 111 | void blk_set_default_limits(struct queue_limits *lim) |
112 | { | 112 | { |
113 | lim->max_segments = BLK_MAX_SEGMENTS; | 113 | lim->max_segments = BLK_MAX_SEGMENTS; |
114 | lim->max_integrity_segments = 0; | ||
114 | lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; | 115 | lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; |
115 | lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; | 116 | lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; |
116 | lim->max_sectors = BLK_DEF_MAX_SECTORS; | 117 | lim->max_sectors = BLK_DEF_MAX_SECTORS; |
@@ -119,13 +120,13 @@ void blk_set_default_limits(struct queue_limits *lim) | |||
119 | lim->discard_granularity = 0; | 120 | lim->discard_granularity = 0; |
120 | lim->discard_alignment = 0; | 121 | lim->discard_alignment = 0; |
121 | lim->discard_misaligned = 0; | 122 | lim->discard_misaligned = 0; |
122 | lim->discard_zeroes_data = -1; | 123 | lim->discard_zeroes_data = 1; |
123 | lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; | 124 | lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; |
124 | lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); | 125 | lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); |
125 | lim->alignment_offset = 0; | 126 | lim->alignment_offset = 0; |
126 | lim->io_opt = 0; | 127 | lim->io_opt = 0; |
127 | lim->misaligned = 0; | 128 | lim->misaligned = 0; |
128 | lim->no_cluster = 0; | 129 | lim->cluster = 1; |
129 | } | 130 | } |
130 | EXPORT_SYMBOL(blk_set_default_limits); | 131 | EXPORT_SYMBOL(blk_set_default_limits); |
131 | 132 | ||
@@ -163,23 +164,9 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) | |||
163 | blk_queue_congestion_threshold(q); | 164 | blk_queue_congestion_threshold(q); |
164 | q->nr_batching = BLK_BATCH_REQ; | 165 | q->nr_batching = BLK_BATCH_REQ; |
165 | 166 | ||
166 | q->unplug_thresh = 4; /* hmm */ | ||
167 | q->unplug_delay = msecs_to_jiffies(3); /* 3 milliseconds */ | ||
168 | if (q->unplug_delay == 0) | ||
169 | q->unplug_delay = 1; | ||
170 | |||
171 | q->unplug_timer.function = blk_unplug_timeout; | ||
172 | q->unplug_timer.data = (unsigned long)q; | ||
173 | |||
174 | blk_set_default_limits(&q->limits); | 167 | blk_set_default_limits(&q->limits); |
175 | blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS); | 168 | blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS); |
176 | 169 | q->limits.discard_zeroes_data = 0; | |
177 | /* | ||
178 | * If the caller didn't supply a lock, fall back to our embedded | ||
179 | * per-queue locks | ||
180 | */ | ||
181 | if (!q->queue_lock) | ||
182 | q->queue_lock = &q->__queue_lock; | ||
183 | 170 | ||
184 | /* | 171 | /* |
185 | * by default assume old behaviour and bounce for any highmem page | 172 | * by default assume old behaviour and bounce for any highmem page |
@@ -213,7 +200,7 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask) | |||
213 | */ | 200 | */ |
214 | if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) | 201 | if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) |
215 | dma = 1; | 202 | dma = 1; |
216 | q->limits.bounce_pfn = max_low_pfn; | 203 | q->limits.bounce_pfn = max(max_low_pfn, b_pfn); |
217 | #else | 204 | #else |
218 | if (b_pfn < blk_max_low_pfn) | 205 | if (b_pfn < blk_max_low_pfn) |
219 | dma = 1; | 206 | dma = 1; |
@@ -228,8 +215,8 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask) | |||
228 | EXPORT_SYMBOL(blk_queue_bounce_limit); | 215 | EXPORT_SYMBOL(blk_queue_bounce_limit); |
229 | 216 | ||
230 | /** | 217 | /** |
231 | * blk_queue_max_hw_sectors - set max sectors for a request for this queue | 218 | * blk_limits_max_hw_sectors - set hard and soft limit of max sectors for request |
232 | * @q: the request queue for the device | 219 | * @limits: the queue limits |
233 | * @max_hw_sectors: max hardware sectors in the usual 512b unit | 220 | * @max_hw_sectors: max hardware sectors in the usual 512b unit |
234 | * | 221 | * |
235 | * Description: | 222 | * Description: |
@@ -243,7 +230,7 @@ EXPORT_SYMBOL(blk_queue_bounce_limit); | |||
243 | * per-device basis in /sys/block/<device>/queue/max_sectors_kb. | 230 | * per-device basis in /sys/block/<device>/queue/max_sectors_kb. |
244 | * The soft limit can not exceed max_hw_sectors. | 231 | * The soft limit can not exceed max_hw_sectors. |
245 | **/ | 232 | **/ |
246 | void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) | 233 | void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_sectors) |
247 | { | 234 | { |
248 | if ((max_hw_sectors << 9) < PAGE_CACHE_SIZE) { | 235 | if ((max_hw_sectors << 9) < PAGE_CACHE_SIZE) { |
249 | max_hw_sectors = 1 << (PAGE_CACHE_SHIFT - 9); | 236 | max_hw_sectors = 1 << (PAGE_CACHE_SHIFT - 9); |
@@ -251,9 +238,23 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto | |||
251 | __func__, max_hw_sectors); | 238 | __func__, max_hw_sectors); |
252 | } | 239 | } |
253 | 240 | ||
254 | q->limits.max_hw_sectors = max_hw_sectors; | 241 | limits->max_hw_sectors = max_hw_sectors; |
255 | q->limits.max_sectors = min_t(unsigned int, max_hw_sectors, | 242 | limits->max_sectors = min_t(unsigned int, max_hw_sectors, |
256 | BLK_DEF_MAX_SECTORS); | 243 | BLK_DEF_MAX_SECTORS); |
244 | } | ||
245 | EXPORT_SYMBOL(blk_limits_max_hw_sectors); | ||
246 | |||
247 | /** | ||
248 | * blk_queue_max_hw_sectors - set max sectors for a request for this queue | ||
249 | * @q: the request queue for the device | ||
250 | * @max_hw_sectors: max hardware sectors in the usual 512b unit | ||
251 | * | ||
252 | * Description: | ||
253 | * See description for blk_limits_max_hw_sectors(). | ||
254 | **/ | ||
255 | void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) | ||
256 | { | ||
257 | blk_limits_max_hw_sectors(&q->limits, max_hw_sectors); | ||
257 | } | 258 | } |
258 | EXPORT_SYMBOL(blk_queue_max_hw_sectors); | 259 | EXPORT_SYMBOL(blk_queue_max_hw_sectors); |
259 | 260 | ||
@@ -343,7 +344,7 @@ EXPORT_SYMBOL(blk_queue_logical_block_size); | |||
343 | * hardware can operate on without reverting to read-modify-write | 344 | * hardware can operate on without reverting to read-modify-write |
344 | * operations. | 345 | * operations. |
345 | */ | 346 | */ |
346 | void blk_queue_physical_block_size(struct request_queue *q, unsigned short size) | 347 | void blk_queue_physical_block_size(struct request_queue *q, unsigned int size) |
347 | { | 348 | { |
348 | q->limits.physical_block_size = size; | 349 | q->limits.physical_block_size = size; |
349 | 350 | ||
@@ -455,11 +456,6 @@ void blk_queue_io_opt(struct request_queue *q, unsigned int opt) | |||
455 | } | 456 | } |
456 | EXPORT_SYMBOL(blk_queue_io_opt); | 457 | EXPORT_SYMBOL(blk_queue_io_opt); |
457 | 458 | ||
458 | /* | ||
459 | * Returns the minimum that is _not_ zero, unless both are zero. | ||
460 | */ | ||
461 | #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) | ||
462 | |||
463 | /** | 459 | /** |
464 | * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers | 460 | * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers |
465 | * @t: the stacking driver (top) | 461 | * @t: the stacking driver (top) |
@@ -468,15 +464,6 @@ EXPORT_SYMBOL(blk_queue_io_opt); | |||
468 | void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) | 464 | void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) |
469 | { | 465 | { |
470 | blk_stack_limits(&t->limits, &b->limits, 0); | 466 | blk_stack_limits(&t->limits, &b->limits, 0); |
471 | |||
472 | if (!t->queue_lock) | ||
473 | WARN_ON_ONCE(1); | ||
474 | else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) { | ||
475 | unsigned long flags; | ||
476 | spin_lock_irqsave(t->queue_lock, flags); | ||
477 | queue_flag_clear(QUEUE_FLAG_CLUSTER, t); | ||
478 | spin_unlock_irqrestore(t->queue_lock, flags); | ||
479 | } | ||
480 | } | 467 | } |
481 | EXPORT_SYMBOL(blk_queue_stack_limits); | 468 | EXPORT_SYMBOL(blk_queue_stack_limits); |
482 | 469 | ||
@@ -514,6 +501,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, | |||
514 | b->seg_boundary_mask); | 501 | b->seg_boundary_mask); |
515 | 502 | ||
516 | t->max_segments = min_not_zero(t->max_segments, b->max_segments); | 503 | t->max_segments = min_not_zero(t->max_segments, b->max_segments); |
504 | t->max_integrity_segments = min_not_zero(t->max_integrity_segments, | ||
505 | b->max_integrity_segments); | ||
517 | 506 | ||
518 | t->max_segment_size = min_not_zero(t->max_segment_size, | 507 | t->max_segment_size = min_not_zero(t->max_segment_size, |
519 | b->max_segment_size); | 508 | b->max_segment_size); |
@@ -547,7 +536,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, | |||
547 | t->io_min = max(t->io_min, b->io_min); | 536 | t->io_min = max(t->io_min, b->io_min); |
548 | t->io_opt = lcm(t->io_opt, b->io_opt); | 537 | t->io_opt = lcm(t->io_opt, b->io_opt); |
549 | 538 | ||
550 | t->no_cluster |= b->no_cluster; | 539 | t->cluster &= b->cluster; |
551 | t->discard_zeroes_data &= b->discard_zeroes_data; | 540 | t->discard_zeroes_data &= b->discard_zeroes_data; |
552 | 541 | ||
553 | /* Physical block size a multiple of the logical block size? */ | 542 | /* Physical block size a multiple of the logical block size? */ |
@@ -643,7 +632,6 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, | |||
643 | sector_t offset) | 632 | sector_t offset) |
644 | { | 633 | { |
645 | struct request_queue *t = disk->queue; | 634 | struct request_queue *t = disk->queue; |
646 | struct request_queue *b = bdev_get_queue(bdev); | ||
647 | 635 | ||
648 | if (bdev_stack_limits(&t->limits, bdev, offset >> 9) < 0) { | 636 | if (bdev_stack_limits(&t->limits, bdev, offset >> 9) < 0) { |
649 | char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE]; | 637 | char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE]; |
@@ -654,17 +642,6 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, | |||
654 | printk(KERN_NOTICE "%s: Warning: Device %s is misaligned\n", | 642 | printk(KERN_NOTICE "%s: Warning: Device %s is misaligned\n", |
655 | top, bottom); | 643 | top, bottom); |
656 | } | 644 | } |
657 | |||
658 | if (!t->queue_lock) | ||
659 | WARN_ON_ONCE(1); | ||
660 | else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) { | ||
661 | unsigned long flags; | ||
662 | |||
663 | spin_lock_irqsave(t->queue_lock, flags); | ||
664 | if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) | ||
665 | queue_flag_clear(QUEUE_FLAG_CLUSTER, t); | ||
666 | spin_unlock_irqrestore(t->queue_lock, flags); | ||
667 | } | ||
668 | } | 645 | } |
669 | EXPORT_SYMBOL(disk_stack_limits); | 646 | EXPORT_SYMBOL(disk_stack_limits); |
670 | 647 | ||
@@ -794,6 +771,32 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask) | |||
794 | } | 771 | } |
795 | EXPORT_SYMBOL(blk_queue_update_dma_alignment); | 772 | EXPORT_SYMBOL(blk_queue_update_dma_alignment); |
796 | 773 | ||
774 | /** | ||
775 | * blk_queue_flush - configure queue's cache flush capability | ||
776 | * @q: the request queue for the device | ||
777 | * @flush: 0, REQ_FLUSH or REQ_FLUSH | REQ_FUA | ||
778 | * | ||
779 | * Tell block layer cache flush capability of @q. If it supports | ||
780 | * flushing, REQ_FLUSH should be set. If it supports bypassing | ||
781 | * write cache for individual writes, REQ_FUA should be set. | ||
782 | */ | ||
783 | void blk_queue_flush(struct request_queue *q, unsigned int flush) | ||
784 | { | ||
785 | WARN_ON_ONCE(flush & ~(REQ_FLUSH | REQ_FUA)); | ||
786 | |||
787 | if (WARN_ON_ONCE(!(flush & REQ_FLUSH) && (flush & REQ_FUA))) | ||
788 | flush &= ~REQ_FUA; | ||
789 | |||
790 | q->flush_flags = flush & (REQ_FLUSH | REQ_FUA); | ||
791 | } | ||
792 | EXPORT_SYMBOL_GPL(blk_queue_flush); | ||
793 | |||
794 | void blk_queue_flush_queueable(struct request_queue *q, bool queueable) | ||
795 | { | ||
796 | q->flush_not_queueable = !queueable; | ||
797 | } | ||
798 | EXPORT_SYMBOL_GPL(blk_queue_flush_queueable); | ||
799 | |||
797 | static int __init blk_settings_init(void) | 800 | static int __init blk_settings_init(void) |
798 | { | 801 | { |
799 | blk_max_low_pfn = max_low_pfn - 1; | 802 | blk_max_low_pfn = max_low_pfn - 1; |
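
blk_queue_flush() added above is how a driver now advertises its cache flush capability instead of the old barrier/ordered modes: pass REQ_FLUSH if the device has a volatile write cache, and additionally REQ_FUA if it honours forced-unit-access writes. A rough sketch of a driver init path using it (the function and parameter names are invented for illustration and are not part of this commit):

	static void mydrv_configure_flush(struct request_queue *q,
					  bool write_cache, bool fua)
	{
		unsigned int flush = 0;

		if (write_cache) {		/* volatile write cache present */
			flush |= REQ_FLUSH;
			if (fua)		/* FUA only meaningful with a cache */
				flush |= REQ_FUA;
		}
		blk_queue_flush(q, flush);	/* 0, REQ_FLUSH or REQ_FLUSH | REQ_FUA */
	}

As the WARN_ON_ONCE() in blk_queue_flush() shows, passing REQ_FUA without REQ_FLUSH triggers a warning and the FUA bit is dropped.
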
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 0749b89c6885..d935bd859c87 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c | |||
@@ -66,14 +66,14 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) | |||
66 | 66 | ||
67 | if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { | 67 | if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { |
68 | blk_set_queue_full(q, BLK_RW_SYNC); | 68 | blk_set_queue_full(q, BLK_RW_SYNC); |
69 | } else if (rl->count[BLK_RW_SYNC]+1 <= q->nr_requests) { | 69 | } else { |
70 | blk_clear_queue_full(q, BLK_RW_SYNC); | 70 | blk_clear_queue_full(q, BLK_RW_SYNC); |
71 | wake_up(&rl->wait[BLK_RW_SYNC]); | 71 | wake_up(&rl->wait[BLK_RW_SYNC]); |
72 | } | 72 | } |
73 | 73 | ||
74 | if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { | 74 | if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { |
75 | blk_set_queue_full(q, BLK_RW_ASYNC); | 75 | blk_set_queue_full(q, BLK_RW_ASYNC); |
76 | } else if (rl->count[BLK_RW_ASYNC]+1 <= q->nr_requests) { | 76 | } else { |
77 | blk_clear_queue_full(q, BLK_RW_ASYNC); | 77 | blk_clear_queue_full(q, BLK_RW_ASYNC); |
78 | wake_up(&rl->wait[BLK_RW_ASYNC]); | 78 | wake_up(&rl->wait[BLK_RW_ASYNC]); |
79 | } | 79 | } |
@@ -112,9 +112,14 @@ static ssize_t queue_max_segments_show(struct request_queue *q, char *page) | |||
112 | return queue_var_show(queue_max_segments(q), (page)); | 112 | return queue_var_show(queue_max_segments(q), (page)); |
113 | } | 113 | } |
114 | 114 | ||
115 | static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page) | ||
116 | { | ||
117 | return queue_var_show(q->limits.max_integrity_segments, (page)); | ||
118 | } | ||
119 | |||
115 | static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page) | 120 | static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page) |
116 | { | 121 | { |
117 | if (test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags)) | 122 | if (blk_queue_cluster(q)) |
118 | return queue_var_show(queue_max_segment_size(q), (page)); | 123 | return queue_var_show(queue_max_segment_size(q), (page)); |
119 | 124 | ||
120 | return queue_var_show(PAGE_CACHE_SIZE, (page)); | 125 | return queue_var_show(PAGE_CACHE_SIZE, (page)); |
@@ -147,7 +152,8 @@ static ssize_t queue_discard_granularity_show(struct request_queue *q, char *pag | |||
147 | 152 | ||
148 | static ssize_t queue_discard_max_show(struct request_queue *q, char *page) | 153 | static ssize_t queue_discard_max_show(struct request_queue *q, char *page) |
149 | { | 154 | { |
150 | return queue_var_show(q->limits.max_discard_sectors << 9, page); | 155 | return sprintf(page, "%llu\n", |
156 | (unsigned long long)q->limits.max_discard_sectors << 9); | ||
151 | } | 157 | } |
152 | 158 | ||
153 | static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page) | 159 | static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page) |
@@ -288,6 +294,11 @@ static struct queue_sysfs_entry queue_max_segments_entry = { | |||
288 | .show = queue_max_segments_show, | 294 | .show = queue_max_segments_show, |
289 | }; | 295 | }; |
290 | 296 | ||
297 | static struct queue_sysfs_entry queue_max_integrity_segments_entry = { | ||
298 | .attr = {.name = "max_integrity_segments", .mode = S_IRUGO }, | ||
299 | .show = queue_max_integrity_segments_show, | ||
300 | }; | ||
301 | |||
291 | static struct queue_sysfs_entry queue_max_segment_size_entry = { | 302 | static struct queue_sysfs_entry queue_max_segment_size_entry = { |
292 | .attr = {.name = "max_segment_size", .mode = S_IRUGO }, | 303 | .attr = {.name = "max_segment_size", .mode = S_IRUGO }, |
293 | .show = queue_max_segment_size_show, | 304 | .show = queue_max_segment_size_show, |
@@ -375,6 +386,7 @@ static struct attribute *default_attrs[] = { | |||
375 | &queue_max_hw_sectors_entry.attr, | 386 | &queue_max_hw_sectors_entry.attr, |
376 | &queue_max_sectors_entry.attr, | 387 | &queue_max_sectors_entry.attr, |
377 | &queue_max_segments_entry.attr, | 388 | &queue_max_segments_entry.attr, |
389 | &queue_max_integrity_segments_entry.attr, | ||
378 | &queue_max_segment_size_entry.attr, | 390 | &queue_max_segment_size_entry.attr, |
379 | &queue_iosched_entry.attr, | 391 | &queue_iosched_entry.attr, |
380 | &queue_hw_sector_size_entry.attr, | 392 | &queue_hw_sector_size_entry.attr, |
@@ -487,7 +499,6 @@ int blk_register_queue(struct gendisk *disk) | |||
487 | { | 499 | { |
488 | int ret; | 500 | int ret; |
489 | struct device *dev = disk_to_dev(disk); | 501 | struct device *dev = disk_to_dev(disk); |
490 | |||
491 | struct request_queue *q = disk->queue; | 502 | struct request_queue *q = disk->queue; |
492 | 503 | ||
493 | if (WARN_ON(!q)) | 504 | if (WARN_ON(!q)) |
@@ -498,8 +509,10 @@ int blk_register_queue(struct gendisk *disk) | |||
498 | return ret; | 509 | return ret; |
499 | 510 | ||
500 | ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue"); | 511 | ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue"); |
501 | if (ret < 0) | 512 | if (ret < 0) { |
513 | blk_trace_remove_sysfs(dev); | ||
502 | return ret; | 514 | return ret; |
515 | } | ||
503 | 516 | ||
504 | kobject_uevent(&q->kobj, KOBJ_ADD); | 517 | kobject_uevent(&q->kobj, KOBJ_ADD); |
505 | 518 | ||
@@ -510,7 +523,7 @@ int blk_register_queue(struct gendisk *disk) | |||
510 | if (ret) { | 523 | if (ret) { |
511 | kobject_uevent(&q->kobj, KOBJ_REMOVE); | 524 | kobject_uevent(&q->kobj, KOBJ_REMOVE); |
512 | kobject_del(&q->kobj); | 525 | kobject_del(&q->kobj); |
513 | blk_trace_remove_sysfs(disk_to_dev(disk)); | 526 | blk_trace_remove_sysfs(dev); |
514 | kobject_put(&dev->kobj); | 527 | kobject_put(&dev->kobj); |
515 | return ret; | 528 | return ret; |
516 | } | 529 | } |
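
With the attribute added above, the limit is visible from userspace as the read-only file /sys/block/<device>/queue/max_integrity_segments, next to the existing max_segments entry.
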
diff --git a/block/blk-throttle.c b/block/blk-throttle.c new file mode 100644 index 000000000000..3689f833afdc --- /dev/null +++ b/block/blk-throttle.c | |||
@@ -0,0 +1,1312 @@ | |||
1 | /* | ||
2 | * Interface for controlling IO bandwidth on a request queue | ||
3 | * | ||
4 | * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com> | ||
5 | */ | ||
6 | |||
7 | #include <linux/module.h> | ||
8 | #include <linux/slab.h> | ||
9 | #include <linux/blkdev.h> | ||
10 | #include <linux/bio.h> | ||
11 | #include <linux/blktrace_api.h> | ||
12 | #include "blk-cgroup.h" | ||
13 | |||
14 | /* Max dispatch from a group in 1 round */ | ||
15 | static int throtl_grp_quantum = 8; | ||
16 | |||
17 | /* Total max dispatch from all groups in one round */ | ||
18 | static int throtl_quantum = 32; | ||
19 | |||
20 | /* Throttling is performed over 100ms slice and after that slice is renewed */ | ||
21 | static unsigned long throtl_slice = HZ/10; /* 100 ms */ | ||
22 | |||
23 | /* A workqueue to queue throttle related work */ | ||
24 | static struct workqueue_struct *kthrotld_workqueue; | ||
25 | static void throtl_schedule_delayed_work(struct throtl_data *td, | ||
26 | unsigned long delay); | ||
27 | |||
28 | struct throtl_rb_root { | ||
29 | struct rb_root rb; | ||
30 | struct rb_node *left; | ||
31 | unsigned int count; | ||
32 | unsigned long min_disptime; | ||
33 | }; | ||
34 | |||
35 | #define THROTL_RB_ROOT (struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \ | ||
36 | .count = 0, .min_disptime = 0} | ||
37 | |||
38 | #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) | ||
39 | |||
40 | struct throtl_grp { | ||
41 | /* List of throtl groups on the request queue*/ | ||
42 | struct hlist_node tg_node; | ||
43 | |||
44 | /* active throtl group service_tree member */ | ||
45 | struct rb_node rb_node; | ||
46 | |||
47 | /* | ||
48 | * Dispatch time in jiffies. This is the estimated time when the group | ||
49 | * will unthrottle and be ready to dispatch more bios. It is used as the | ||
50 | * key to sort active groups in the service tree. | ||
51 | */ | ||
52 | unsigned long disptime; | ||
53 | |||
54 | struct blkio_group blkg; | ||
55 | atomic_t ref; | ||
56 | unsigned int flags; | ||
57 | |||
58 | /* Two lists for READ and WRITE */ | ||
59 | struct bio_list bio_lists[2]; | ||
60 | |||
61 | /* Number of queued bios on READ and WRITE lists */ | ||
62 | unsigned int nr_queued[2]; | ||
63 | |||
64 | /* bytes per second rate limits */ | ||
65 | uint64_t bps[2]; | ||
66 | |||
67 | /* IOPS limits */ | ||
68 | unsigned int iops[2]; | ||
69 | |||
70 | /* Number of bytes dispatched in current slice */ | ||
71 | uint64_t bytes_disp[2]; | ||
72 | /* Number of bios dispatched in current slice */ | ||
73 | unsigned int io_disp[2]; | ||
74 | |||
75 | /* When did we start a new slice */ | ||
76 | unsigned long slice_start[2]; | ||
77 | unsigned long slice_end[2]; | ||
78 | |||
79 | /* Some throttle limits got updated for the group */ | ||
80 | int limits_changed; | ||
81 | |||
82 | struct rcu_head rcu_head; | ||
83 | }; | ||
84 | |||
85 | struct throtl_data | ||
86 | { | ||
87 | /* List of throtl groups */ | ||
88 | struct hlist_head tg_list; | ||
89 | |||
90 | /* service tree for active throtl groups */ | ||
91 | struct throtl_rb_root tg_service_tree; | ||
92 | |||
93 | struct throtl_grp *root_tg; | ||
94 | struct request_queue *queue; | ||
95 | |||
96 | /* Total Number of queued bios on READ and WRITE lists */ | ||
97 | unsigned int nr_queued[2]; | ||
98 | |||
99 | /* | ||
100 | * number of total undestroyed groups | ||
101 | */ | ||
102 | unsigned int nr_undestroyed_grps; | ||
103 | |||
104 | /* Work for dispatching throttled bios */ | ||
105 | struct delayed_work throtl_work; | ||
106 | |||
107 | int limits_changed; | ||
108 | }; | ||
109 | |||
110 | enum tg_state_flags { | ||
111 | THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */ | ||
112 | }; | ||
113 | |||
114 | #define THROTL_TG_FNS(name) \ | ||
115 | static inline void throtl_mark_tg_##name(struct throtl_grp *tg) \ | ||
116 | { \ | ||
117 | (tg)->flags |= (1 << THROTL_TG_FLAG_##name); \ | ||
118 | } \ | ||
119 | static inline void throtl_clear_tg_##name(struct throtl_grp *tg) \ | ||
120 | { \ | ||
121 | (tg)->flags &= ~(1 << THROTL_TG_FLAG_##name); \ | ||
122 | } \ | ||
123 | static inline int throtl_tg_##name(const struct throtl_grp *tg) \ | ||
124 | { \ | ||
125 | return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0; \ | ||
126 | } | ||
127 | |||
128 | THROTL_TG_FNS(on_rr); | ||
129 | |||
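
For reference, the single THROTL_TG_FNS(on_rr) invocation above is just the macro written out for the on_rr flag; it expands to the three trivial helpers used throughout this file:

	static inline void throtl_mark_tg_on_rr(struct throtl_grp *tg)
	{
		tg->flags |= (1 << THROTL_TG_FLAG_on_rr);
	}
	static inline void throtl_clear_tg_on_rr(struct throtl_grp *tg)
	{
		tg->flags &= ~(1 << THROTL_TG_FLAG_on_rr);
	}
	static inline int throtl_tg_on_rr(const struct throtl_grp *tg)
	{
		return (tg->flags & (1 << THROTL_TG_FLAG_on_rr)) != 0;
	}
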
130 | #define throtl_log_tg(td, tg, fmt, args...) \ | ||
131 | blk_add_trace_msg((td)->queue, "throtl %s " fmt, \ | ||
132 | blkg_path(&(tg)->blkg), ##args); \ | ||
133 | |||
134 | #define throtl_log(td, fmt, args...) \ | ||
135 | blk_add_trace_msg((td)->queue, "throtl " fmt, ##args) | ||
136 | |||
137 | static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg) | ||
138 | { | ||
139 | if (blkg) | ||
140 | return container_of(blkg, struct throtl_grp, blkg); | ||
141 | |||
142 | return NULL; | ||
143 | } | ||
144 | |||
145 | static inline int total_nr_queued(struct throtl_data *td) | ||
146 | { | ||
147 | return (td->nr_queued[0] + td->nr_queued[1]); | ||
148 | } | ||
149 | |||
150 | static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg) | ||
151 | { | ||
152 | atomic_inc(&tg->ref); | ||
153 | return tg; | ||
154 | } | ||
155 | |||
156 | static void throtl_free_tg(struct rcu_head *head) | ||
157 | { | ||
158 | struct throtl_grp *tg; | ||
159 | |||
160 | tg = container_of(head, struct throtl_grp, rcu_head); | ||
161 | free_percpu(tg->blkg.stats_cpu); | ||
162 | kfree(tg); | ||
163 | } | ||
164 | |||
165 | static void throtl_put_tg(struct throtl_grp *tg) | ||
166 | { | ||
167 | BUG_ON(atomic_read(&tg->ref) <= 0); | ||
168 | if (!atomic_dec_and_test(&tg->ref)) | ||
169 | return; | ||
170 | |||
171 | /* | ||
172 | * A group is freed in rcu manner. But having an rcu lock does not | ||
173 | * mean that one can access all the fields of blkg and assume these | ||
174 | * are valid. For example, don't try to follow throtl_data and | ||
175 | * request queue links. | ||
176 | * | ||
177 | * Having a reference to blkg under an rcu lock allows access only to | ||
178 | * values local to groups, like group stats and group rate limits. | ||
179 | */ | ||
180 | call_rcu(&tg->rcu_head, throtl_free_tg); | ||
181 | } | ||
182 | |||
183 | static void throtl_init_group(struct throtl_grp *tg) | ||
184 | { | ||
185 | INIT_HLIST_NODE(&tg->tg_node); | ||
186 | RB_CLEAR_NODE(&tg->rb_node); | ||
187 | bio_list_init(&tg->bio_lists[0]); | ||
188 | bio_list_init(&tg->bio_lists[1]); | ||
189 | tg->limits_changed = false; | ||
190 | |||
191 | /* Practically unlimited BW */ | ||
192 | tg->bps[0] = tg->bps[1] = -1; | ||
193 | tg->iops[0] = tg->iops[1] = -1; | ||
194 | |||
195 | /* | ||
196 | * Take the initial reference that will be released on destroy. | ||
197 | * This can be thought of as a joint reference by the cgroup and the | ||
198 | * request queue, which will be dropped by either the request queue | ||
199 | * exit or the cgroup deletion path, depending on which exits first. | ||
200 | */ | ||
201 | atomic_set(&tg->ref, 1); | ||
202 | } | ||
203 | |||
204 | /* Should be called with rcu read lock held (needed for blkcg) */ | ||
205 | static void | ||
206 | throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg) | ||
207 | { | ||
208 | hlist_add_head(&tg->tg_node, &td->tg_list); | ||
209 | td->nr_undestroyed_grps++; | ||
210 | } | ||
211 | |||
212 | static void | ||
213 | __throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg) | ||
214 | { | ||
215 | struct backing_dev_info *bdi = &td->queue->backing_dev_info; | ||
216 | unsigned int major, minor; | ||
217 | |||
218 | if (!tg || tg->blkg.dev) | ||
219 | return; | ||
220 | |||
221 | /* | ||
222 | * Fill in device details for a group which might not have been | ||
223 | * filled at group creation time as queue was being instantiated | ||
224 | * and driver had not attached a device yet | ||
225 | */ | ||
226 | if (bdi->dev && dev_name(bdi->dev)) { | ||
227 | sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); | ||
228 | tg->blkg.dev = MKDEV(major, minor); | ||
229 | } | ||
230 | } | ||
231 | |||
232 | /* | ||
233 | * Should be called without the queue lock held. Here the queue lock will | ||
234 | * be taken rarely. It will be taken only once during the lifetime of a | ||
235 | * group, if need be. | ||
236 | */ | ||
237 | static void | ||
238 | throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg) | ||
239 | { | ||
240 | if (!tg || tg->blkg.dev) | ||
241 | return; | ||
242 | |||
243 | spin_lock_irq(td->queue->queue_lock); | ||
244 | __throtl_tg_fill_dev_details(td, tg); | ||
245 | spin_unlock_irq(td->queue->queue_lock); | ||
246 | } | ||
247 | |||
248 | static void throtl_init_add_tg_lists(struct throtl_data *td, | ||
249 | struct throtl_grp *tg, struct blkio_cgroup *blkcg) | ||
250 | { | ||
251 | __throtl_tg_fill_dev_details(td, tg); | ||
252 | |||
253 | /* Add group onto cgroup list */ | ||
254 | blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td, | ||
255 | tg->blkg.dev, BLKIO_POLICY_THROTL); | ||
256 | |||
257 | tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev); | ||
258 | tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev); | ||
259 | tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev); | ||
260 | tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev); | ||
261 | |||
262 | throtl_add_group_to_td_list(td, tg); | ||
263 | } | ||
264 | |||
265 | /* Should be called without queue lock and outside of rcu period */ | ||
266 | static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td) | ||
267 | { | ||
268 | struct throtl_grp *tg = NULL; | ||
269 | int ret; | ||
270 | |||
271 | tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node); | ||
272 | if (!tg) | ||
273 | return NULL; | ||
274 | |||
275 | ret = blkio_alloc_blkg_stats(&tg->blkg); | ||
276 | |||
277 | if (ret) { | ||
278 | kfree(tg); | ||
279 | return NULL; | ||
280 | } | ||
281 | |||
282 | throtl_init_group(tg); | ||
283 | return tg; | ||
284 | } | ||
285 | |||
286 | static struct | ||
287 | throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg) | ||
288 | { | ||
289 | struct throtl_grp *tg = NULL; | ||
290 | void *key = td; | ||
291 | |||
292 | /* | ||
293 | * This is the common case when there are no blkio cgroups. | ||
294 | * Avoid lookup in this case | ||
295 | */ | ||
296 | if (blkcg == &blkio_root_cgroup) | ||
297 | tg = td->root_tg; | ||
298 | else | ||
299 | tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key)); | ||
300 | |||
301 | __throtl_tg_fill_dev_details(td, tg); | ||
302 | return tg; | ||
303 | } | ||
304 | |||
305 | /* | ||
306 | * This function returns with the queue lock unlocked in case of error, | ||
307 | * e.g. when the request queue is no longer alive. | ||
308 | */ | ||
309 | static struct throtl_grp * throtl_get_tg(struct throtl_data *td) | ||
310 | { | ||
311 | struct throtl_grp *tg = NULL, *__tg = NULL; | ||
312 | struct blkio_cgroup *blkcg; | ||
313 | struct request_queue *q = td->queue; | ||
314 | |||
315 | rcu_read_lock(); | ||
316 | blkcg = task_blkio_cgroup(current); | ||
317 | tg = throtl_find_tg(td, blkcg); | ||
318 | if (tg) { | ||
319 | rcu_read_unlock(); | ||
320 | return tg; | ||
321 | } | ||
322 | |||
323 | /* | ||
324 | * Need to allocate a group. Allocating a group also needs allocation | ||
325 | * of per cpu stats, which in turn takes a mutex and can block. Hence | ||
326 | * we need to drop the rcu lock and queue_lock before we call alloc. | ||
327 | * | ||
328 | * Take the request queue reference to make sure queue does not | ||
329 | * go away once we return from allocation. | ||
330 | */ | ||
331 | blk_get_queue(q); | ||
332 | rcu_read_unlock(); | ||
333 | spin_unlock_irq(q->queue_lock); | ||
334 | |||
335 | tg = throtl_alloc_tg(td); | ||
336 | /* | ||
337 | * We might have slept in group allocation. Make sure queue is not | ||
338 | * dead | ||
339 | */ | ||
340 | if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { | ||
341 | blk_put_queue(q); | ||
342 | if (tg) | ||
343 | kfree(tg); | ||
344 | |||
345 | return ERR_PTR(-ENODEV); | ||
346 | } | ||
347 | blk_put_queue(q); | ||
348 | |||
349 | /* Group allocated and queue is still alive. take the lock */ | ||
350 | spin_lock_irq(q->queue_lock); | ||
351 | |||
352 | /* | ||
353 | * Initialize the new group. After sleeping, read the blkcg again. | ||
354 | */ | ||
355 | rcu_read_lock(); | ||
356 | blkcg = task_blkio_cgroup(current); | ||
357 | |||
358 | /* | ||
359 | * If some other thread already allocated the group while we were | ||
360 | * not holding queue lock, free up the group | ||
361 | */ | ||
362 | __tg = throtl_find_tg(td, blkcg); | ||
363 | |||
364 | if (__tg) { | ||
365 | kfree(tg); | ||
366 | rcu_read_unlock(); | ||
367 | return __tg; | ||
368 | } | ||
369 | |||
370 | /* Group allocation failed. Account the IO to root group */ | ||
371 | if (!tg) { | ||
372 | tg = td->root_tg; | ||
373 | return tg; | ||
374 | } | ||
375 | |||
376 | throtl_init_add_tg_lists(td, tg, blkcg); | ||
377 | rcu_read_unlock(); | ||
378 | return tg; | ||
379 | } | ||
380 | |||
381 | static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root) | ||
382 | { | ||
383 | /* Service tree is empty */ | ||
384 | if (!root->count) | ||
385 | return NULL; | ||
386 | |||
387 | if (!root->left) | ||
388 | root->left = rb_first(&root->rb); | ||
389 | |||
390 | if (root->left) | ||
391 | return rb_entry_tg(root->left); | ||
392 | |||
393 | return NULL; | ||
394 | } | ||
395 | |||
396 | static void rb_erase_init(struct rb_node *n, struct rb_root *root) | ||
397 | { | ||
398 | rb_erase(n, root); | ||
399 | RB_CLEAR_NODE(n); | ||
400 | } | ||
401 | |||
402 | static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root) | ||
403 | { | ||
404 | if (root->left == n) | ||
405 | root->left = NULL; | ||
406 | rb_erase_init(n, &root->rb); | ||
407 | --root->count; | ||
408 | } | ||
409 | |||
410 | static void update_min_dispatch_time(struct throtl_rb_root *st) | ||
411 | { | ||
412 | struct throtl_grp *tg; | ||
413 | |||
414 | tg = throtl_rb_first(st); | ||
415 | if (!tg) | ||
416 | return; | ||
417 | |||
418 | st->min_disptime = tg->disptime; | ||
419 | } | ||
420 | |||
421 | static void | ||
422 | tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg) | ||
423 | { | ||
424 | struct rb_node **node = &st->rb.rb_node; | ||
425 | struct rb_node *parent = NULL; | ||
426 | struct throtl_grp *__tg; | ||
427 | unsigned long key = tg->disptime; | ||
428 | int left = 1; | ||
429 | |||
430 | while (*node != NULL) { | ||
431 | parent = *node; | ||
432 | __tg = rb_entry_tg(parent); | ||
433 | |||
434 | if (time_before(key, __tg->disptime)) | ||
435 | node = &parent->rb_left; | ||
436 | else { | ||
437 | node = &parent->rb_right; | ||
438 | left = 0; | ||
439 | } | ||
440 | } | ||
441 | |||
442 | if (left) | ||
443 | st->left = &tg->rb_node; | ||
444 | |||
445 | rb_link_node(&tg->rb_node, parent, node); | ||
446 | rb_insert_color(&tg->rb_node, &st->rb); | ||
447 | } | ||
448 | |||
449 | static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg) | ||
450 | { | ||
451 | struct throtl_rb_root *st = &td->tg_service_tree; | ||
452 | |||
453 | tg_service_tree_add(st, tg); | ||
454 | throtl_mark_tg_on_rr(tg); | ||
455 | st->count++; | ||
456 | } | ||
457 | |||
458 | static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg) | ||
459 | { | ||
460 | if (!throtl_tg_on_rr(tg)) | ||
461 | __throtl_enqueue_tg(td, tg); | ||
462 | } | ||
463 | |||
464 | static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg) | ||
465 | { | ||
466 | throtl_rb_erase(&tg->rb_node, &td->tg_service_tree); | ||
467 | throtl_clear_tg_on_rr(tg); | ||
468 | } | ||
469 | |||
470 | static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg) | ||
471 | { | ||
472 | if (throtl_tg_on_rr(tg)) | ||
473 | __throtl_dequeue_tg(td, tg); | ||
474 | } | ||
475 | |||
476 | static void throtl_schedule_next_dispatch(struct throtl_data *td) | ||
477 | { | ||
478 | struct throtl_rb_root *st = &td->tg_service_tree; | ||
479 | |||
480 | /* | ||
481 | * If there are more bios pending, schedule more work. | ||
482 | */ | ||
483 | if (!total_nr_queued(td)) | ||
484 | return; | ||
485 | |||
486 | BUG_ON(!st->count); | ||
487 | |||
488 | update_min_dispatch_time(st); | ||
489 | |||
490 | if (time_before_eq(st->min_disptime, jiffies)) | ||
491 | throtl_schedule_delayed_work(td, 0); | ||
492 | else | ||
493 | throtl_schedule_delayed_work(td, (st->min_disptime - jiffies)); | ||
494 | } | ||
495 | |||
496 | static inline void | ||
497 | throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) | ||
498 | { | ||
499 | tg->bytes_disp[rw] = 0; | ||
500 | tg->io_disp[rw] = 0; | ||
501 | tg->slice_start[rw] = jiffies; | ||
502 | tg->slice_end[rw] = jiffies + throtl_slice; | ||
503 | throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu", | ||
504 | rw == READ ? 'R' : 'W', tg->slice_start[rw], | ||
505 | tg->slice_end[rw], jiffies); | ||
506 | } | ||
507 | |||
508 | static inline void throtl_set_slice_end(struct throtl_data *td, | ||
509 | struct throtl_grp *tg, bool rw, unsigned long jiffy_end) | ||
510 | { | ||
511 | tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); | ||
512 | } | ||
513 | |||
514 | static inline void throtl_extend_slice(struct throtl_data *td, | ||
515 | struct throtl_grp *tg, bool rw, unsigned long jiffy_end) | ||
516 | { | ||
517 | tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); | ||
518 | throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu", | ||
519 | rw == READ ? 'R' : 'W', tg->slice_start[rw], | ||
520 | tg->slice_end[rw], jiffies); | ||
521 | } | ||
522 | |||
523 | /* Determine if previously allocated or extended slice is complete or not */ | ||
524 | static bool | ||
525 | throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw) | ||
526 | { | ||
527 | if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) | ||
528 | return 0; | ||
529 | |||
530 | return 1; | ||
531 | } | ||
532 | |||
533 | /* Trim the used slices and adjust slice start accordingly */ | ||
534 | static inline void | ||
535 | throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) | ||
536 | { | ||
537 | unsigned long nr_slices, time_elapsed, io_trim; | ||
538 | u64 bytes_trim, tmp; | ||
539 | |||
540 | BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); | ||
541 | |||
542 | /* | ||
543 | * If bps are unlimited (-1), then the time slice doesn't get | ||
544 | * renewed. Don't try to trim a slice that has already expired. A new | ||
545 | * slice will start when appropriate. | ||
546 | */ | ||
547 | if (throtl_slice_used(td, tg, rw)) | ||
548 | return; | ||
549 | |||
550 | /* | ||
551 | * A bio has been dispatched. Also adjust slice_end. It might happen | ||
552 | * that initially the cgroup limit was very low, resulting in a high | ||
553 | * slice_end, but later the limit was bumped up and the bio was dispatched | ||
554 | * sooner; then we need to reduce slice_end. A high bogus slice_end | ||
555 | * is bad because it does not allow a new slice to start. | ||
556 | */ | ||
557 | |||
558 | throtl_set_slice_end(td, tg, rw, jiffies + throtl_slice); | ||
559 | |||
560 | time_elapsed = jiffies - tg->slice_start[rw]; | ||
561 | |||
562 | nr_slices = time_elapsed / throtl_slice; | ||
563 | |||
564 | if (!nr_slices) | ||
565 | return; | ||
566 | tmp = tg->bps[rw] * throtl_slice * nr_slices; | ||
567 | do_div(tmp, HZ); | ||
568 | bytes_trim = tmp; | ||
569 | |||
570 | io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ; | ||
571 | |||
572 | if (!bytes_trim && !io_trim) | ||
573 | return; | ||
574 | |||
575 | if (tg->bytes_disp[rw] >= bytes_trim) | ||
576 | tg->bytes_disp[rw] -= bytes_trim; | ||
577 | else | ||
578 | tg->bytes_disp[rw] = 0; | ||
579 | |||
580 | if (tg->io_disp[rw] >= io_trim) | ||
581 | tg->io_disp[rw] -= io_trim; | ||
582 | else | ||
583 | tg->io_disp[rw] = 0; | ||
584 | |||
585 | tg->slice_start[rw] += nr_slices * throtl_slice; | ||
586 | |||
587 | throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu" | ||
588 | " start=%lu end=%lu jiffies=%lu", | ||
589 | rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim, | ||
590 | tg->slice_start[rw], tg->slice_end[rw], jiffies); | ||
591 | } | ||
592 | |||
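
To make the trim arithmetic in throtl_trim_slice() concrete (the numbers here are purely illustrative): with HZ=1000, throtl_slice=100 jiffies, bps=1048576 (1 MiB/s), iops=100 and 350 jiffies elapsed since slice_start, nr_slices = 350/100 = 3, bytes_trim = 1048576 * 100 * 3 / 1000 = 314572 and io_trim = 100 * 100 * 3 / 1000 = 30; that much of the already-dispatched budget is forgiven and slice_start advances by 3 * 100 = 300 jiffies.
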
593 | static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg, | ||
594 | struct bio *bio, unsigned long *wait) | ||
595 | { | ||
596 | bool rw = bio_data_dir(bio); | ||
597 | unsigned int io_allowed; | ||
598 | unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; | ||
599 | u64 tmp; | ||
600 | |||
601 | jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; | ||
602 | |||
603 | /* Slice has just started. Consider one slice interval */ | ||
604 | if (!jiffy_elapsed) | ||
605 | jiffy_elapsed_rnd = throtl_slice; | ||
606 | |||
607 | jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); | ||
608 | |||
609 | /* | ||
610 | * jiffy_elapsed_rnd should not be a big value: the minimum iops can be | ||
611 | * 1, so at most the elapsed jiffies should be equivalent to 1 second, as | ||
612 | * we will allow dispatch after 1 second and after that the slice should | ||
613 | * have been trimmed. | ||
614 | */ | ||
615 | |||
616 | tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd; | ||
617 | do_div(tmp, HZ); | ||
618 | |||
619 | if (tmp > UINT_MAX) | ||
620 | io_allowed = UINT_MAX; | ||
621 | else | ||
622 | io_allowed = tmp; | ||
623 | |||
624 | if (tg->io_disp[rw] + 1 <= io_allowed) { | ||
625 | if (wait) | ||
626 | *wait = 0; | ||
627 | return 1; | ||
628 | } | ||
629 | |||
630 | /* Calc approx time to dispatch */ | ||
631 | jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1; | ||
632 | |||
633 | if (jiffy_wait > jiffy_elapsed) | ||
634 | jiffy_wait = jiffy_wait - jiffy_elapsed; | ||
635 | else | ||
636 | jiffy_wait = 1; | ||
637 | |||
638 | if (wait) | ||
639 | *wait = jiffy_wait; | ||
640 | return 0; | ||
641 | } | ||
642 | |||
643 | static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, | ||
644 | struct bio *bio, unsigned long *wait) | ||
645 | { | ||
646 | bool rw = bio_data_dir(bio); | ||
647 | u64 bytes_allowed, extra_bytes, tmp; | ||
648 | unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; | ||
649 | |||
650 | jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; | ||
651 | |||
652 | /* Slice has just started. Consider one slice interval */ | ||
653 | if (!jiffy_elapsed) | ||
654 | jiffy_elapsed_rnd = throtl_slice; | ||
655 | |||
656 | jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); | ||
657 | |||
658 | tmp = tg->bps[rw] * jiffy_elapsed_rnd; | ||
659 | do_div(tmp, HZ); | ||
660 | bytes_allowed = tmp; | ||
661 | |||
662 | if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) { | ||
663 | if (wait) | ||
664 | *wait = 0; | ||
665 | return 1; | ||
666 | } | ||
667 | |||
668 | /* Calc approx time to dispatch */ | ||
669 | extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed; | ||
670 | jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]); | ||
671 | |||
672 | if (!jiffy_wait) | ||
673 | jiffy_wait = 1; | ||
674 | |||
675 | /* | ||
676 | * This wait time is without taking into consideration the rounding | ||
677 | * up we did. Add that time also. | ||
678 | */ | ||
679 | jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed); | ||
680 | if (wait) | ||
681 | *wait = jiffy_wait; | ||
682 | return 0; | ||
683 | } | ||
684 | |||
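
As a quick sanity check of the bps math above (numbers purely illustrative): with HZ=1000 and tg->bps[rw]=1048576 (1 MiB/s), 40 jiffies into the slice (rounded up to 100) the group is allowed 1048576 * 100 / 1000 = 104857 bytes. If 103000 bytes are already dispatched, a 4096-byte bio overshoots by 107096 - 104857 = 2239 bytes, so jiffy_wait = 2239 * 1000 / 1048576 ≈ 2 jiffies, and the 60 jiffies of round-up (100 - 40) are added on top, delaying the bio by roughly 62 ms.
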
685 | static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) { | ||
686 | if (tg->bps[rw] == -1 && tg->iops[rw] == -1) | ||
687 | return 1; | ||
688 | return 0; | ||
689 | } | ||
690 | |||
691 | /* | ||
692 | * Returns whether one can dispatch a bio or not. Also returns approx number | ||
693 | * of jiffies to wait before this bio is within the IO rate and can be dispatched | ||
694 | */ | ||
695 | static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, | ||
696 | struct bio *bio, unsigned long *wait) | ||
697 | { | ||
698 | bool rw = bio_data_dir(bio); | ||
699 | unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; | ||
700 | |||
701 | /* | ||
702 | * Currently the whole state machine of the group depends on the first | ||
703 | * bio queued in the group's bio list. So one should not be calling | ||
704 | * this function with a different bio if there are other bios | ||
705 | * queued. | ||
706 | */ | ||
707 | BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw])); | ||
708 | |||
709 | /* If both bps and iops are -1, then bandwidth is unlimited */ | ||
710 | if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { | ||
711 | if (wait) | ||
712 | *wait = 0; | ||
713 | return 1; | ||
714 | } | ||
715 | |||
716 | /* | ||
717 | * If the previous slice expired, start a new one; otherwise renew/extend | ||
718 | * the existing slice to make sure it is at least throtl_slice interval | ||
719 | * long since now. | ||
720 | */ | ||
721 | if (throtl_slice_used(td, tg, rw)) | ||
722 | throtl_start_new_slice(td, tg, rw); | ||
723 | else { | ||
724 | if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) | ||
725 | throtl_extend_slice(td, tg, rw, jiffies + throtl_slice); | ||
726 | } | ||
727 | |||
728 | if (tg_with_in_bps_limit(td, tg, bio, &bps_wait) | ||
729 | && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) { | ||
730 | if (wait) | ||
731 | *wait = 0; | ||
732 | return 1; | ||
733 | } | ||
734 | |||
735 | max_wait = max(bps_wait, iops_wait); | ||
736 | |||
737 | if (wait) | ||
738 | *wait = max_wait; | ||
739 | |||
740 | if (time_before(tg->slice_end[rw], jiffies + max_wait)) | ||
741 | throtl_extend_slice(td, tg, rw, jiffies + max_wait); | ||
742 | |||
743 | return 0; | ||
744 | } | ||
745 | |||
746 | static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) | ||
747 | { | ||
748 | bool rw = bio_data_dir(bio); | ||
749 | bool sync = bio->bi_rw & REQ_SYNC; | ||
750 | |||
751 | /* Charge the bio to the group */ | ||
752 | tg->bytes_disp[rw] += bio->bi_size; | ||
753 | tg->io_disp[rw]++; | ||
754 | |||
755 | blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync); | ||
756 | } | ||
757 | |||
758 | static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, | ||
759 | struct bio *bio) | ||
760 | { | ||
761 | bool rw = bio_data_dir(bio); | ||
762 | |||
763 | bio_list_add(&tg->bio_lists[rw], bio); | ||
764 | /* Take a bio reference on tg */ | ||
765 | throtl_ref_get_tg(tg); | ||
766 | tg->nr_queued[rw]++; | ||
767 | td->nr_queued[rw]++; | ||
768 | throtl_enqueue_tg(td, tg); | ||
769 | } | ||
770 | |||
771 | static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg) | ||
772 | { | ||
773 | unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; | ||
774 | struct bio *bio; | ||
775 | |||
776 | if ((bio = bio_list_peek(&tg->bio_lists[READ]))) | ||
777 | tg_may_dispatch(td, tg, bio, &read_wait); | ||
778 | |||
779 | if ((bio = bio_list_peek(&tg->bio_lists[WRITE]))) | ||
780 | tg_may_dispatch(td, tg, bio, &write_wait); | ||
781 | |||
782 | min_wait = min(read_wait, write_wait); | ||
783 | disptime = jiffies + min_wait; | ||
784 | |||
785 | /* Update dispatch time */ | ||
786 | throtl_dequeue_tg(td, tg); | ||
787 | tg->disptime = disptime; | ||
788 | throtl_enqueue_tg(td, tg); | ||
789 | } | ||
790 | |||
791 | static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg, | ||
792 | bool rw, struct bio_list *bl) | ||
793 | { | ||
794 | struct bio *bio; | ||
795 | |||
796 | bio = bio_list_pop(&tg->bio_lists[rw]); | ||
797 | tg->nr_queued[rw]--; | ||
798 | /* Drop bio reference on tg */ | ||
799 | throtl_put_tg(tg); | ||
800 | |||
801 | BUG_ON(td->nr_queued[rw] <= 0); | ||
802 | td->nr_queued[rw]--; | ||
803 | |||
804 | throtl_charge_bio(tg, bio); | ||
805 | bio_list_add(bl, bio); | ||
806 | bio->bi_rw |= REQ_THROTTLED; | ||
807 | |||
808 | throtl_trim_slice(td, tg, rw); | ||
809 | } | ||
810 | |||
811 | static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg, | ||
812 | struct bio_list *bl) | ||
813 | { | ||
814 | unsigned int nr_reads = 0, nr_writes = 0; | ||
815 | unsigned int max_nr_reads = throtl_grp_quantum*3/4; | ||
816 | unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads; | ||
817 | struct bio *bio; | ||
818 | |||
819 | /* Try to dispatch 75% READS and 25% WRITES */ | ||
820 | |||
821 | while ((bio = bio_list_peek(&tg->bio_lists[READ])) | ||
822 | && tg_may_dispatch(td, tg, bio, NULL)) { | ||
823 | |||
824 | tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl); | ||
825 | nr_reads++; | ||
826 | |||
827 | if (nr_reads >= max_nr_reads) | ||
828 | break; | ||
829 | } | ||
830 | |||
831 | while ((bio = bio_list_peek(&tg->bio_lists[WRITE])) | ||
832 | && tg_may_dispatch(td, tg, bio, NULL)) { | ||
833 | |||
834 | tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl); | ||
835 | nr_writes++; | ||
836 | |||
837 | if (nr_writes >= max_nr_writes) | ||
838 | break; | ||
839 | } | ||
840 | |||
841 | return nr_reads + nr_writes; | ||
842 | } | ||
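As a quick worked example of the 75%/25% split above (assuming the throtl_grp_quantum default of 8 defined earlier in this file; a sketch, not kernel code):

    #include <stdio.h>

    int main(void)
    {
        unsigned int quantum = 8;                              /* throtl_grp_quantum */
        unsigned int max_nr_reads = quantum * 3 / 4;           /* 6 */
        unsigned int max_nr_writes = quantum - max_nr_reads;   /* 2 */

        printf("per group round: up to %u reads, %u writes\n",
               max_nr_reads, max_nr_writes);
        return 0;
    }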
843 | |||
844 | static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl) | ||
845 | { | ||
846 | unsigned int nr_disp = 0; | ||
847 | struct throtl_grp *tg; | ||
848 | struct throtl_rb_root *st = &td->tg_service_tree; | ||
849 | |||
850 | while (1) { | ||
851 | tg = throtl_rb_first(st); | ||
852 | |||
853 | if (!tg) | ||
854 | break; | ||
855 | |||
856 | if (time_before(jiffies, tg->disptime)) | ||
857 | break; | ||
858 | |||
859 | throtl_dequeue_tg(td, tg); | ||
860 | |||
861 | nr_disp += throtl_dispatch_tg(td, tg, bl); | ||
862 | |||
863 | if (tg->nr_queued[0] || tg->nr_queued[1]) { | ||
864 | tg_update_disptime(td, tg); | ||
865 | throtl_enqueue_tg(td, tg); | ||
866 | } | ||
867 | |||
868 | if (nr_disp >= throtl_quantum) | ||
869 | break; | ||
870 | } | ||
871 | |||
872 | return nr_disp; | ||
873 | } | ||
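A rough user-space sketch of the selection loop above, with a sorted array standing in for the rb-tree service tree; the helper name and values are hypothetical:

    #include <stdio.h>

    struct grp { unsigned long disptime; unsigned int queued; };

    /* Dispatch from groups whose disptime has passed, up to a global quantum. */
    static unsigned int select_dispatch(struct grp *g, int nr, unsigned long now,
                                        unsigned int quantum)
    {
        unsigned int nr_disp = 0;
        int i;

        for (i = 0; i < nr; i++) {
            if (now < g[i].disptime)        /* earliest remaining group not due yet */
                break;
            nr_disp += g[i].queued;         /* stand-in for throtl_dispatch_tg() */
            g[i].queued = 0;
            if (nr_disp >= quantum)
                break;
        }
        return nr_disp;
    }

    int main(void)
    {
        struct grp groups[] = { { 100, 3 }, { 120, 5 }, { 400, 2 } };

        printf("dispatched %u bios\n",
               select_dispatch(groups, 3, 150, 32));   /* -> 8, third group not due */
        return 0;
    }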
874 | |||
875 | static void throtl_process_limit_change(struct throtl_data *td) | ||
876 | { | ||
877 | struct throtl_grp *tg; | ||
878 | struct hlist_node *pos, *n; | ||
879 | |||
880 | if (!td->limits_changed) | ||
881 | return; | ||
882 | |||
883 | xchg(&td->limits_changed, false); | ||
884 | |||
885 | throtl_log(td, "limits changed"); | ||
886 | |||
887 | hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { | ||
888 | if (!tg->limits_changed) | ||
889 | continue; | ||
890 | |||
891 | if (!xchg(&tg->limits_changed, false)) | ||
892 | continue; | ||
893 | |||
894 | throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu" | ||
895 | " riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE], | ||
896 | tg->iops[READ], tg->iops[WRITE]); | ||
897 | |||
898 | /* | ||
899 | * Restart the slices for both READ and WRITE. It | ||
900 | * might happen that a group's limits are dropped | ||
901 | * suddenly and we don't want to account recently | ||
902 | * dispatched IO at the new low rate. | ||
903 | */ | ||
904 | throtl_start_new_slice(td, tg, 0); | ||
905 | throtl_start_new_slice(td, tg, 1); | ||
906 | |||
907 | if (throtl_tg_on_rr(tg)) | ||
908 | tg_update_disptime(td, tg); | ||
909 | } | ||
910 | } | ||
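The limits_changed handshake above uses xchg() so the flag is consumed exactly once even though updaters run without the queue lock. A user-space sketch of the same fetch-and-clear pattern, using GCC __atomic builtins as a stand-in for the kernel's xchg():

    #include <stdbool.h>
    #include <stdio.h>

    static bool limits_changed;   /* set by the updater, consumed by the worker */

    static void update_limit(void)
    {
        /* Publish the new limit, then raise the flag. */
        __atomic_store_n(&limits_changed, true, __ATOMIC_RELEASE);
    }

    static void process_limit_change(void)
    {
        /* Atomically fetch-and-clear, so the change is processed only once. */
        if (!__atomic_exchange_n(&limits_changed, false, __ATOMIC_ACQUIRE))
            return;
        printf("limits changed, restarting slices\n");
    }

    int main(void)
    {
        update_limit();
        process_limit_change();   /* prints once */
        process_limit_change();   /* flag already consumed, does nothing */
        return 0;
    }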
911 | |||
912 | /* Dispatch throttled bios. Should be called without queue lock held. */ | ||
913 | static int throtl_dispatch(struct request_queue *q) | ||
914 | { | ||
915 | struct throtl_data *td = q->td; | ||
916 | unsigned int nr_disp = 0; | ||
917 | struct bio_list bio_list_on_stack; | ||
918 | struct bio *bio; | ||
919 | struct blk_plug plug; | ||
920 | |||
921 | spin_lock_irq(q->queue_lock); | ||
922 | |||
923 | throtl_process_limit_change(td); | ||
924 | |||
925 | if (!total_nr_queued(td)) | ||
926 | goto out; | ||
927 | |||
928 | bio_list_init(&bio_list_on_stack); | ||
929 | |||
930 | throtl_log(td, "dispatch nr_queued=%d read=%u write=%u", | ||
931 | total_nr_queued(td), td->nr_queued[READ], | ||
932 | td->nr_queued[WRITE]); | ||
933 | |||
934 | nr_disp = throtl_select_dispatch(td, &bio_list_on_stack); | ||
935 | |||
936 | if (nr_disp) | ||
937 | throtl_log(td, "bios disp=%u", nr_disp); | ||
938 | |||
939 | throtl_schedule_next_dispatch(td); | ||
940 | out: | ||
941 | spin_unlock_irq(q->queue_lock); | ||
942 | |||
943 | /* | ||
944 | * If we dispatched some requests, unplug the queue to ensure | ||
945 | * immediate dispatch. | ||
946 | */ | ||
947 | if (nr_disp) { | ||
948 | blk_start_plug(&plug); | ||
949 | while((bio = bio_list_pop(&bio_list_on_stack))) | ||
950 | generic_make_request(bio); | ||
951 | blk_finish_plug(&plug); | ||
952 | } | ||
953 | return nr_disp; | ||
954 | } | ||
955 | |||
956 | void blk_throtl_work(struct work_struct *work) | ||
957 | { | ||
958 | struct throtl_data *td = container_of(work, struct throtl_data, | ||
959 | throtl_work.work); | ||
960 | struct request_queue *q = td->queue; | ||
961 | |||
962 | throtl_dispatch(q); | ||
963 | } | ||
964 | |||
965 | /* Call with queue lock held */ | ||
966 | static void | ||
967 | throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay) | ||
968 | { | ||
969 | |||
970 | struct delayed_work *dwork = &td->throtl_work; | ||
971 | |||
972 | /* schedule work if limits changed even if no bio is queued */ | ||
973 | if (total_nr_queued(td) > 0 || td->limits_changed) { | ||
974 | /* | ||
975 | * We might have a work scheduled to be executed in future. | ||
976 | * Cancel that and schedule a new one. | ||
977 | */ | ||
978 | __cancel_delayed_work(dwork); | ||
979 | queue_delayed_work(kthrotld_workqueue, dwork, delay); | ||
980 | throtl_log(td, "schedule work. delay=%lu jiffies=%lu", | ||
981 | delay, jiffies); | ||
982 | } | ||
983 | } | ||
984 | |||
985 | static void | ||
986 | throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg) | ||
987 | { | ||
988 | /* Something is wrong if we are trying to remove the same group twice */ | ||
989 | BUG_ON(hlist_unhashed(&tg->tg_node)); | ||
990 | |||
991 | hlist_del_init(&tg->tg_node); | ||
992 | |||
993 | /* | ||
994 | * Put the reference taken at the time of creation so that when all | ||
995 | * queues are gone, group can be destroyed. | ||
996 | */ | ||
997 | throtl_put_tg(tg); | ||
998 | td->nr_undestroyed_grps--; | ||
999 | } | ||
1000 | |||
1001 | static void throtl_release_tgs(struct throtl_data *td) | ||
1002 | { | ||
1003 | struct hlist_node *pos, *n; | ||
1004 | struct throtl_grp *tg; | ||
1005 | |||
1006 | hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { | ||
1007 | /* | ||
1008 | * If the cgroup removal path got to the blkio_group first and removed | ||
1009 | * it from the cgroup list, then it will also take care of destroying | ||
1010 | * the throtl group. | ||
1011 | */ | ||
1012 | if (!blkiocg_del_blkio_group(&tg->blkg)) | ||
1013 | throtl_destroy_tg(td, tg); | ||
1014 | } | ||
1015 | } | ||
1016 | |||
1017 | static void throtl_td_free(struct throtl_data *td) | ||
1018 | { | ||
1019 | kfree(td); | ||
1020 | } | ||
1021 | |||
1022 | /* | ||
1023 | * Blk cgroup controller notification saying that blkio_group object is being | ||
1024 | * delinked as the associated cgroup object is going away. That also means that | ||
1025 | * no new IO will come to this group. So get rid of this group as soon as | ||
1026 | * all pending IO in the group is finished. | ||
1027 | * | ||
1028 | * This function is called under rcu_read_lock(). key is the rcu protected | ||
1029 | * pointer. That means "key" is a valid throtl_data pointer as long as we | ||
1030 | * hold the rcu read lock. | ||
1031 | * | ||
1032 | * "key" was fetched from blkio_group under blkio_cgroup->lock. That means | ||
1033 | * it should not be NULL, as even if the queue was going away, the cgroup deletion | ||
1034 | * path got to it first. | ||
1035 | */ | ||
1036 | void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg) | ||
1037 | { | ||
1038 | unsigned long flags; | ||
1039 | struct throtl_data *td = key; | ||
1040 | |||
1041 | spin_lock_irqsave(td->queue->queue_lock, flags); | ||
1042 | throtl_destroy_tg(td, tg_of_blkg(blkg)); | ||
1043 | spin_unlock_irqrestore(td->queue->queue_lock, flags); | ||
1044 | } | ||
1045 | |||
1046 | static void throtl_update_blkio_group_common(struct throtl_data *td, | ||
1047 | struct throtl_grp *tg) | ||
1048 | { | ||
1049 | xchg(&tg->limits_changed, true); | ||
1050 | xchg(&td->limits_changed, true); | ||
1051 | /* Schedule a work now to process the limit change */ | ||
1052 | throtl_schedule_delayed_work(td, 0); | ||
1053 | } | ||
1054 | |||
1055 | /* | ||
1056 | * For all update functions, key should be a valid pointer because these | ||
1057 | * update functions are called under blkcg_lock; that means blkg is | ||
1058 | * valid and in turn key is valid. The queue exit path cannot race because | ||
1059 | * of blkcg_lock. | ||
1060 | * | ||
1061 | * We cannot take the queue lock in update functions, as taking the queue lock under blkcg_lock | ||
1062 | * is not allowed. Under other paths we take blkcg_lock under queue_lock. | ||
1063 | */ | ||
1064 | static void throtl_update_blkio_group_read_bps(void *key, | ||
1065 | struct blkio_group *blkg, u64 read_bps) | ||
1066 | { | ||
1067 | struct throtl_data *td = key; | ||
1068 | struct throtl_grp *tg = tg_of_blkg(blkg); | ||
1069 | |||
1070 | tg->bps[READ] = read_bps; | ||
1071 | throtl_update_blkio_group_common(td, tg); | ||
1072 | } | ||
1073 | |||
1074 | static void throtl_update_blkio_group_write_bps(void *key, | ||
1075 | struct blkio_group *blkg, u64 write_bps) | ||
1076 | { | ||
1077 | struct throtl_data *td = key; | ||
1078 | struct throtl_grp *tg = tg_of_blkg(blkg); | ||
1079 | |||
1080 | tg->bps[WRITE] = write_bps; | ||
1081 | throtl_update_blkio_group_common(td, tg); | ||
1082 | } | ||
1083 | |||
1084 | static void throtl_update_blkio_group_read_iops(void *key, | ||
1085 | struct blkio_group *blkg, unsigned int read_iops) | ||
1086 | { | ||
1087 | struct throtl_data *td = key; | ||
1088 | struct throtl_grp *tg = tg_of_blkg(blkg); | ||
1089 | |||
1090 | tg->iops[READ] = read_iops; | ||
1091 | throtl_update_blkio_group_common(td, tg); | ||
1092 | } | ||
1093 | |||
1094 | static void throtl_update_blkio_group_write_iops(void *key, | ||
1095 | struct blkio_group *blkg, unsigned int write_iops) | ||
1096 | { | ||
1097 | struct throtl_data *td = key; | ||
1098 | struct throtl_grp *tg = tg_of_blkg(blkg); | ||
1099 | |||
1100 | tg->iops[WRITE] = write_iops; | ||
1101 | throtl_update_blkio_group_common(td, tg); | ||
1102 | } | ||
1103 | |||
1104 | static void throtl_shutdown_wq(struct request_queue *q) | ||
1105 | { | ||
1106 | struct throtl_data *td = q->td; | ||
1107 | |||
1108 | cancel_delayed_work_sync(&td->throtl_work); | ||
1109 | } | ||
1110 | |||
1111 | static struct blkio_policy_type blkio_policy_throtl = { | ||
1112 | .ops = { | ||
1113 | .blkio_unlink_group_fn = throtl_unlink_blkio_group, | ||
1114 | .blkio_update_group_read_bps_fn = | ||
1115 | throtl_update_blkio_group_read_bps, | ||
1116 | .blkio_update_group_write_bps_fn = | ||
1117 | throtl_update_blkio_group_write_bps, | ||
1118 | .blkio_update_group_read_iops_fn = | ||
1119 | throtl_update_blkio_group_read_iops, | ||
1120 | .blkio_update_group_write_iops_fn = | ||
1121 | throtl_update_blkio_group_write_iops, | ||
1122 | }, | ||
1123 | .plid = BLKIO_POLICY_THROTL, | ||
1124 | }; | ||
1125 | |||
1126 | int blk_throtl_bio(struct request_queue *q, struct bio **biop) | ||
1127 | { | ||
1128 | struct throtl_data *td = q->td; | ||
1129 | struct throtl_grp *tg; | ||
1130 | struct bio *bio = *biop; | ||
1131 | bool rw = bio_data_dir(bio), update_disptime = true; | ||
1132 | struct blkio_cgroup *blkcg; | ||
1133 | |||
1134 | if (bio->bi_rw & REQ_THROTTLED) { | ||
1135 | bio->bi_rw &= ~REQ_THROTTLED; | ||
1136 | return 0; | ||
1137 | } | ||
1138 | |||
1139 | /* | ||
1140 | * A throtl_grp pointer retrieved under rcu can be used to access | ||
1141 | * basic fields like stats and io rates. If a group has no rules, | ||
1142 | * just update the dispatch stats in a lockless manner and return. | ||
1143 | */ | ||
1144 | |||
1145 | rcu_read_lock(); | ||
1146 | blkcg = task_blkio_cgroup(current); | ||
1147 | tg = throtl_find_tg(td, blkcg); | ||
1148 | if (tg) { | ||
1149 | throtl_tg_fill_dev_details(td, tg); | ||
1150 | |||
1151 | if (tg_no_rule_group(tg, rw)) { | ||
1152 | blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, | ||
1153 | rw, bio->bi_rw & REQ_SYNC); | ||
1154 | rcu_read_unlock(); | ||
1155 | return 0; | ||
1156 | } | ||
1157 | } | ||
1158 | rcu_read_unlock(); | ||
1159 | |||
1160 | /* | ||
1161 | * Either group has not been allocated yet or it is not an unlimited | ||
1162 | * IO group | ||
1163 | */ | ||
1164 | |||
1165 | spin_lock_irq(q->queue_lock); | ||
1166 | tg = throtl_get_tg(td); | ||
1167 | |||
1168 | if (IS_ERR(tg)) { | ||
1169 | if (PTR_ERR(tg) == -ENODEV) { | ||
1170 | /* | ||
1171 | * Queue is gone. No queue lock held here. | ||
1172 | */ | ||
1173 | return -ENODEV; | ||
1174 | } | ||
1175 | } | ||
1176 | |||
1177 | if (tg->nr_queued[rw]) { | ||
1178 | /* | ||
1179 | * There is already another bio queued in same dir. No | ||
1180 | * need to update dispatch time. | ||
1181 | */ | ||
1182 | update_disptime = false; | ||
1183 | goto queue_bio; | ||
1184 | |||
1185 | } | ||
1186 | |||
1187 | /* Bio is within the rate limit of the group */ | ||
1188 | if (tg_may_dispatch(td, tg, bio, NULL)) { | ||
1189 | throtl_charge_bio(tg, bio); | ||
1190 | |||
1191 | /* | ||
1192 | * We need to trim slice even when bios are not being queued | ||
1193 | * otherwise it might happen that a bio is not queued for | ||
1194 | * a long time and slice keeps on extending and trim is not | ||
1195 | * called for a long time. Now if limits are reduced suddenly | ||
1196 | * we take into account all the IO dispatched so far at the new | ||
1197 | * low rate and newly queued IO gets a really long dispatch | ||
1198 | * time. | ||
1199 | * | ||
1200 | * So keep on trimming slice even if bio is not queued. | ||
1201 | */ | ||
1202 | throtl_trim_slice(td, tg, rw); | ||
1203 | goto out; | ||
1204 | } | ||
1205 | |||
1206 | queue_bio: | ||
1207 | throtl_log_tg(td, tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu" | ||
1208 | " iodisp=%u iops=%u queued=%d/%d", | ||
1209 | rw == READ ? 'R' : 'W', | ||
1210 | tg->bytes_disp[rw], bio->bi_size, tg->bps[rw], | ||
1211 | tg->io_disp[rw], tg->iops[rw], | ||
1212 | tg->nr_queued[READ], tg->nr_queued[WRITE]); | ||
1213 | |||
1214 | throtl_add_bio_tg(q->td, tg, bio); | ||
1215 | *biop = NULL; | ||
1216 | |||
1217 | if (update_disptime) { | ||
1218 | tg_update_disptime(td, tg); | ||
1219 | throtl_schedule_next_dispatch(td); | ||
1220 | } | ||
1221 | |||
1222 | out: | ||
1223 | spin_unlock_irq(q->queue_lock); | ||
1224 | return 0; | ||
1225 | } | ||
1226 | |||
1227 | int blk_throtl_init(struct request_queue *q) | ||
1228 | { | ||
1229 | struct throtl_data *td; | ||
1230 | struct throtl_grp *tg; | ||
1231 | |||
1232 | td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); | ||
1233 | if (!td) | ||
1234 | return -ENOMEM; | ||
1235 | |||
1236 | INIT_HLIST_HEAD(&td->tg_list); | ||
1237 | td->tg_service_tree = THROTL_RB_ROOT; | ||
1238 | td->limits_changed = false; | ||
1239 | INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); | ||
1240 | |||
1241 | /* Alloc and init root group. */ | ||
1242 | td->queue = q; | ||
1243 | tg = throtl_alloc_tg(td); | ||
1244 | |||
1245 | if (!tg) { | ||
1246 | kfree(td); | ||
1247 | return -ENOMEM; | ||
1248 | } | ||
1249 | |||
1250 | td->root_tg = tg; | ||
1251 | |||
1252 | rcu_read_lock(); | ||
1253 | throtl_init_add_tg_lists(td, tg, &blkio_root_cgroup); | ||
1254 | rcu_read_unlock(); | ||
1255 | |||
1256 | /* Attach throtl data to request queue */ | ||
1257 | q->td = td; | ||
1258 | return 0; | ||
1259 | } | ||
1260 | |||
1261 | void blk_throtl_exit(struct request_queue *q) | ||
1262 | { | ||
1263 | struct throtl_data *td = q->td; | ||
1264 | bool wait = false; | ||
1265 | |||
1266 | BUG_ON(!td); | ||
1267 | |||
1268 | throtl_shutdown_wq(q); | ||
1269 | |||
1270 | spin_lock_irq(q->queue_lock); | ||
1271 | throtl_release_tgs(td); | ||
1272 | |||
1273 | /* If there are other groups */ | ||
1274 | if (td->nr_undestroyed_grps > 0) | ||
1275 | wait = true; | ||
1276 | |||
1277 | spin_unlock_irq(q->queue_lock); | ||
1278 | |||
1279 | /* | ||
1280 | * Wait for tg->blkg->key accessors to exit their grace periods. | ||
1281 | * Do this wait only if there are other undestroyed groups out | ||
1282 | * there (other than root group). This can happen if cgroup deletion | ||
1283 | * path claimed the responsibility of cleaning up a group before | ||
1284 | * the queue cleanup code got to the group. | ||
1285 | * | ||
1286 | * Do not call synchronize_rcu() unconditionally as there are drivers | ||
1287 | * which create/delete request queue hundreds of times during scan/boot | ||
1288 | * and synchronize_rcu() can take significant time and slow down boot. | ||
1289 | */ | ||
1290 | if (wait) | ||
1291 | synchronize_rcu(); | ||
1292 | |||
1293 | /* | ||
1294 | * Just to be safe: if, after the previous flush, somebody did | ||
1295 | * update limits through the cgroup and another work item got queued, cancel | ||
1296 | * it. | ||
1297 | */ | ||
1298 | throtl_shutdown_wq(q); | ||
1299 | throtl_td_free(td); | ||
1300 | } | ||
1301 | |||
1302 | static int __init throtl_init(void) | ||
1303 | { | ||
1304 | kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0); | ||
1305 | if (!kthrotld_workqueue) | ||
1306 | panic("Failed to create kthrotld\n"); | ||
1307 | |||
1308 | blkio_policy_register(&blkio_policy_throtl); | ||
1309 | return 0; | ||
1310 | } | ||
1311 | |||
1312 | module_init(throtl_init); | ||
diff --git a/block/blk.h b/block/blk.h index d6b911ac002c..d6586287adc9 100644 --- a/block/blk.h +++ b/block/blk.h | |||
@@ -18,8 +18,6 @@ int blk_rq_append_bio(struct request_queue *q, struct request *rq, | |||
18 | void blk_dequeue_request(struct request *rq); | 18 | void blk_dequeue_request(struct request *rq); |
19 | void __blk_queue_free_tags(struct request_queue *q); | 19 | void __blk_queue_free_tags(struct request_queue *q); |
20 | 20 | ||
21 | void blk_unplug_work(struct work_struct *work); | ||
22 | void blk_unplug_timeout(unsigned long data); | ||
23 | void blk_rq_timed_out_timer(unsigned long data); | 21 | void blk_rq_timed_out_timer(unsigned long data); |
24 | void blk_delete_timer(struct request *); | 22 | void blk_delete_timer(struct request *); |
25 | void blk_add_timer(struct request *); | 23 | void blk_add_timer(struct request *); |
@@ -34,7 +32,7 @@ enum rq_atomic_flags { | |||
34 | 32 | ||
35 | /* | 33 | /* |
36 | * EH timer and IO completion will both attempt to 'grab' the request, make | 34 | * EH timer and IO completion will both attempt to 'grab' the request, make |
37 | * sure that only one of them suceeds | 35 | * sure that only one of them succeeds |
38 | */ | 36 | */ |
39 | static inline int blk_mark_rq_complete(struct request *rq) | 37 | static inline int blk_mark_rq_complete(struct request *rq) |
40 | { | 38 | { |
@@ -51,18 +49,41 @@ static inline void blk_clear_rq_complete(struct request *rq) | |||
51 | */ | 49 | */ |
52 | #define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) | 50 | #define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) |
53 | 51 | ||
52 | void blk_insert_flush(struct request *rq); | ||
53 | void blk_abort_flushes(struct request_queue *q); | ||
54 | |||
54 | static inline struct request *__elv_next_request(struct request_queue *q) | 55 | static inline struct request *__elv_next_request(struct request_queue *q) |
55 | { | 56 | { |
56 | struct request *rq; | 57 | struct request *rq; |
57 | 58 | ||
58 | while (1) { | 59 | while (1) { |
59 | while (!list_empty(&q->queue_head)) { | 60 | if (!list_empty(&q->queue_head)) { |
60 | rq = list_entry_rq(q->queue_head.next); | 61 | rq = list_entry_rq(q->queue_head.next); |
61 | if (blk_do_ordered(q, &rq)) | 62 | return rq; |
62 | return rq; | ||
63 | } | 63 | } |
64 | 64 | ||
65 | if (!q->elevator->ops->elevator_dispatch_fn(q, 0)) | 65 | /* |
66 | * If a flush request is running and flush requests aren't queueable | ||
67 | * in the drive, we can hold the queue until the flush request is | ||
68 | * finished. Even if we don't do this, the driver can't dispatch the next | ||
69 | * requests and will requeue them. And this can improve | ||
70 | * throughput too. For example, say we have requests flush1, write1, | ||
71 | * flush2. flush1 is dispatched, then the queue is held and write1 | ||
72 | * isn't inserted into the queue. After flush1 is finished, flush2 | ||
73 | * will be dispatched. Since the disk cache is already clean, | ||
74 | * flush2 will finish very soon, so it looks like flush2 is | ||
75 | * folded into flush1. | ||
76 | * Since the queue is held, a flag is set to indicate the queue | ||
77 | * should be restarted later. Please see flush_end_io() for | ||
78 | * details. | ||
79 | */ | ||
80 | if (q->flush_pending_idx != q->flush_running_idx && | ||
81 | !queue_flush_queueable(q)) { | ||
82 | q->flush_queue_delayed = 1; | ||
83 | return NULL; | ||
84 | } | ||
85 | if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags) || | ||
86 | !q->elevator->ops->elevator_dispatch_fn(q, 0)) | ||
66 | return NULL; | 87 | return NULL; |
67 | } | 88 | } |
68 | } | 89 | } |
@@ -103,6 +124,8 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req, | |||
103 | struct bio *bio); | 124 | struct bio *bio); |
104 | int attempt_back_merge(struct request_queue *q, struct request *rq); | 125 | int attempt_back_merge(struct request_queue *q, struct request *rq); |
105 | int attempt_front_merge(struct request_queue *q, struct request *rq); | 126 | int attempt_front_merge(struct request_queue *q, struct request *rq); |
127 | int blk_attempt_req_merge(struct request_queue *q, struct request *rq, | ||
128 | struct request *next); | ||
106 | void blk_recalc_rq_segments(struct request *rq); | 129 | void blk_recalc_rq_segments(struct request *rq); |
107 | void blk_rq_set_mixed_merge(struct request *rq); | 130 | void blk_rq_set_mixed_merge(struct request *rq); |
108 | 131 | ||
@@ -132,14 +155,6 @@ static inline int queue_congestion_off_threshold(struct request_queue *q) | |||
132 | return q->nr_congestion_off; | 155 | return q->nr_congestion_off; |
133 | } | 156 | } |
134 | 157 | ||
135 | #if defined(CONFIG_BLK_DEV_INTEGRITY) | ||
136 | |||
137 | #define rq_for_each_integrity_segment(bvl, _rq, _iter) \ | ||
138 | __rq_for_each_bio(_iter.bio, _rq) \ | ||
139 | bip_for_each_vec(bvl, _iter.bio->bi_integrity, _iter.i) | ||
140 | |||
141 | #endif /* BLK_DEV_INTEGRITY */ | ||
142 | |||
143 | static inline int blk_cpu_to_group(int cpu) | 158 | static inline int blk_cpu_to_group(int cpu) |
144 | { | 159 | { |
145 | int group = NR_CPUS; | 160 | int group = NR_CPUS; |
diff --git a/block/bsg.c b/block/bsg.c index 0c00870553a3..0c8b64a16484 100644 --- a/block/bsg.c +++ b/block/bsg.c | |||
@@ -20,7 +20,6 @@ | |||
20 | #include <linux/uio.h> | 20 | #include <linux/uio.h> |
21 | #include <linux/idr.h> | 21 | #include <linux/idr.h> |
22 | #include <linux/bsg.h> | 22 | #include <linux/bsg.h> |
23 | #include <linux/smp_lock.h> | ||
24 | #include <linux/slab.h> | 23 | #include <linux/slab.h> |
25 | 24 | ||
26 | #include <scsi/scsi.h> | 25 | #include <scsi/scsi.h> |
@@ -251,6 +250,14 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm, | |||
251 | int ret, rw; | 250 | int ret, rw; |
252 | unsigned int dxfer_len; | 251 | unsigned int dxfer_len; |
253 | void *dxferp = NULL; | 252 | void *dxferp = NULL; |
253 | struct bsg_class_device *bcd = &q->bsg_dev; | ||
254 | |||
255 | /* if the LLD has been removed then the bsg_unregister_queue will | ||
256 | * eventually be called and the class_dev was freed, so we can no | ||
257 | * longer use this request_queue. Return no such address. | ||
258 | */ | ||
259 | if (!bcd->class_dev) | ||
260 | return ERR_PTR(-ENXIO); | ||
254 | 261 | ||
255 | dprintk("map hdr %llx/%u %llx/%u\n", (unsigned long long) hdr->dout_xferp, | 262 | dprintk("map hdr %llx/%u %llx/%u\n", (unsigned long long) hdr->dout_xferp, |
256 | hdr->dout_xfer_len, (unsigned long long) hdr->din_xferp, | 263 | hdr->dout_xfer_len, (unsigned long long) hdr->din_xferp, |
@@ -843,9 +850,7 @@ static int bsg_open(struct inode *inode, struct file *file) | |||
843 | { | 850 | { |
844 | struct bsg_device *bd; | 851 | struct bsg_device *bd; |
845 | 852 | ||
846 | lock_kernel(); | ||
847 | bd = bsg_get_device(inode, file); | 853 | bd = bsg_get_device(inode, file); |
848 | unlock_kernel(); | ||
849 | 854 | ||
850 | if (IS_ERR(bd)) | 855 | if (IS_ERR(bd)) |
851 | return PTR_ERR(bd); | 856 | return PTR_ERR(bd); |
@@ -968,6 +973,7 @@ static const struct file_operations bsg_fops = { | |||
968 | .release = bsg_release, | 973 | .release = bsg_release, |
969 | .unlocked_ioctl = bsg_ioctl, | 974 | .unlocked_ioctl = bsg_ioctl, |
970 | .owner = THIS_MODULE, | 975 | .owner = THIS_MODULE, |
976 | .llseek = default_llseek, | ||
971 | }; | 977 | }; |
972 | 978 | ||
973 | void bsg_unregister_queue(struct request_queue *q) | 979 | void bsg_unregister_queue(struct request_queue *q) |
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 9eba291eb6fd..ae21919f15e1 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c | |||
@@ -54,9 +54,9 @@ static const int cfq_hist_divisor = 4; | |||
54 | #define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) | 54 | #define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) |
55 | 55 | ||
56 | #define RQ_CIC(rq) \ | 56 | #define RQ_CIC(rq) \ |
57 | ((struct cfq_io_context *) (rq)->elevator_private) | 57 | ((struct cfq_io_context *) (rq)->elevator_private[0]) |
58 | #define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) | 58 | #define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private[1]) |
59 | #define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private3) | 59 | #define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private[2]) |
60 | 60 | ||
61 | static struct kmem_cache *cfq_pool; | 61 | static struct kmem_cache *cfq_pool; |
62 | static struct kmem_cache *cfq_ioc_pool; | 62 | static struct kmem_cache *cfq_ioc_pool; |
@@ -87,7 +87,6 @@ struct cfq_rb_root { | |||
87 | unsigned count; | 87 | unsigned count; |
88 | unsigned total_weight; | 88 | unsigned total_weight; |
89 | u64 min_vdisktime; | 89 | u64 min_vdisktime; |
90 | struct rb_node *active; | ||
91 | }; | 90 | }; |
92 | #define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \ | 91 | #define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \ |
93 | .count = 0, .min_vdisktime = 0, } | 92 | .count = 0, .min_vdisktime = 0, } |
@@ -97,7 +96,7 @@ struct cfq_rb_root { | |||
97 | */ | 96 | */ |
98 | struct cfq_queue { | 97 | struct cfq_queue { |
99 | /* reference count */ | 98 | /* reference count */ |
100 | atomic_t ref; | 99 | int ref; |
101 | /* various state flags, see below */ | 100 | /* various state flags, see below */ |
102 | unsigned int flags; | 101 | unsigned int flags; |
103 | /* parent cfq_data */ | 102 | /* parent cfq_data */ |
@@ -147,7 +146,6 @@ struct cfq_queue { | |||
147 | struct cfq_rb_root *service_tree; | 146 | struct cfq_rb_root *service_tree; |
148 | struct cfq_queue *new_cfqq; | 147 | struct cfq_queue *new_cfqq; |
149 | struct cfq_group *cfqg; | 148 | struct cfq_group *cfqg; |
150 | struct cfq_group *orig_cfqg; | ||
151 | /* Number of sectors dispatched from queue in single dispatch round */ | 149 | /* Number of sectors dispatched from queue in single dispatch round */ |
152 | unsigned long nr_sectors; | 150 | unsigned long nr_sectors; |
153 | }; | 151 | }; |
@@ -160,6 +158,7 @@ enum wl_prio_t { | |||
160 | BE_WORKLOAD = 0, | 158 | BE_WORKLOAD = 0, |
161 | RT_WORKLOAD = 1, | 159 | RT_WORKLOAD = 1, |
162 | IDLE_WORKLOAD = 2, | 160 | IDLE_WORKLOAD = 2, |
161 | CFQ_PRIO_NR, | ||
163 | }; | 162 | }; |
164 | 163 | ||
165 | /* | 164 | /* |
@@ -179,15 +178,25 @@ struct cfq_group { | |||
179 | /* group service_tree key */ | 178 | /* group service_tree key */ |
180 | u64 vdisktime; | 179 | u64 vdisktime; |
181 | unsigned int weight; | 180 | unsigned int weight; |
182 | bool on_st; | 181 | unsigned int new_weight; |
182 | bool needs_update; | ||
183 | 183 | ||
184 | /* number of cfqq currently on this group */ | 184 | /* number of cfqq currently on this group */ |
185 | int nr_cfqq; | 185 | int nr_cfqq; |
186 | 186 | ||
187 | /* Per group busy queus average. Useful for workload slice calc. */ | ||
188 | unsigned int busy_queues_avg[2]; | ||
189 | /* | 187 | /* |
190 | * rr lists of queues with requests, onle rr for each priority class. | 188 | * Per group busy queues average. Useful for workload slice calc. We |
189 | * create the array for each prio class but at run time it is used | ||
190 | * only for RT and BE class and slot for IDLE class remains unused. | ||
191 | * This is primarily done to avoid confusion and a gcc warning. | ||
192 | */ | ||
193 | unsigned int busy_queues_avg[CFQ_PRIO_NR]; | ||
194 | /* | ||
195 | * rr lists of queues with requests. We maintain service trees for | ||
196 | * RT and BE classes. These trees are subdivided in subclasses | ||
197 | * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE | ||
198 | * class there is no subclassification and all the cfq queues go on | ||
199 | * a single tree service_tree_idle. | ||
191 | * Counts are embedded in the cfq_rb_root | 200 | * Counts are embedded in the cfq_rb_root |
192 | */ | 201 | */ |
193 | struct cfq_rb_root service_trees[2][3]; | 202 | struct cfq_rb_root service_trees[2][3]; |
@@ -199,7 +208,7 @@ struct cfq_group { | |||
199 | struct blkio_group blkg; | 208 | struct blkio_group blkg; |
200 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | 209 | #ifdef CONFIG_CFQ_GROUP_IOSCHED |
201 | struct hlist_node cfqd_node; | 210 | struct hlist_node cfqd_node; |
202 | atomic_t ref; | 211 | int ref; |
203 | #endif | 212 | #endif |
204 | /* number of requests that are on the dispatch list or inside driver */ | 213 | /* number of requests that are on the dispatch list or inside driver */ |
205 | int dispatched; | 214 | int dispatched; |
@@ -221,7 +230,6 @@ struct cfq_data { | |||
221 | enum wl_type_t serving_type; | 230 | enum wl_type_t serving_type; |
222 | unsigned long workload_expires; | 231 | unsigned long workload_expires; |
223 | struct cfq_group *serving_group; | 232 | struct cfq_group *serving_group; |
224 | bool noidle_tree_requires_idle; | ||
225 | 233 | ||
226 | /* | 234 | /* |
227 | * Each priority tree is sorted by next_request position. These | 235 | * Each priority tree is sorted by next_request position. These |
@@ -231,6 +239,7 @@ struct cfq_data { | |||
231 | struct rb_root prio_trees[CFQ_PRIO_LISTS]; | 239 | struct rb_root prio_trees[CFQ_PRIO_LISTS]; |
232 | 240 | ||
233 | unsigned int busy_queues; | 241 | unsigned int busy_queues; |
242 | unsigned int busy_sync_queues; | ||
234 | 243 | ||
235 | int rq_in_driver; | 244 | int rq_in_driver; |
236 | int rq_in_flight[2]; | 245 | int rq_in_flight[2]; |
@@ -278,7 +287,6 @@ struct cfq_data { | |||
278 | unsigned int cfq_slice_idle; | 287 | unsigned int cfq_slice_idle; |
279 | unsigned int cfq_group_idle; | 288 | unsigned int cfq_group_idle; |
280 | unsigned int cfq_latency; | 289 | unsigned int cfq_latency; |
281 | unsigned int cfq_group_isolation; | ||
282 | 290 | ||
283 | unsigned int cic_index; | 291 | unsigned int cic_index; |
284 | struct list_head cic_list; | 292 | struct list_head cic_list; |
@@ -292,7 +300,9 @@ struct cfq_data { | |||
292 | 300 | ||
293 | /* List of cfq groups being managed on this device*/ | 301 | /* List of cfq groups being managed on this device*/ |
294 | struct hlist_head cfqg_list; | 302 | struct hlist_head cfqg_list; |
295 | struct rcu_head rcu; | 303 | |
304 | /* Number of groups which are on blkcg->blkg_list */ | ||
305 | unsigned int nr_blkcg_linked_grps; | ||
296 | }; | 306 | }; |
297 | 307 | ||
298 | static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); | 308 | static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); |
@@ -359,16 +369,16 @@ CFQ_CFQQ_FNS(wait_busy); | |||
359 | #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ | 369 | #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ |
360 | blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ | 370 | blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ |
361 | cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ | 371 | cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ |
362 | blkg_path(&(cfqq)->cfqg->blkg), ##args); | 372 | blkg_path(&(cfqq)->cfqg->blkg), ##args) |
363 | 373 | ||
364 | #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \ | 374 | #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \ |
365 | blk_add_trace_msg((cfqd)->queue, "%s " fmt, \ | 375 | blk_add_trace_msg((cfqd)->queue, "%s " fmt, \ |
366 | blkg_path(&(cfqg)->blkg), ##args); \ | 376 | blkg_path(&(cfqg)->blkg), ##args) \ |
367 | 377 | ||
368 | #else | 378 | #else |
369 | #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ | 379 | #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ |
370 | blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) | 380 | blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) |
371 | #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0); | 381 | #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0) |
372 | #endif | 382 | #endif |
373 | #define cfq_log(cfqd, fmt, args...) \ | 383 | #define cfq_log(cfqd, fmt, args...) \ |
374 | blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) | 384 | blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) |
@@ -494,13 +504,6 @@ static inline void cfq_schedule_dispatch(struct cfq_data *cfqd) | |||
494 | } | 504 | } |
495 | } | 505 | } |
496 | 506 | ||
497 | static int cfq_queue_empty(struct request_queue *q) | ||
498 | { | ||
499 | struct cfq_data *cfqd = q->elevator->elevator_data; | ||
500 | |||
501 | return !cfqd->rq_queued; | ||
502 | } | ||
503 | |||
504 | /* | 507 | /* |
505 | * Scale schedule slice based on io priority. Use the sync time slice only | 508 | * Scale schedule slice based on io priority. Use the sync time slice only |
506 | * if a queue is marked sync and has sync io queued. A sync queue with async | 509 | * if a queue is marked sync and has sync io queued. A sync queue with async |
@@ -551,20 +554,13 @@ static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime) | |||
551 | 554 | ||
552 | static void update_min_vdisktime(struct cfq_rb_root *st) | 555 | static void update_min_vdisktime(struct cfq_rb_root *st) |
553 | { | 556 | { |
554 | u64 vdisktime = st->min_vdisktime; | ||
555 | struct cfq_group *cfqg; | 557 | struct cfq_group *cfqg; |
556 | 558 | ||
557 | if (st->active) { | ||
558 | cfqg = rb_entry_cfqg(st->active); | ||
559 | vdisktime = cfqg->vdisktime; | ||
560 | } | ||
561 | |||
562 | if (st->left) { | 559 | if (st->left) { |
563 | cfqg = rb_entry_cfqg(st->left); | 560 | cfqg = rb_entry_cfqg(st->left); |
564 | vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime); | 561 | st->min_vdisktime = max_vdisktime(st->min_vdisktime, |
562 | cfqg->vdisktime); | ||
565 | } | 563 | } |
566 | |||
567 | st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime); | ||
568 | } | 564 | } |
569 | 565 | ||
570 | /* | 566 | /* |
@@ -596,8 +592,8 @@ cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg) | |||
596 | return cfq_target_latency * cfqg->weight / st->total_weight; | 592 | return cfq_target_latency * cfqg->weight / st->total_weight; |
597 | } | 593 | } |
598 | 594 | ||
599 | static inline void | 595 | static inline unsigned |
600 | cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) | 596 | cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) |
601 | { | 597 | { |
602 | unsigned slice = cfq_prio_to_slice(cfqd, cfqq); | 598 | unsigned slice = cfq_prio_to_slice(cfqd, cfqq); |
603 | if (cfqd->cfq_latency) { | 599 | if (cfqd->cfq_latency) { |
@@ -623,6 +619,14 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) | |||
623 | low_slice); | 619 | low_slice); |
624 | } | 620 | } |
625 | } | 621 | } |
622 | return slice; | ||
623 | } | ||
624 | |||
625 | static inline void | ||
626 | cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) | ||
627 | { | ||
628 | unsigned slice = cfq_scaled_cfqq_slice(cfqd, cfqq); | ||
629 | |||
626 | cfqq->slice_start = jiffies; | 630 | cfqq->slice_start = jiffies; |
627 | cfqq->slice_end = jiffies + slice; | 631 | cfqq->slice_end = jiffies + slice; |
628 | cfqq->allocated_slice = slice; | 632 | cfqq->allocated_slice = slice; |
@@ -637,11 +641,11 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) | |||
637 | static inline bool cfq_slice_used(struct cfq_queue *cfqq) | 641 | static inline bool cfq_slice_used(struct cfq_queue *cfqq) |
638 | { | 642 | { |
639 | if (cfq_cfqq_slice_new(cfqq)) | 643 | if (cfq_cfqq_slice_new(cfqq)) |
640 | return 0; | 644 | return false; |
641 | if (time_before(jiffies, cfqq->slice_end)) | 645 | if (time_before(jiffies, cfqq->slice_end)) |
642 | return 0; | 646 | return false; |
643 | 647 | ||
644 | return 1; | 648 | return true; |
645 | } | 649 | } |
646 | 650 | ||
647 | /* | 651 | /* |
@@ -663,15 +667,11 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, | |||
663 | if (rq2 == NULL) | 667 | if (rq2 == NULL) |
664 | return rq1; | 668 | return rq1; |
665 | 669 | ||
666 | if (rq_is_sync(rq1) && !rq_is_sync(rq2)) | 670 | if (rq_is_sync(rq1) != rq_is_sync(rq2)) |
667 | return rq1; | 671 | return rq_is_sync(rq1) ? rq1 : rq2; |
668 | else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) | 672 | |
669 | return rq2; | 673 | if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_META) |
670 | if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) | 674 | return rq1->cmd_flags & REQ_META ? rq1 : rq2; |
671 | return rq1; | ||
672 | else if ((rq2->cmd_flags & REQ_META) && | ||
673 | !(rq1->cmd_flags & REQ_META)) | ||
674 | return rq2; | ||
675 | 675 | ||
676 | s1 = blk_rq_pos(rq1); | 676 | s1 = blk_rq_pos(rq1); |
677 | s2 = blk_rq_pos(rq2); | 677 | s2 = blk_rq_pos(rq2); |
@@ -853,20 +853,40 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) | |||
853 | } | 853 | } |
854 | 854 | ||
855 | static void | 855 | static void |
856 | cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) | 856 | cfq_update_group_weight(struct cfq_group *cfqg) |
857 | { | ||
858 | BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); | ||
859 | if (cfqg->needs_update) { | ||
860 | cfqg->weight = cfqg->new_weight; | ||
861 | cfqg->needs_update = false; | ||
862 | } | ||
863 | } | ||
864 | |||
865 | static void | ||
866 | cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) | ||
867 | { | ||
868 | BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); | ||
869 | |||
870 | cfq_update_group_weight(cfqg); | ||
871 | __cfq_group_service_tree_add(st, cfqg); | ||
872 | st->total_weight += cfqg->weight; | ||
873 | } | ||
874 | |||
875 | static void | ||
876 | cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg) | ||
857 | { | 877 | { |
858 | struct cfq_rb_root *st = &cfqd->grp_service_tree; | 878 | struct cfq_rb_root *st = &cfqd->grp_service_tree; |
859 | struct cfq_group *__cfqg; | 879 | struct cfq_group *__cfqg; |
860 | struct rb_node *n; | 880 | struct rb_node *n; |
861 | 881 | ||
862 | cfqg->nr_cfqq++; | 882 | cfqg->nr_cfqq++; |
863 | if (cfqg->on_st) | 883 | if (!RB_EMPTY_NODE(&cfqg->rb_node)) |
864 | return; | 884 | return; |
865 | 885 | ||
866 | /* | 886 | /* |
867 | * Currently put the group at the end. Later implement something | 887 | * Currently put the group at the end. Later implement something |
868 | * so that groups get lesser vtime based on their weights, so that | 888 | * so that groups get lesser vtime based on their weights, so that |
869 | * if group does not loose all if it was not continously backlogged. | 889 | * if group does not lose all if it was not continuously backlogged. |
870 | */ | 890 | */ |
871 | n = rb_last(&st->rb); | 891 | n = rb_last(&st->rb); |
872 | if (n) { | 892 | if (n) { |
@@ -874,20 +894,22 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) | |||
874 | cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY; | 894 | cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY; |
875 | } else | 895 | } else |
876 | cfqg->vdisktime = st->min_vdisktime; | 896 | cfqg->vdisktime = st->min_vdisktime; |
897 | cfq_group_service_tree_add(st, cfqg); | ||
898 | } | ||
877 | 899 | ||
878 | __cfq_group_service_tree_add(st, cfqg); | 900 | static void |
879 | cfqg->on_st = true; | 901 | cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg) |
880 | st->total_weight += cfqg->weight; | 902 | { |
903 | st->total_weight -= cfqg->weight; | ||
904 | if (!RB_EMPTY_NODE(&cfqg->rb_node)) | ||
905 | cfq_rb_erase(&cfqg->rb_node, st); | ||
881 | } | 906 | } |
882 | 907 | ||
883 | static void | 908 | static void |
884 | cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) | 909 | cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg) |
885 | { | 910 | { |
886 | struct cfq_rb_root *st = &cfqd->grp_service_tree; | 911 | struct cfq_rb_root *st = &cfqd->grp_service_tree; |
887 | 912 | ||
888 | if (st->active == &cfqg->rb_node) | ||
889 | st->active = NULL; | ||
890 | |||
891 | BUG_ON(cfqg->nr_cfqq < 1); | 913 | BUG_ON(cfqg->nr_cfqq < 1); |
892 | cfqg->nr_cfqq--; | 914 | cfqg->nr_cfqq--; |
893 | 915 | ||
@@ -896,15 +918,13 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) | |||
896 | return; | 918 | return; |
897 | 919 | ||
898 | cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); | 920 | cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); |
899 | cfqg->on_st = false; | 921 | cfq_group_service_tree_del(st, cfqg); |
900 | st->total_weight -= cfqg->weight; | ||
901 | if (!RB_EMPTY_NODE(&cfqg->rb_node)) | ||
902 | cfq_rb_erase(&cfqg->rb_node, st); | ||
903 | cfqg->saved_workload_slice = 0; | 922 | cfqg->saved_workload_slice = 0; |
904 | cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1); | 923 | cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1); |
905 | } | 924 | } |
906 | 925 | ||
907 | static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) | 926 | static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq, |
927 | unsigned int *unaccounted_time) | ||
908 | { | 928 | { |
909 | unsigned int slice_used; | 929 | unsigned int slice_used; |
910 | 930 | ||
@@ -923,8 +943,13 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) | |||
923 | 1); | 943 | 1); |
924 | } else { | 944 | } else { |
925 | slice_used = jiffies - cfqq->slice_start; | 945 | slice_used = jiffies - cfqq->slice_start; |
926 | if (slice_used > cfqq->allocated_slice) | 946 | if (slice_used > cfqq->allocated_slice) { |
947 | *unaccounted_time = slice_used - cfqq->allocated_slice; | ||
927 | slice_used = cfqq->allocated_slice; | 948 | slice_used = cfqq->allocated_slice; |
949 | } | ||
950 | if (time_after(cfqq->slice_start, cfqq->dispatch_start)) | ||
951 | *unaccounted_time += cfqq->slice_start - | ||
952 | cfqq->dispatch_start; | ||
928 | } | 953 | } |
929 | 954 | ||
930 | return slice_used; | 955 | return slice_used; |
@@ -934,12 +959,12 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, | |||
934 | struct cfq_queue *cfqq) | 959 | struct cfq_queue *cfqq) |
935 | { | 960 | { |
936 | struct cfq_rb_root *st = &cfqd->grp_service_tree; | 961 | struct cfq_rb_root *st = &cfqd->grp_service_tree; |
937 | unsigned int used_sl, charge; | 962 | unsigned int used_sl, charge, unaccounted_sl = 0; |
938 | int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) | 963 | int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) |
939 | - cfqg->service_tree_idle.count; | 964 | - cfqg->service_tree_idle.count; |
940 | 965 | ||
941 | BUG_ON(nr_sync < 0); | 966 | BUG_ON(nr_sync < 0); |
942 | used_sl = charge = cfq_cfqq_slice_usage(cfqq); | 967 | used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl); |
943 | 968 | ||
944 | if (iops_mode(cfqd)) | 969 | if (iops_mode(cfqd)) |
945 | charge = cfqq->slice_dispatch; | 970 | charge = cfqq->slice_dispatch; |
@@ -947,9 +972,10 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, | |||
947 | charge = cfqq->allocated_slice; | 972 | charge = cfqq->allocated_slice; |
948 | 973 | ||
949 | /* Can't update vdisktime while group is on service tree */ | 974 | /* Can't update vdisktime while group is on service tree */ |
950 | cfq_rb_erase(&cfqg->rb_node, st); | 975 | cfq_group_service_tree_del(st, cfqg); |
951 | cfqg->vdisktime += cfq_scale_slice(charge, cfqg); | 976 | cfqg->vdisktime += cfq_scale_slice(charge, cfqg); |
952 | __cfq_group_service_tree_add(st, cfqg); | 977 | /* If a new weight was requested, update now, off tree */ |
978 | cfq_group_service_tree_add(st, cfqg); | ||
953 | 979 | ||
954 | /* This group is being expired. Save the context */ | 980 | /* This group is being expired. Save the context */ |
955 | if (time_after(cfqd->workload_expires, jiffies)) { | 981 | if (time_after(cfqd->workload_expires, jiffies)) { |
@@ -962,10 +988,12 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, | |||
962 | 988 | ||
963 | cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, | 989 | cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, |
964 | st->min_vdisktime); | 990 | st->min_vdisktime); |
965 | cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u" | 991 | cfq_log_cfqq(cfqq->cfqd, cfqq, |
966 | " sect=%u", used_sl, cfqq->slice_dispatch, charge, | 992 | "sl_used=%u disp=%u charge=%u iops=%u sect=%lu", |
967 | iops_mode(cfqd), cfqq->nr_sectors); | 993 | used_sl, cfqq->slice_dispatch, charge, |
968 | cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl); | 994 | iops_mode(cfqd), cfqq->nr_sectors); |
995 | cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl, | ||
996 | unaccounted_sl); | ||
969 | cfq_blkiocg_set_start_empty_time(&cfqg->blkg); | 997 | cfq_blkiocg_set_start_empty_time(&cfqg->blkg); |
970 | } | 998 | } |
971 | 999 | ||
@@ -977,35 +1005,55 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) | |||
977 | return NULL; | 1005 | return NULL; |
978 | } | 1006 | } |
979 | 1007 | ||
980 | void | 1008 | void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, |
981 | cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight) | 1009 | unsigned int weight) |
982 | { | 1010 | { |
983 | cfqg_of_blkg(blkg)->weight = weight; | 1011 | struct cfq_group *cfqg = cfqg_of_blkg(blkg); |
1012 | cfqg->new_weight = weight; | ||
1013 | cfqg->needs_update = true; | ||
984 | } | 1014 | } |
985 | 1015 | ||
986 | static struct cfq_group * | 1016 | static void cfq_init_add_cfqg_lists(struct cfq_data *cfqd, |
987 | cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) | 1017 | struct cfq_group *cfqg, struct blkio_cgroup *blkcg) |
988 | { | 1018 | { |
989 | struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); | ||
990 | struct cfq_group *cfqg = NULL; | ||
991 | void *key = cfqd; | ||
992 | int i, j; | ||
993 | struct cfq_rb_root *st; | ||
994 | struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; | 1019 | struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; |
995 | unsigned int major, minor; | 1020 | unsigned int major, minor; |
996 | 1021 | ||
997 | cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); | 1022 | /* |
998 | if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { | 1023 | * Add group onto cgroup list. It might happen that bdi->dev is |
1024 | * not initialized yet. Initialize this new group without major | ||
1025 | * and minor info and this info will be filled in once a new thread | ||
1026 | * comes for IO. | ||
1027 | */ | ||
1028 | if (bdi->dev) { | ||
999 | sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); | 1029 | sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); |
1000 | cfqg->blkg.dev = MKDEV(major, minor); | 1030 | cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, |
1001 | goto done; | 1031 | (void *)cfqd, MKDEV(major, minor)); |
1002 | } | 1032 | } else |
1003 | if (cfqg || !create) | 1033 | cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, |
1004 | goto done; | 1034 | (void *)cfqd, 0); |
1035 | |||
1036 | cfqd->nr_blkcg_linked_grps++; | ||
1037 | cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); | ||
1038 | |||
1039 | /* Add group on cfqd list */ | ||
1040 | hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); | ||
1041 | } | ||
1042 | |||
1043 | /* | ||
1044 | * Should be called from sleepable context. No request queue lock as per | ||
1045 | * cpu stats are allocated dynamically and alloc_percpu needs to be called | ||
1046 | * from sleepable context. | ||
1047 | */ | ||
1048 | static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd) | ||
1049 | { | ||
1050 | struct cfq_group *cfqg = NULL; | ||
1051 | int i, j, ret; | ||
1052 | struct cfq_rb_root *st; | ||
1005 | 1053 | ||
1006 | cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); | 1054 | cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); |
1007 | if (!cfqg) | 1055 | if (!cfqg) |
1008 | goto done; | 1056 | return NULL; |
1009 | 1057 | ||
1010 | for_each_cfqg_st(cfqg, i, j, st) | 1058 | for_each_cfqg_st(cfqg, i, j, st) |
1011 | *st = CFQ_RB_ROOT; | 1059 | *st = CFQ_RB_ROOT; |
@@ -1017,52 +1065,103 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) | |||
1017 | * elevator which will be dropped by either elevator exit | 1065 | * elevator which will be dropped by either elevator exit |
1018 | * or cgroup deletion path depending on who is exiting first. | 1066 | * or cgroup deletion path depending on who is exiting first. |
1019 | */ | 1067 | */ |
1020 | atomic_set(&cfqg->ref, 1); | 1068 | cfqg->ref = 1; |
1069 | |||
1070 | ret = blkio_alloc_blkg_stats(&cfqg->blkg); | ||
1071 | if (ret) { | ||
1072 | kfree(cfqg); | ||
1073 | return NULL; | ||
1074 | } | ||
1075 | |||
1076 | return cfqg; | ||
1077 | } | ||
1078 | |||
1079 | static struct cfq_group * | ||
1080 | cfq_find_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg) | ||
1081 | { | ||
1082 | struct cfq_group *cfqg = NULL; | ||
1083 | void *key = cfqd; | ||
1084 | struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; | ||
1085 | unsigned int major, minor; | ||
1021 | 1086 | ||
1022 | /* | 1087 | /* |
1023 | * Add group onto cgroup list. It might happen that bdi->dev is | 1088 | * This is the common case when there are no blkio cgroups. |
1024 | * not initiliazed yet. Initialize this new group without major | 1089 | * Avoid lookup in this case |
1025 | * and minor info and this info will be filled in once a new thread | ||
1026 | * comes for IO. See code above. | ||
1027 | */ | 1090 | */ |
1028 | if (bdi->dev) { | 1091 | if (blkcg == &blkio_root_cgroup) |
1029 | sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); | 1092 | cfqg = &cfqd->root_group; |
1030 | cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, | 1093 | else |
1031 | MKDEV(major, minor)); | 1094 | cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); |
1032 | } else | ||
1033 | cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, | ||
1034 | 0); | ||
1035 | |||
1036 | cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); | ||
1037 | 1095 | ||
1038 | /* Add group on cfqd list */ | 1096 | if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { |
1039 | hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); | 1097 | sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); |
1098 | cfqg->blkg.dev = MKDEV(major, minor); | ||
1099 | } | ||
1040 | 1100 | ||
1041 | done: | ||
1042 | return cfqg; | 1101 | return cfqg; |
1043 | } | 1102 | } |
1044 | 1103 | ||
1045 | /* | 1104 | /* |
1046 | * Search for the cfq group current task belongs to. If create = 1, then also | 1105 | * Search for the cfq group current task belongs to. request_queue lock must |
1047 | * create the cfq group if it does not exist. request_queue lock must be held. | 1106 | * be held. |
1048 | */ | 1107 | */ |
1049 | static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) | 1108 | static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd) |
1050 | { | 1109 | { |
1051 | struct cgroup *cgroup; | 1110 | struct blkio_cgroup *blkcg; |
1052 | struct cfq_group *cfqg = NULL; | 1111 | struct cfq_group *cfqg = NULL, *__cfqg = NULL; |
1112 | struct request_queue *q = cfqd->queue; | ||
1113 | |||
1114 | rcu_read_lock(); | ||
1115 | blkcg = task_blkio_cgroup(current); | ||
1116 | cfqg = cfq_find_cfqg(cfqd, blkcg); | ||
1117 | if (cfqg) { | ||
1118 | rcu_read_unlock(); | ||
1119 | return cfqg; | ||
1120 | } | ||
1121 | |||
1122 | /* | ||
1123 | * Need to allocate a group. Allocation of group also needs allocation | ||
1124 | * of per cpu stats, which in turn takes a mutex and can block. Hence | ||
1125 | * we need to drop rcu lock and queue_lock before we call alloc. | ||
1126 | * | ||
1127 | * Not taking any queue reference here and assuming that queue is | ||
1128 | * around by the time we return. CFQ queue allocation code does | ||
1129 | * the same. It might be racy though. | ||
1130 | */ | ||
1131 | |||
1132 | rcu_read_unlock(); | ||
1133 | spin_unlock_irq(q->queue_lock); | ||
1134 | |||
1135 | cfqg = cfq_alloc_cfqg(cfqd); | ||
1136 | |||
1137 | spin_lock_irq(q->queue_lock); | ||
1053 | 1138 | ||
1054 | rcu_read_lock(); | 1139 | rcu_read_lock(); |
1055 | cgroup = task_cgroup(current, blkio_subsys_id); | 1140 | blkcg = task_blkio_cgroup(current); |
1056 | cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create); | 1141 | |
1057 | if (!cfqg && create) | 1142 | /* |
1143 | * If some other thread already allocated the group while we were | ||
1144 | * not holding queue lock, free up the group | ||
1145 | */ | ||
1146 | __cfqg = cfq_find_cfqg(cfqd, blkcg); | ||
1147 | |||
1148 | if (__cfqg) { | ||
1149 | kfree(cfqg); | ||
1150 | rcu_read_unlock(); | ||
1151 | return __cfqg; | ||
1152 | } | ||
1153 | |||
1154 | if (!cfqg) | ||
1058 | cfqg = &cfqd->root_group; | 1155 | cfqg = &cfqd->root_group; |
1156 | |||
1157 | cfq_init_add_cfqg_lists(cfqd, cfqg, blkcg); | ||
1059 | rcu_read_unlock(); | 1158 | rcu_read_unlock(); |
1060 | return cfqg; | 1159 | return cfqg; |
1061 | } | 1160 | } |
1062 | 1161 | ||
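The rework of cfq_get_cfqg() above follows a classic pattern: the per-cpu stats allocation can block, so the code drops the RCU read lock and queue_lock, allocates, re-takes the locks, and re-checks whether another thread created the group in the meantime, freeing its own copy if it lost the race. A minimal userspace sketch of that lookup/allocate/re-check shape, with a pthread mutex standing in for queue_lock and a flat array standing in for the blkcg group lookup (names are illustrative, not the kernel API):

```c
#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct group { char key[32]; };

static struct group *groups[64];
static int ngroups;
static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;

/* Caller must hold queue_lock. */
static struct group *find_group(const char *key)
{
	for (int i = 0; i < ngroups; i++)
		if (!strcmp(groups[i]->key, key))
			return groups[i];
	return NULL;
}

static struct group *get_group(const char *key)
{
	struct group *g, *winner;

	pthread_mutex_lock(&queue_lock);
	g = find_group(key);
	pthread_mutex_unlock(&queue_lock);
	if (g)
		return g;

	/* The allocation may block, so it happens with no locks held. */
	g = calloc(1, sizeof(*g));
	if (g)
		strncpy(g->key, key, sizeof(g->key) - 1);

	pthread_mutex_lock(&queue_lock);
	/* Someone else may have created the group while we were unlocked. */
	winner = find_group(key);
	if (!winner && g && ngroups < 64) {
		groups[ngroups++] = g;		/* we won: publish our group */
		winner = g;
	} else {
		free(g);			/* lost the race (or failed): discard */
	}
	pthread_mutex_unlock(&queue_lock);
	return winner;
}
```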
1063 | static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) | 1162 | static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) |
1064 | { | 1163 | { |
1065 | atomic_inc(&cfqg->ref); | 1164 | cfqg->ref++; |
1066 | return cfqg; | 1165 | return cfqg; |
1067 | } | 1166 | } |
1068 | 1167 | ||
@@ -1074,7 +1173,7 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) | |||
1074 | 1173 | ||
1075 | cfqq->cfqg = cfqg; | 1174 | cfqq->cfqg = cfqg; |
1076 | /* cfqq reference on cfqg */ | 1175 | /* cfqq reference on cfqg */ |
1077 | atomic_inc(&cfqq->cfqg->ref); | 1176 | cfqq->cfqg->ref++; |
1078 | } | 1177 | } |
1079 | 1178 | ||
1080 | static void cfq_put_cfqg(struct cfq_group *cfqg) | 1179 | static void cfq_put_cfqg(struct cfq_group *cfqg) |
@@ -1082,11 +1181,13 @@ static void cfq_put_cfqg(struct cfq_group *cfqg) | |||
1082 | struct cfq_rb_root *st; | 1181 | struct cfq_rb_root *st; |
1083 | int i, j; | 1182 | int i, j; |
1084 | 1183 | ||
1085 | BUG_ON(atomic_read(&cfqg->ref) <= 0); | 1184 | BUG_ON(cfqg->ref <= 0); |
1086 | if (!atomic_dec_and_test(&cfqg->ref)) | 1185 | cfqg->ref--; |
1186 | if (cfqg->ref) | ||
1087 | return; | 1187 | return; |
1088 | for_each_cfqg_st(cfqg, i, j, st) | 1188 | for_each_cfqg_st(cfqg, i, j, st) |
1089 | BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL); | 1189 | BUG_ON(!RB_EMPTY_ROOT(&st->rb)); |
1190 | free_percpu(cfqg->blkg.stats_cpu); | ||
1090 | kfree(cfqg); | 1191 | kfree(cfqg); |
1091 | } | 1192 | } |
1092 | 1193 | ||
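Several hunks in this diff turn cfqq->ref and cfqg->ref from atomic_t into plain integers, which is only safe because the get/put paths run with the queue lock held, making the atomics redundant. A hedged userspace sketch of a lock-protected reference count (a caller-held mutex plays the role of queue_lock; this is not the kernel code itself):

```c
#include <assert.h>
#include <stdlib.h>

struct refobj {
	int ref;		/* plain int: every change happens under the owner's lock */
	/* ... payload ... */
};

/* Both helpers must be called with the protecting lock held. */
static void refobj_get(struct refobj *o)
{
	o->ref++;
}

static void refobj_put(struct refobj *o)
{
	assert(o->ref > 0);	/* mirrors BUG_ON(cfqg->ref <= 0) in the diff */
	if (--o->ref)
		return;
	free(o);		/* last reference: release the object */
}
```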
@@ -1145,7 +1246,7 @@ void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg) | |||
1145 | } | 1246 | } |
1146 | 1247 | ||
1147 | #else /* GROUP_IOSCHED */ | 1248 | #else /* GROUP_IOSCHED */ |
1148 | static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) | 1249 | static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd) |
1149 | { | 1250 | { |
1150 | return &cfqd->root_group; | 1251 | return &cfqd->root_group; |
1151 | } | 1252 | } |
@@ -1179,33 +1280,6 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
1179 | struct cfq_rb_root *service_tree; | 1280 | struct cfq_rb_root *service_tree; |
1180 | int left; | 1281 | int left; |
1181 | int new_cfqq = 1; | 1282 | int new_cfqq = 1; |
1182 | int group_changed = 0; | ||
1183 | |||
1184 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
1185 | if (!cfqd->cfq_group_isolation | ||
1186 | && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD | ||
1187 | && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) { | ||
1188 | /* Move this cfq to root group */ | ||
1189 | cfq_log_cfqq(cfqd, cfqq, "moving to root group"); | ||
1190 | if (!RB_EMPTY_NODE(&cfqq->rb_node)) | ||
1191 | cfq_group_service_tree_del(cfqd, cfqq->cfqg); | ||
1192 | cfqq->orig_cfqg = cfqq->cfqg; | ||
1193 | cfqq->cfqg = &cfqd->root_group; | ||
1194 | atomic_inc(&cfqd->root_group.ref); | ||
1195 | group_changed = 1; | ||
1196 | } else if (!cfqd->cfq_group_isolation | ||
1197 | && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) { | ||
1198 | /* cfqq is sequential now needs to go to its original group */ | ||
1199 | BUG_ON(cfqq->cfqg != &cfqd->root_group); | ||
1200 | if (!RB_EMPTY_NODE(&cfqq->rb_node)) | ||
1201 | cfq_group_service_tree_del(cfqd, cfqq->cfqg); | ||
1202 | cfq_put_cfqg(cfqq->cfqg); | ||
1203 | cfqq->cfqg = cfqq->orig_cfqg; | ||
1204 | cfqq->orig_cfqg = NULL; | ||
1205 | group_changed = 1; | ||
1206 | cfq_log_cfqq(cfqd, cfqq, "moved to origin group"); | ||
1207 | } | ||
1208 | #endif | ||
1209 | 1283 | ||
1210 | service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), | 1284 | service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), |
1211 | cfqq_type(cfqq)); | 1285 | cfqq_type(cfqq)); |
@@ -1276,9 +1350,9 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
1276 | rb_link_node(&cfqq->rb_node, parent, p); | 1350 | rb_link_node(&cfqq->rb_node, parent, p); |
1277 | rb_insert_color(&cfqq->rb_node, &service_tree->rb); | 1351 | rb_insert_color(&cfqq->rb_node, &service_tree->rb); |
1278 | service_tree->count++; | 1352 | service_tree->count++; |
1279 | if ((add_front || !new_cfqq) && !group_changed) | 1353 | if (add_front || !new_cfqq) |
1280 | return; | 1354 | return; |
1281 | cfq_group_service_tree_add(cfqd, cfqq->cfqg); | 1355 | cfq_group_notify_queue_add(cfqd, cfqq->cfqg); |
1282 | } | 1356 | } |
1283 | 1357 | ||
1284 | static struct cfq_queue * | 1358 | static struct cfq_queue * |
@@ -1366,6 +1440,8 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) | |||
1366 | BUG_ON(cfq_cfqq_on_rr(cfqq)); | 1440 | BUG_ON(cfq_cfqq_on_rr(cfqq)); |
1367 | cfq_mark_cfqq_on_rr(cfqq); | 1441 | cfq_mark_cfqq_on_rr(cfqq); |
1368 | cfqd->busy_queues++; | 1442 | cfqd->busy_queues++; |
1443 | if (cfq_cfqq_sync(cfqq)) | ||
1444 | cfqd->busy_sync_queues++; | ||
1369 | 1445 | ||
1370 | cfq_resort_rr_list(cfqd, cfqq); | 1446 | cfq_resort_rr_list(cfqd, cfqq); |
1371 | } | 1447 | } |
@@ -1389,9 +1465,11 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) | |||
1389 | cfqq->p_root = NULL; | 1465 | cfqq->p_root = NULL; |
1390 | } | 1466 | } |
1391 | 1467 | ||
1392 | cfq_group_service_tree_del(cfqd, cfqq->cfqg); | 1468 | cfq_group_notify_queue_del(cfqd, cfqq->cfqg); |
1393 | BUG_ON(!cfqd->busy_queues); | 1469 | BUG_ON(!cfqd->busy_queues); |
1394 | cfqd->busy_queues--; | 1470 | cfqd->busy_queues--; |
1471 | if (cfq_cfqq_sync(cfqq)) | ||
1472 | cfqd->busy_sync_queues--; | ||
1395 | } | 1473 | } |
1396 | 1474 | ||
1397 | /* | 1475 | /* |
@@ -1663,8 +1741,11 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
1663 | /* | 1741 | /* |
1664 | * store what was left of this slice, if the queue idled/timed out | 1742 | * store what was left of this slice, if the queue idled/timed out |
1665 | */ | 1743 | */ |
1666 | if (timed_out && !cfq_cfqq_slice_new(cfqq)) { | 1744 | if (timed_out) { |
1667 | cfqq->slice_resid = cfqq->slice_end - jiffies; | 1745 | if (cfq_cfqq_slice_new(cfqq)) |
1746 | cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq); | ||
1747 | else | ||
1748 | cfqq->slice_resid = cfqq->slice_end - jiffies; | ||
1668 | cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); | 1749 | cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); |
1669 | } | 1750 | } |
1670 | 1751 | ||
@@ -1678,9 +1759,6 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
1678 | if (cfqq == cfqd->active_queue) | 1759 | if (cfqq == cfqd->active_queue) |
1679 | cfqd->active_queue = NULL; | 1760 | cfqd->active_queue = NULL; |
1680 | 1761 | ||
1681 | if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active) | ||
1682 | cfqd->grp_service_tree.active = NULL; | ||
1683 | |||
1684 | if (cfqd->active_cic) { | 1762 | if (cfqd->active_cic) { |
1685 | put_io_context(cfqd->active_cic->ioc); | 1763 | put_io_context(cfqd->active_cic->ioc); |
1686 | cfqd->active_cic = NULL; | 1764 | cfqd->active_cic = NULL; |
@@ -1892,10 +1970,10 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) | |||
1892 | * in their service tree. | 1970 | * in their service tree. |
1893 | */ | 1971 | */ |
1894 | if (service_tree->count == 1 && cfq_cfqq_sync(cfqq)) | 1972 | if (service_tree->count == 1 && cfq_cfqq_sync(cfqq)) |
1895 | return 1; | 1973 | return true; |
1896 | cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", | 1974 | cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", |
1897 | service_tree->count); | 1975 | service_tree->count); |
1898 | return 0; | 1976 | return false; |
1899 | } | 1977 | } |
1900 | 1978 | ||
1901 | static void cfq_arm_slice_timer(struct cfq_data *cfqd) | 1979 | static void cfq_arm_slice_timer(struct cfq_data *cfqd) |
@@ -1946,8 +2024,8 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) | |||
1946 | */ | 2024 | */ |
1947 | if (sample_valid(cic->ttime_samples) && | 2025 | if (sample_valid(cic->ttime_samples) && |
1948 | (cfqq->slice_end - jiffies < cic->ttime_mean)) { | 2026 | (cfqq->slice_end - jiffies < cic->ttime_mean)) { |
1949 | cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%d", | 2027 | cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%lu", |
1950 | cic->ttime_mean); | 2028 | cic->ttime_mean); |
1951 | return; | 2029 | return; |
1952 | } | 2030 | } |
1953 | 2031 | ||
@@ -2020,7 +2098,7 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) | |||
2020 | 2098 | ||
2021 | WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); | 2099 | WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); |
2022 | 2100 | ||
2023 | return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio)); | 2101 | return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio); |
2024 | } | 2102 | } |
2025 | 2103 | ||
2026 | /* | 2104 | /* |
@@ -2031,7 +2109,7 @@ static int cfqq_process_refs(struct cfq_queue *cfqq) | |||
2031 | int process_refs, io_refs; | 2109 | int process_refs, io_refs; |
2032 | 2110 | ||
2033 | io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE]; | 2111 | io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE]; |
2034 | process_refs = atomic_read(&cfqq->ref) - io_refs; | 2112 | process_refs = cfqq->ref - io_refs; |
2035 | BUG_ON(process_refs < 0); | 2113 | BUG_ON(process_refs < 0); |
2036 | return process_refs; | 2114 | return process_refs; |
2037 | } | 2115 | } |
@@ -2071,10 +2149,10 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) | |||
2071 | */ | 2149 | */ |
2072 | if (new_process_refs >= process_refs) { | 2150 | if (new_process_refs >= process_refs) { |
2073 | cfqq->new_cfqq = new_cfqq; | 2151 | cfqq->new_cfqq = new_cfqq; |
2074 | atomic_add(process_refs, &new_cfqq->ref); | 2152 | new_cfqq->ref += process_refs; |
2075 | } else { | 2153 | } else { |
2076 | new_cfqq->new_cfqq = cfqq; | 2154 | new_cfqq->new_cfqq = cfqq; |
2077 | atomic_add(new_process_refs, &cfqq->ref); | 2155 | cfqq->ref += new_process_refs; |
2078 | } | 2156 | } |
2079 | } | 2157 | } |
2080 | 2158 | ||
@@ -2107,12 +2185,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) | |||
2107 | unsigned count; | 2185 | unsigned count; |
2108 | struct cfq_rb_root *st; | 2186 | struct cfq_rb_root *st; |
2109 | unsigned group_slice; | 2187 | unsigned group_slice; |
2110 | 2188 | enum wl_prio_t original_prio = cfqd->serving_prio; | |
2111 | if (!cfqg) { | ||
2112 | cfqd->serving_prio = IDLE_WORKLOAD; | ||
2113 | cfqd->workload_expires = jiffies + 1; | ||
2114 | return; | ||
2115 | } | ||
2116 | 2189 | ||
2117 | /* Choose next priority. RT > BE > IDLE */ | 2190 | /* Choose next priority. RT > BE > IDLE */ |
2118 | if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg)) | 2191 | if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg)) |
@@ -2125,6 +2198,9 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) | |||
2125 | return; | 2198 | return; |
2126 | } | 2199 | } |
2127 | 2200 | ||
2201 | if (original_prio != cfqd->serving_prio) | ||
2202 | goto new_workload; | ||
2203 | |||
2128 | /* | 2204 | /* |
2129 | * For RT and BE, we have to choose also the type | 2205 | * For RT and BE, we have to choose also the type |
2130 | * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload | 2206 | * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload |
@@ -2139,6 +2215,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) | |||
2139 | if (count && !time_after(jiffies, cfqd->workload_expires)) | 2215 | if (count && !time_after(jiffies, cfqd->workload_expires)) |
2140 | return; | 2216 | return; |
2141 | 2217 | ||
2218 | new_workload: | ||
2142 | /* otherwise select new workload type */ | 2219 | /* otherwise select new workload type */ |
2143 | cfqd->serving_type = | 2220 | cfqd->serving_type = |
2144 | cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio); | 2221 | cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio); |
@@ -2180,7 +2257,6 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) | |||
2180 | slice = max_t(unsigned, slice, CFQ_MIN_TT); | 2257 | slice = max_t(unsigned, slice, CFQ_MIN_TT); |
2181 | cfq_log(cfqd, "workload slice:%d", slice); | 2258 | cfq_log(cfqd, "workload slice:%d", slice); |
2182 | cfqd->workload_expires = jiffies + slice; | 2259 | cfqd->workload_expires = jiffies + slice; |
2183 | cfqd->noidle_tree_requires_idle = false; | ||
2184 | } | 2260 | } |
2185 | 2261 | ||
2186 | static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) | 2262 | static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) |
@@ -2191,7 +2267,6 @@ static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) | |||
2191 | if (RB_EMPTY_ROOT(&st->rb)) | 2267 | if (RB_EMPTY_ROOT(&st->rb)) |
2192 | return NULL; | 2268 | return NULL; |
2193 | cfqg = cfq_rb_first_group(st); | 2269 | cfqg = cfq_rb_first_group(st); |
2194 | st->active = &cfqg->rb_node; | ||
2195 | update_min_vdisktime(st); | 2270 | update_min_vdisktime(st); |
2196 | return cfqg; | 2271 | return cfqg; |
2197 | } | 2272 | } |
@@ -2285,6 +2360,17 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) | |||
2285 | goto keep_queue; | 2360 | goto keep_queue; |
2286 | } | 2361 | } |
2287 | 2362 | ||
2363 | /* | ||
2364 | * This is a deep seek queue, but the device is much faster than | ||
2365 | * the queue can deliver, don't idle | ||
2366 | **/ | ||
2367 | if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) && | ||
2368 | (cfq_cfqq_slice_new(cfqq) || | ||
2369 | (cfqq->slice_end - jiffies > jiffies - cfqq->slice_start))) { | ||
2370 | cfq_clear_cfqq_deep(cfqq); | ||
2371 | cfq_clear_cfqq_idle_window(cfqq); | ||
2372 | } | ||
2373 | |||
2288 | if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { | 2374 | if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { |
2289 | cfqq = NULL; | 2375 | cfqq = NULL; |
2290 | goto keep_queue; | 2376 | goto keep_queue; |
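The block added above stops idling on a "deep" but seeky queue when the device is clearly keeping up: either the slice has just started, or the remaining slice still exceeds the time already spent in it, i.e. less than half the slice has been consumed. Restated as a small pure function, with illustrative names and jiffies-style tick arithmetic:

```c
#include <stdbool.h>

/*
 * Should a seeky queue that currently has idling enabled give it up?
 * Mirrors the condition added to cfq_select_queue(): yes if the slice
 * is brand new, or if the remaining slice still exceeds the elapsed
 * time, meaning the device has been draining requests quickly.
 */
static bool device_outpaces_queue(unsigned long now,
				  unsigned long slice_start,
				  unsigned long slice_end,
				  bool slice_is_new)
{
	unsigned long remaining = slice_end - now;
	unsigned long elapsed = now - slice_start;

	return slice_is_new || remaining > elapsed;
}
```

In the diff, a true result clears both the "deep" mark and the idle window on the queue.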
@@ -2359,12 +2445,12 @@ static inline bool cfq_slice_used_soon(struct cfq_data *cfqd, | |||
2359 | { | 2445 | { |
2360 | /* the queue hasn't finished any request, can't estimate */ | 2446 | /* the queue hasn't finished any request, can't estimate */ |
2361 | if (cfq_cfqq_slice_new(cfqq)) | 2447 | if (cfq_cfqq_slice_new(cfqq)) |
2362 | return 1; | 2448 | return true; |
2363 | if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched, | 2449 | if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched, |
2364 | cfqq->slice_end)) | 2450 | cfqq->slice_end)) |
2365 | return 1; | 2451 | return true; |
2366 | 2452 | ||
2367 | return 0; | 2453 | return false; |
2368 | } | 2454 | } |
2369 | 2455 | ||
2370 | static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) | 2456 | static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) |
@@ -2391,6 +2477,7 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) | |||
2391 | * Does this cfqq already have too much IO in flight? | 2477 | * Does this cfqq already have too much IO in flight? |
2392 | */ | 2478 | */ |
2393 | if (cfqq->dispatched >= max_dispatch) { | 2479 | if (cfqq->dispatched >= max_dispatch) { |
2480 | bool promote_sync = false; | ||
2394 | /* | 2481 | /* |
2395 | * idle queue must always only have a single IO in flight | 2482 | * idle queue must always only have a single IO in flight |
2396 | */ | 2483 | */ |
@@ -2398,15 +2485,26 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) | |||
2398 | return false; | 2485 | return false; |
2399 | 2486 | ||
2400 | /* | 2487 | /* |
2488 | * If there is only one sync queue | ||
2489 | * we can ignore async queue here and give the sync | ||
2490 | * queue no dispatch limit. The reason is a sync queue can | ||
2491 | * preempt async queue, limiting the sync queue doesn't make | ||
2492 | * sense. This is useful for aiostress test. | ||
2493 | */ | ||
2494 | if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1) | ||
2495 | promote_sync = true; | ||
2496 | |||
2497 | /* | ||
2401 | * We have other queues, don't allow more IO from this one | 2498 | * We have other queues, don't allow more IO from this one |
2402 | */ | 2499 | */ |
2403 | if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq)) | 2500 | if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) && |
2501 | !promote_sync) | ||
2404 | return false; | 2502 | return false; |
2405 | 2503 | ||
2406 | /* | 2504 | /* |
2407 | * Sole queue user, no limit | 2505 | * Sole queue user, no limit |
2408 | */ | 2506 | */ |
2409 | if (cfqd->busy_queues == 1) | 2507 | if (cfqd->busy_queues == 1 || promote_sync) |
2410 | max_dispatch = -1; | 2508 | max_dispatch = -1; |
2411 | else | 2509 | else |
2412 | /* | 2510 | /* |
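The promote_sync logic above relaxes the dispatch cap when the only busy sync queue would otherwise be throttled alongside async queues that it can preempt anyway, as the new comment explains. The decision can be summarized as one function; the sketch below is a simplified restatement (the -1 "unlimited" sentinel mirrors max_dispatch = -1 in the diff, the idle-class single-request rule is left out, and the other names are made up):

```c
#include <stdbool.h>

#define DISPATCH_UNLIMITED (-1)

/*
 * Compute the dispatch budget for a queue that has reached its normal
 * limit.  A lone queue, or the only busy *sync* queue, gets an
 * unlimited budget because throttling it cannot help anyone else.
 */
static int dispatch_budget(bool is_sync, int busy_queues,
			   int busy_sync_queues, int base_quantum,
			   bool slice_used_soon)
{
	bool promote_sync = is_sync && busy_sync_queues == 1;

	/* Other queues exist and this one is nearly out of slice: stop. */
	if (busy_queues > 1 && slice_used_soon && !promote_sync)
		return 0;

	if (busy_queues == 1 || promote_sync)
		return DISPATCH_UNLIMITED;

	/* Otherwise keep the normal per-queue quantum. */
	return base_quantum;
}
```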
@@ -2528,18 +2626,18 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) | |||
2528 | static void cfq_put_queue(struct cfq_queue *cfqq) | 2626 | static void cfq_put_queue(struct cfq_queue *cfqq) |
2529 | { | 2627 | { |
2530 | struct cfq_data *cfqd = cfqq->cfqd; | 2628 | struct cfq_data *cfqd = cfqq->cfqd; |
2531 | struct cfq_group *cfqg, *orig_cfqg; | 2629 | struct cfq_group *cfqg; |
2532 | 2630 | ||
2533 | BUG_ON(atomic_read(&cfqq->ref) <= 0); | 2631 | BUG_ON(cfqq->ref <= 0); |
2534 | 2632 | ||
2535 | if (!atomic_dec_and_test(&cfqq->ref)) | 2633 | cfqq->ref--; |
2634 | if (cfqq->ref) | ||
2536 | return; | 2635 | return; |
2537 | 2636 | ||
2538 | cfq_log_cfqq(cfqd, cfqq, "put_queue"); | 2637 | cfq_log_cfqq(cfqd, cfqq, "put_queue"); |
2539 | BUG_ON(rb_first(&cfqq->sort_list)); | 2638 | BUG_ON(rb_first(&cfqq->sort_list)); |
2540 | BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); | 2639 | BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); |
2541 | cfqg = cfqq->cfqg; | 2640 | cfqg = cfqq->cfqg; |
2542 | orig_cfqg = cfqq->orig_cfqg; | ||
2543 | 2641 | ||
2544 | if (unlikely(cfqd->active_queue == cfqq)) { | 2642 | if (unlikely(cfqd->active_queue == cfqq)) { |
2545 | __cfq_slice_expired(cfqd, cfqq, 0); | 2643 | __cfq_slice_expired(cfqd, cfqq, 0); |
@@ -2549,33 +2647,23 @@ static void cfq_put_queue(struct cfq_queue *cfqq) | |||
2549 | BUG_ON(cfq_cfqq_on_rr(cfqq)); | 2647 | BUG_ON(cfq_cfqq_on_rr(cfqq)); |
2550 | kmem_cache_free(cfq_pool, cfqq); | 2648 | kmem_cache_free(cfq_pool, cfqq); |
2551 | cfq_put_cfqg(cfqg); | 2649 | cfq_put_cfqg(cfqg); |
2552 | if (orig_cfqg) | ||
2553 | cfq_put_cfqg(orig_cfqg); | ||
2554 | } | 2650 | } |
2555 | 2651 | ||
2556 | /* | 2652 | /* |
2557 | * Must always be called with the rcu_read_lock() held | 2653 | * Call func for each cic attached to this ioc. |
2558 | */ | 2654 | */ |
2559 | static void | 2655 | static void |
2560 | __call_for_each_cic(struct io_context *ioc, | 2656 | call_for_each_cic(struct io_context *ioc, |
2561 | void (*func)(struct io_context *, struct cfq_io_context *)) | 2657 | void (*func)(struct io_context *, struct cfq_io_context *)) |
2562 | { | 2658 | { |
2563 | struct cfq_io_context *cic; | 2659 | struct cfq_io_context *cic; |
2564 | struct hlist_node *n; | 2660 | struct hlist_node *n; |
2565 | 2661 | ||
2662 | rcu_read_lock(); | ||
2663 | |||
2566 | hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list) | 2664 | hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list) |
2567 | func(ioc, cic); | 2665 | func(ioc, cic); |
2568 | } | ||
2569 | 2666 | ||
2570 | /* | ||
2571 | * Call func for each cic attached to this ioc. | ||
2572 | */ | ||
2573 | static void | ||
2574 | call_for_each_cic(struct io_context *ioc, | ||
2575 | void (*func)(struct io_context *, struct cfq_io_context *)) | ||
2576 | { | ||
2577 | rcu_read_lock(); | ||
2578 | __call_for_each_cic(ioc, func); | ||
2579 | rcu_read_unlock(); | 2667 | rcu_read_unlock(); |
2580 | } | 2668 | } |
2581 | 2669 | ||
@@ -2636,7 +2724,7 @@ static void cfq_free_io_context(struct io_context *ioc) | |||
2636 | * should be ok to iterate over the known list, we will see all cic's | 2724 | * should be ok to iterate over the known list, we will see all cic's |
2637 | * since no new ones are added. | 2725 | * since no new ones are added. |
2638 | */ | 2726 | */ |
2639 | __call_for_each_cic(ioc, cic_free_func); | 2727 | call_for_each_cic(ioc, cic_free_func); |
2640 | } | 2728 | } |
2641 | 2729 | ||
2642 | static void cfq_put_cooperator(struct cfq_queue *cfqq) | 2730 | static void cfq_put_cooperator(struct cfq_queue *cfqq) |
@@ -2685,8 +2773,14 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd, | |||
2685 | smp_wmb(); | 2773 | smp_wmb(); |
2686 | cic->key = cfqd_dead_key(cfqd); | 2774 | cic->key = cfqd_dead_key(cfqd); |
2687 | 2775 | ||
2688 | if (ioc->ioc_data == cic) | 2776 | rcu_read_lock(); |
2777 | if (rcu_dereference(ioc->ioc_data) == cic) { | ||
2778 | rcu_read_unlock(); | ||
2779 | spin_lock(&ioc->lock); | ||
2689 | rcu_assign_pointer(ioc->ioc_data, NULL); | 2780 | rcu_assign_pointer(ioc->ioc_data, NULL); |
2781 | spin_unlock(&ioc->lock); | ||
2782 | } else | ||
2783 | rcu_read_unlock(); | ||
2690 | 2784 | ||
2691 | if (cic->cfqq[BLK_RW_ASYNC]) { | 2785 | if (cic->cfqq[BLK_RW_ASYNC]) { |
2692 | cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); | 2786 | cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); |
@@ -2835,7 +2929,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
2835 | RB_CLEAR_NODE(&cfqq->p_node); | 2929 | RB_CLEAR_NODE(&cfqq->p_node); |
2836 | INIT_LIST_HEAD(&cfqq->fifo); | 2930 | INIT_LIST_HEAD(&cfqq->fifo); |
2837 | 2931 | ||
2838 | atomic_set(&cfqq->ref, 0); | 2932 | cfqq->ref = 0; |
2839 | cfqq->cfqd = cfqd; | 2933 | cfqq->cfqd = cfqd; |
2840 | 2934 | ||
2841 | cfq_mark_cfqq_prio_changed(cfqq); | 2935 | cfq_mark_cfqq_prio_changed(cfqq); |
@@ -2892,7 +2986,7 @@ cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, | |||
2892 | struct cfq_group *cfqg; | 2986 | struct cfq_group *cfqg; |
2893 | 2987 | ||
2894 | retry: | 2988 | retry: |
2895 | cfqg = cfq_get_cfqg(cfqd, 1); | 2989 | cfqg = cfq_get_cfqg(cfqd); |
2896 | cic = cfq_cic_lookup(cfqd, ioc); | 2990 | cic = cfq_cic_lookup(cfqd, ioc); |
2897 | /* cic always exists here */ | 2991 | /* cic always exists here */ |
2898 | cfqq = cic_to_cfqq(cic, is_sync); | 2992 | cfqq = cic_to_cfqq(cic, is_sync); |
@@ -2971,11 +3065,11 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc, | |||
2971 | * pin the queue now that it's allocated, scheduler exit will prune it | 3065 | * pin the queue now that it's allocated, scheduler exit will prune it |
2972 | */ | 3066 | */ |
2973 | if (!is_sync && !(*async_cfqq)) { | 3067 | if (!is_sync && !(*async_cfqq)) { |
2974 | atomic_inc(&cfqq->ref); | 3068 | cfqq->ref++; |
2975 | *async_cfqq = cfqq; | 3069 | *async_cfqq = cfqq; |
2976 | } | 3070 | } |
2977 | 3071 | ||
2978 | atomic_inc(&cfqq->ref); | 3072 | cfqq->ref++; |
2979 | return cfqq; | 3073 | return cfqq; |
2980 | } | 3074 | } |
2981 | 3075 | ||
@@ -2993,7 +3087,8 @@ cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc, | |||
2993 | 3087 | ||
2994 | spin_lock_irqsave(&ioc->lock, flags); | 3088 | spin_lock_irqsave(&ioc->lock, flags); |
2995 | 3089 | ||
2996 | BUG_ON(ioc->ioc_data == cic); | 3090 | BUG_ON(rcu_dereference_check(ioc->ioc_data, |
3091 | lockdep_is_held(&ioc->lock)) == cic); | ||
2997 | 3092 | ||
2998 | radix_tree_delete(&ioc->radix_root, cfqd->cic_index); | 3093 | radix_tree_delete(&ioc->radix_root, cfqd->cic_index); |
2999 | hlist_del_rcu(&cic->cic_list); | 3094 | hlist_del_rcu(&cic->cic_list); |
@@ -3177,7 +3272,9 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
3177 | if (cfqq->queued[0] + cfqq->queued[1] >= 4) | 3272 | if (cfqq->queued[0] + cfqq->queued[1] >= 4) |
3178 | cfq_mark_cfqq_deep(cfqq); | 3273 | cfq_mark_cfqq_deep(cfqq); |
3179 | 3274 | ||
3180 | if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || | 3275 | if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE)) |
3276 | enable_idle = 0; | ||
3277 | else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || | ||
3181 | (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) | 3278 | (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) |
3182 | enable_idle = 0; | 3279 | enable_idle = 0; |
3183 | else if (sample_valid(cic->ttime_samples)) { | 3280 | else if (sample_valid(cic->ttime_samples)) { |
@@ -3255,6 +3352,10 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, | |||
3255 | if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) | 3352 | if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) |
3256 | return true; | 3353 | return true; |
3257 | 3354 | ||
3355 | /* An idle queue should not be idle now for some reason */ | ||
3356 | if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq)) | ||
3357 | return true; | ||
3358 | |||
3258 | if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq)) | 3359 | if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq)) |
3259 | return false; | 3360 | return false; |
3260 | 3361 | ||
@@ -3274,10 +3375,19 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, | |||
3274 | */ | 3375 | */ |
3275 | static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) | 3376 | static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) |
3276 | { | 3377 | { |
3378 | struct cfq_queue *old_cfqq = cfqd->active_queue; | ||
3379 | |||
3277 | cfq_log_cfqq(cfqd, cfqq, "preempt"); | 3380 | cfq_log_cfqq(cfqd, cfqq, "preempt"); |
3278 | cfq_slice_expired(cfqd, 1); | 3381 | cfq_slice_expired(cfqd, 1); |
3279 | 3382 | ||
3280 | /* | 3383 | /* |
3384 | * workload type is changed, don't save slice, otherwise preempt | ||
3385 | * doesn't happen | ||
3386 | */ | ||
3387 | if (cfqq_type(old_cfqq) != cfqq_type(cfqq)) | ||
3388 | cfqq->cfqg->saved_workload_slice = 0; | ||
3389 | |||
3390 | /* | ||
3281 | * Put the new queue at the front of the of the current list, | 3391 | * Put the new queue at the front of the of the current list, |
3282 | * so we know that it will be selected next. | 3392 | * so we know that it will be selected next. |
3283 | */ | 3393 | */ |
@@ -3402,6 +3512,10 @@ static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq) | |||
3402 | { | 3512 | { |
3403 | struct cfq_io_context *cic = cfqd->active_cic; | 3513 | struct cfq_io_context *cic = cfqd->active_cic; |
3404 | 3514 | ||
3515 | /* If the queue already has requests, don't wait */ | ||
3516 | if (!RB_EMPTY_ROOT(&cfqq->sort_list)) | ||
3517 | return false; | ||
3518 | |||
3405 | /* If there are other queues in the group, don't wait */ | 3519 | /* If there are other queues in the group, don't wait */ |
3406 | if (cfqq->cfqg->nr_cfqq > 1) | 3520 | if (cfqq->cfqg->nr_cfqq > 1) |
3407 | return false; | 3521 | return false; |
@@ -3494,17 +3608,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) | |||
3494 | cfq_slice_expired(cfqd, 1); | 3608 | cfq_slice_expired(cfqd, 1); |
3495 | else if (sync && cfqq_empty && | 3609 | else if (sync && cfqq_empty && |
3496 | !cfq_close_cooperator(cfqd, cfqq)) { | 3610 | !cfq_close_cooperator(cfqd, cfqq)) { |
3497 | cfqd->noidle_tree_requires_idle |= | 3611 | cfq_arm_slice_timer(cfqd); |
3498 | !(rq->cmd_flags & REQ_NOIDLE); | ||
3499 | /* | ||
3500 | * Idling is enabled for SYNC_WORKLOAD. | ||
3501 | * SYNC_NOIDLE_WORKLOAD idles at the end of the tree | ||
3502 | * only if we processed at least one !REQ_NOIDLE request | ||
3503 | */ | ||
3504 | if (cfqd->serving_type == SYNC_WORKLOAD | ||
3505 | || cfqd->noidle_tree_requires_idle | ||
3506 | || cfqq->cfqg->nr_cfqq == 1) | ||
3507 | cfq_arm_slice_timer(cfqd); | ||
3508 | } | 3612 | } |
3509 | } | 3613 | } |
3510 | 3614 | ||
@@ -3589,12 +3693,12 @@ static void cfq_put_request(struct request *rq) | |||
3589 | 3693 | ||
3590 | put_io_context(RQ_CIC(rq)->ioc); | 3694 | put_io_context(RQ_CIC(rq)->ioc); |
3591 | 3695 | ||
3592 | rq->elevator_private = NULL; | 3696 | rq->elevator_private[0] = NULL; |
3593 | rq->elevator_private2 = NULL; | 3697 | rq->elevator_private[1] = NULL; |
3594 | 3698 | ||
3595 | /* Put down rq reference on cfqg */ | 3699 | /* Put down rq reference on cfqg */ |
3596 | cfq_put_cfqg(RQ_CFQG(rq)); | 3700 | cfq_put_cfqg(RQ_CFQG(rq)); |
3597 | rq->elevator_private3 = NULL; | 3701 | rq->elevator_private[2] = NULL; |
3598 | 3702 | ||
3599 | cfq_put_queue(cfqq); | 3703 | cfq_put_queue(cfqq); |
3600 | } | 3704 | } |
@@ -3681,19 +3785,15 @@ new_queue: | |||
3681 | } | 3785 | } |
3682 | 3786 | ||
3683 | cfqq->allocated[rw]++; | 3787 | cfqq->allocated[rw]++; |
3684 | atomic_inc(&cfqq->ref); | ||
3685 | 3788 | ||
3789 | cfqq->ref++; | ||
3790 | rq->elevator_private[0] = cic; | ||
3791 | rq->elevator_private[1] = cfqq; | ||
3792 | rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg); | ||
3686 | spin_unlock_irqrestore(q->queue_lock, flags); | 3793 | spin_unlock_irqrestore(q->queue_lock, flags); |
3687 | |||
3688 | rq->elevator_private = cic; | ||
3689 | rq->elevator_private2 = cfqq; | ||
3690 | rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg); | ||
3691 | return 0; | 3794 | return 0; |
3692 | 3795 | ||
3693 | queue_fail: | 3796 | queue_fail: |
3694 | if (cic) | ||
3695 | put_io_context(cic->ioc); | ||
3696 | |||
3697 | cfq_schedule_dispatch(cfqd); | 3797 | cfq_schedule_dispatch(cfqd); |
3698 | spin_unlock_irqrestore(q->queue_lock, flags); | 3798 | spin_unlock_irqrestore(q->queue_lock, flags); |
3699 | cfq_log(cfqd, "set_request fail"); | 3799 | cfq_log(cfqd, "set_request fail"); |
@@ -3788,15 +3888,11 @@ static void cfq_put_async_queues(struct cfq_data *cfqd) | |||
3788 | cfq_put_queue(cfqd->async_idle_cfqq); | 3888 | cfq_put_queue(cfqd->async_idle_cfqq); |
3789 | } | 3889 | } |
3790 | 3890 | ||
3791 | static void cfq_cfqd_free(struct rcu_head *head) | ||
3792 | { | ||
3793 | kfree(container_of(head, struct cfq_data, rcu)); | ||
3794 | } | ||
3795 | |||
3796 | static void cfq_exit_queue(struct elevator_queue *e) | 3891 | static void cfq_exit_queue(struct elevator_queue *e) |
3797 | { | 3892 | { |
3798 | struct cfq_data *cfqd = e->elevator_data; | 3893 | struct cfq_data *cfqd = e->elevator_data; |
3799 | struct request_queue *q = cfqd->queue; | 3894 | struct request_queue *q = cfqd->queue; |
3895 | bool wait = false; | ||
3800 | 3896 | ||
3801 | cfq_shutdown_timer_wq(cfqd); | 3897 | cfq_shutdown_timer_wq(cfqd); |
3802 | 3898 | ||
@@ -3815,7 +3911,13 @@ static void cfq_exit_queue(struct elevator_queue *e) | |||
3815 | 3911 | ||
3816 | cfq_put_async_queues(cfqd); | 3912 | cfq_put_async_queues(cfqd); |
3817 | cfq_release_cfq_groups(cfqd); | 3913 | cfq_release_cfq_groups(cfqd); |
3818 | cfq_blkiocg_del_blkio_group(&cfqd->root_group.blkg); | 3914 | |
3915 | /* | ||
3916 | * If there are groups which we could not unlink from blkcg list, | ||
3917 | * wait for a rcu period for them to be freed. | ||
3918 | */ | ||
3919 | if (cfqd->nr_blkcg_linked_grps) | ||
3920 | wait = true; | ||
3819 | 3921 | ||
3820 | spin_unlock_irq(q->queue_lock); | 3922 | spin_unlock_irq(q->queue_lock); |
3821 | 3923 | ||
@@ -3825,8 +3927,25 @@ static void cfq_exit_queue(struct elevator_queue *e) | |||
3825 | ida_remove(&cic_index_ida, cfqd->cic_index); | 3927 | ida_remove(&cic_index_ida, cfqd->cic_index); |
3826 | spin_unlock(&cic_index_lock); | 3928 | spin_unlock(&cic_index_lock); |
3827 | 3929 | ||
3828 | /* Wait for cfqg->blkg->key accessors to exit their grace periods. */ | 3930 | /* |
3829 | call_rcu(&cfqd->rcu, cfq_cfqd_free); | 3931 | * Wait for cfqg->blkg->key accessors to exit their grace periods. |
3932 | * Do this wait only if there are other unlinked groups out | ||
3933 | * there. This can happen if cgroup deletion path claimed the | ||
3934 | * responsibility of cleaning up a group before queue cleanup code | ||
3935 | * get to the group. | ||
3936 | * | ||
3937 | * Do not call synchronize_rcu() unconditionally as there are drivers | ||
3938 | * which create/delete request queue hundreds of times during scan/boot | ||
3939 | * and synchronize_rcu() can take significant time and slow down boot. | ||
3940 | */ | ||
3941 | if (wait) | ||
3942 | synchronize_rcu(); | ||
3943 | |||
3944 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
3945 | /* Free up per cpu stats for root group */ | ||
3946 | free_percpu(cfqd->root_group.blkg.stats_cpu); | ||
3947 | #endif | ||
3948 | kfree(cfqd); | ||
3830 | } | 3949 | } |
3831 | 3950 | ||
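The long comment above explains why synchronize_rcu() is now conditional: a grace period is expensive, and drivers that create and destroy request queues many times during scan or boot should not pay for it when no group was left behind for RCU readers. The pattern is generic, defer the heavyweight synchronization until something was actually deferred. Below is a userspace analogy of that control flow only; the pthread rwlock is a stand-in, not the kernel RCU API:

```c
#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

/* Stand-in for the RCU read side: readers take this lock shared. */
static pthread_rwlock_t readers = PTHREAD_RWLOCK_INITIALIZER;

struct qdata {
	int nr_linked_grps;	/* like cfqd->nr_blkcg_linked_grps */
};

static void exit_queue(struct qdata *qd)
{
	/* Groups the cgroup-deletion path still owns force us to wait. */
	bool wait = qd->nr_linked_grps > 0;

	/* ... unlink groups, free per-queue resources ... */

	if (wait) {
		/*
		 * Expensive "grace period": block until every reader that
		 * might still see the old pointers has finished.  Skipping
		 * this in the common case is the whole point of the change.
		 */
		pthread_rwlock_wrlock(&readers);
		pthread_rwlock_unlock(&readers);
	}

	free(qd);
}
```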
3832 | static int cfq_alloc_cic_index(void) | 3951 | static int cfq_alloc_cic_index(void) |
@@ -3859,9 +3978,17 @@ static void *cfq_init_queue(struct request_queue *q) | |||
3859 | return NULL; | 3978 | return NULL; |
3860 | 3979 | ||
3861 | cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); | 3980 | cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); |
3862 | if (!cfqd) | 3981 | if (!cfqd) { |
3982 | spin_lock(&cic_index_lock); | ||
3983 | ida_remove(&cic_index_ida, i); | ||
3984 | spin_unlock(&cic_index_lock); | ||
3863 | return NULL; | 3985 | return NULL; |
3986 | } | ||
3864 | 3987 | ||
3988 | /* | ||
3989 | * No need to take queue_lock in this routine, since we are | ||
3990 | * initializing the ioscheduler, and nobody is using cfqd | ||
3991 | */ | ||
3865 | cfqd->cic_index = i; | 3992 | cfqd->cic_index = i; |
3866 | 3993 | ||
3867 | /* Init root service tree */ | 3994 | /* Init root service tree */ |
@@ -3878,14 +4005,29 @@ static void *cfq_init_queue(struct request_queue *q) | |||
3878 | 4005 | ||
3879 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | 4006 | #ifdef CONFIG_CFQ_GROUP_IOSCHED |
3880 | /* | 4007 | /* |
3881 | * Take a reference to root group which we never drop. This is just | 4008 | * Set root group reference to 2. One reference will be dropped when |
3882 | * to make sure that cfq_put_cfqg() does not try to kfree root group | 4009 | * all groups on cfqd->cfqg_list are being deleted during queue exit. |
4010 | * Other reference will remain there as we don't want to delete this | ||
4011 | * group as it is statically allocated and gets destroyed when | ||
4012 | * throtl_data goes away. | ||
3883 | */ | 4013 | */ |
3884 | atomic_set(&cfqg->ref, 1); | 4014 | cfqg->ref = 2; |
4015 | |||
4016 | if (blkio_alloc_blkg_stats(&cfqg->blkg)) { | ||
4017 | kfree(cfqg); | ||
4018 | kfree(cfqd); | ||
4019 | return NULL; | ||
4020 | } | ||
4021 | |||
3885 | rcu_read_lock(); | 4022 | rcu_read_lock(); |
4023 | |||
3886 | cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, | 4024 | cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, |
3887 | (void *)cfqd, 0); | 4025 | (void *)cfqd, 0); |
3888 | rcu_read_unlock(); | 4026 | rcu_read_unlock(); |
4027 | cfqd->nr_blkcg_linked_grps++; | ||
4028 | |||
4029 | /* Add group on cfqd->cfqg_list */ | ||
4030 | hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); | ||
3889 | #endif | 4031 | #endif |
3890 | /* | 4032 | /* |
3891 | * Not strictly needed (since RB_ROOT just clears the node and we | 4033 | * Not strictly needed (since RB_ROOT just clears the node and we |
@@ -3901,7 +4043,7 @@ static void *cfq_init_queue(struct request_queue *q) | |||
3901 | * will not attempt to free it. | 4043 | * will not attempt to free it. |
3902 | */ | 4044 | */ |
3903 | cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); | 4045 | cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); |
3904 | atomic_inc(&cfqd->oom_cfqq.ref); | 4046 | cfqd->oom_cfqq.ref++; |
3905 | cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group); | 4047 | cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group); |
3906 | 4048 | ||
3907 | INIT_LIST_HEAD(&cfqd->cic_list); | 4049 | INIT_LIST_HEAD(&cfqd->cic_list); |
@@ -3925,7 +4067,6 @@ static void *cfq_init_queue(struct request_queue *q) | |||
3925 | cfqd->cfq_slice_idle = cfq_slice_idle; | 4067 | cfqd->cfq_slice_idle = cfq_slice_idle; |
3926 | cfqd->cfq_group_idle = cfq_group_idle; | 4068 | cfqd->cfq_group_idle = cfq_group_idle; |
3927 | cfqd->cfq_latency = 1; | 4069 | cfqd->cfq_latency = 1; |
3928 | cfqd->cfq_group_isolation = 0; | ||
3929 | cfqd->hw_tag = -1; | 4070 | cfqd->hw_tag = -1; |
3930 | /* | 4071 | /* |
3931 | * we optimistically start assuming sync ops weren't delayed in last | 4072 | * we optimistically start assuming sync ops weren't delayed in last |
@@ -4001,7 +4142,6 @@ SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); | |||
4001 | SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); | 4142 | SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); |
4002 | SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); | 4143 | SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); |
4003 | SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); | 4144 | SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); |
4004 | SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0); | ||
4005 | #undef SHOW_FUNCTION | 4145 | #undef SHOW_FUNCTION |
4006 | 4146 | ||
4007 | #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ | 4147 | #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ |
@@ -4035,7 +4175,6 @@ STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); | |||
4035 | STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, | 4175 | STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, |
4036 | UINT_MAX, 0); | 4176 | UINT_MAX, 0); |
4037 | STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); | 4177 | STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); |
4038 | STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0); | ||
4039 | #undef STORE_FUNCTION | 4178 | #undef STORE_FUNCTION |
4040 | 4179 | ||
4041 | #define CFQ_ATTR(name) \ | 4180 | #define CFQ_ATTR(name) \ |
@@ -4053,7 +4192,6 @@ static struct elv_fs_entry cfq_attrs[] = { | |||
4053 | CFQ_ATTR(slice_idle), | 4192 | CFQ_ATTR(slice_idle), |
4054 | CFQ_ATTR(group_idle), | 4193 | CFQ_ATTR(group_idle), |
4055 | CFQ_ATTR(low_latency), | 4194 | CFQ_ATTR(low_latency), |
4056 | CFQ_ATTR(group_isolation), | ||
4057 | __ATTR_NULL | 4195 | __ATTR_NULL |
4058 | }; | 4196 | }; |
4059 | 4197 | ||
@@ -4068,7 +4206,6 @@ static struct elevator_type iosched_cfq = { | |||
4068 | .elevator_add_req_fn = cfq_insert_request, | 4206 | .elevator_add_req_fn = cfq_insert_request, |
4069 | .elevator_activate_req_fn = cfq_activate_request, | 4207 | .elevator_activate_req_fn = cfq_activate_request, |
4070 | .elevator_deactivate_req_fn = cfq_deactivate_request, | 4208 | .elevator_deactivate_req_fn = cfq_deactivate_request, |
4071 | .elevator_queue_empty_fn = cfq_queue_empty, | ||
4072 | .elevator_completed_req_fn = cfq_completed_request, | 4209 | .elevator_completed_req_fn = cfq_completed_request, |
4073 | .elevator_former_req_fn = elv_rb_former_request, | 4210 | .elevator_former_req_fn = elv_rb_former_request, |
4074 | .elevator_latter_req_fn = elv_rb_latter_request, | 4211 | .elevator_latter_req_fn = elv_rb_latter_request, |
@@ -4090,6 +4227,7 @@ static struct blkio_policy_type blkio_policy_cfq = { | |||
4090 | .blkio_unlink_group_fn = cfq_unlink_blkio_group, | 4227 | .blkio_unlink_group_fn = cfq_unlink_blkio_group, |
4091 | .blkio_update_group_weight_fn = cfq_update_blkio_group_weight, | 4228 | .blkio_update_group_weight_fn = cfq_update_blkio_group_weight, |
4092 | }, | 4229 | }, |
4230 | .plid = BLKIO_POLICY_PROP, | ||
4093 | }; | 4231 | }; |
4094 | #else | 4232 | #else |
4095 | static struct blkio_policy_type blkio_policy_cfq; | 4233 | static struct blkio_policy_type blkio_policy_cfq; |
diff --git a/block/cfq.h b/block/cfq.h index 93448e5a2e41..2a155927e37c 100644 --- a/block/cfq.h +++ b/block/cfq.h | |||
@@ -16,9 +16,9 @@ static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg, | |||
16 | } | 16 | } |
17 | 17 | ||
18 | static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, | 18 | static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, |
19 | unsigned long time) | 19 | unsigned long time, unsigned long unaccounted_time) |
20 | { | 20 | { |
21 | blkiocg_update_timeslice_used(blkg, time); | 21 | blkiocg_update_timeslice_used(blkg, time, unaccounted_time); |
22 | } | 22 | } |
23 | 23 | ||
24 | static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) | 24 | static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) |
@@ -69,7 +69,7 @@ static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, | |||
69 | 69 | ||
70 | static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, | 70 | static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, |
71 | struct blkio_group *blkg, void *key, dev_t dev) { | 71 | struct blkio_group *blkg, void *key, dev_t dev) { |
72 | blkiocg_add_blkio_group(blkcg, blkg, key, dev); | 72 | blkiocg_add_blkio_group(blkcg, blkg, key, dev, BLKIO_POLICY_PROP); |
73 | } | 73 | } |
74 | 74 | ||
75 | static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg) | 75 | static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg) |
@@ -85,7 +85,7 @@ static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg, | |||
85 | unsigned long dequeue) {} | 85 | unsigned long dequeue) {} |
86 | 86 | ||
87 | static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, | 87 | static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, |
88 | unsigned long time) {} | 88 | unsigned long time, unsigned long unaccounted_time) {} |
89 | static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) {} | 89 | static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) {} |
90 | static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg, | 90 | static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg, |
91 | bool direction, bool sync) {} | 91 | bool direction, bool sync) {} |
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 119f07b74dc0..cc3eb78e333a 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c | |||
@@ -8,7 +8,6 @@ | |||
8 | #include <linux/hdreg.h> | 8 | #include <linux/hdreg.h> |
9 | #include <linux/slab.h> | 9 | #include <linux/slab.h> |
10 | #include <linux/syscalls.h> | 10 | #include <linux/syscalls.h> |
11 | #include <linux/smp_lock.h> | ||
12 | #include <linux/types.h> | 11 | #include <linux/types.h> |
13 | #include <linux/uaccess.h> | 12 | #include <linux/uaccess.h> |
14 | 13 | ||
@@ -744,13 +743,13 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) | |||
744 | bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE; | 743 | bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE; |
745 | return 0; | 744 | return 0; |
746 | case BLKGETSIZE: | 745 | case BLKGETSIZE: |
747 | size = bdev->bd_inode->i_size; | 746 | size = i_size_read(bdev->bd_inode); |
748 | if ((size >> 9) > ~0UL) | 747 | if ((size >> 9) > ~0UL) |
749 | return -EFBIG; | 748 | return -EFBIG; |
750 | return compat_put_ulong(arg, size >> 9); | 749 | return compat_put_ulong(arg, size >> 9); |
751 | 750 | ||
752 | case BLKGETSIZE64_32: | 751 | case BLKGETSIZE64_32: |
753 | return compat_put_u64(arg, bdev->bd_inode->i_size); | 752 | return compat_put_u64(arg, i_size_read(bdev->bd_inode)); |
754 | 753 | ||
755 | case BLKTRACESETUP32: | 754 | case BLKTRACESETUP32: |
756 | case BLKTRACESTART: /* compatible */ | 755 | case BLKTRACESTART: /* compatible */ |
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index b547cbca7b23..5139c0ea1864 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c | |||
@@ -326,14 +326,6 @@ dispatch_request: | |||
326 | return 1; | 326 | return 1; |
327 | } | 327 | } |
328 | 328 | ||
329 | static int deadline_queue_empty(struct request_queue *q) | ||
330 | { | ||
331 | struct deadline_data *dd = q->elevator->elevator_data; | ||
332 | |||
333 | return list_empty(&dd->fifo_list[WRITE]) | ||
334 | && list_empty(&dd->fifo_list[READ]); | ||
335 | } | ||
336 | |||
337 | static void deadline_exit_queue(struct elevator_queue *e) | 329 | static void deadline_exit_queue(struct elevator_queue *e) |
338 | { | 330 | { |
339 | struct deadline_data *dd = e->elevator_data; | 331 | struct deadline_data *dd = e->elevator_data; |
@@ -445,7 +437,6 @@ static struct elevator_type iosched_deadline = { | |||
445 | .elevator_merge_req_fn = deadline_merged_requests, | 437 | .elevator_merge_req_fn = deadline_merged_requests, |
446 | .elevator_dispatch_fn = deadline_dispatch_requests, | 438 | .elevator_dispatch_fn = deadline_dispatch_requests, |
447 | .elevator_add_req_fn = deadline_add_request, | 439 | .elevator_add_req_fn = deadline_add_request, |
448 | .elevator_queue_empty_fn = deadline_queue_empty, | ||
449 | .elevator_former_req_fn = elv_rb_former_request, | 440 | .elevator_former_req_fn = elv_rb_former_request, |
450 | .elevator_latter_req_fn = elv_rb_latter_request, | 441 | .elevator_latter_req_fn = elv_rb_latter_request, |
451 | .elevator_init_fn = deadline_init_queue, | 442 | .elevator_init_fn = deadline_init_queue, |
diff --git a/block/elevator.c b/block/elevator.c index 4e11559aa2b0..b0b38ce0dcb6 100644 --- a/block/elevator.c +++ b/block/elevator.c | |||
@@ -113,7 +113,7 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio) | |||
113 | } | 113 | } |
114 | EXPORT_SYMBOL(elv_rq_merge_ok); | 114 | EXPORT_SYMBOL(elv_rq_merge_ok); |
115 | 115 | ||
116 | static inline int elv_try_merge(struct request *__rq, struct bio *bio) | 116 | int elv_try_merge(struct request *__rq, struct bio *bio) |
117 | { | 117 | { |
118 | int ret = ELEVATOR_NO_MERGE; | 118 | int ret = ELEVATOR_NO_MERGE; |
119 | 119 | ||
@@ -155,13 +155,8 @@ static struct elevator_type *elevator_get(const char *name) | |||
155 | 155 | ||
156 | e = elevator_find(name); | 156 | e = elevator_find(name); |
157 | if (!e) { | 157 | if (!e) { |
158 | char elv[ELV_NAME_MAX + strlen("-iosched")]; | ||
159 | |||
160 | spin_unlock(&elv_list_lock); | 158 | spin_unlock(&elv_list_lock); |
161 | 159 | request_module("%s-iosched", name); | |
162 | snprintf(elv, sizeof(elv), "%s-iosched", name); | ||
163 | |||
164 | request_module("%s", elv); | ||
165 | spin_lock(&elv_list_lock); | 160 | spin_lock(&elv_list_lock); |
166 | e = elevator_find(name); | 161 | e = elevator_find(name); |
167 | } | 162 | } |
@@ -429,7 +424,7 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq) | |||
429 | q->nr_sorted--; | 424 | q->nr_sorted--; |
430 | 425 | ||
431 | boundary = q->end_sector; | 426 | boundary = q->end_sector; |
432 | stop_flags = REQ_SOFTBARRIER | REQ_HARDBARRIER | REQ_STARTED; | 427 | stop_flags = REQ_SOFTBARRIER | REQ_STARTED; |
433 | list_for_each_prev(entry, &q->queue_head) { | 428 | list_for_each_prev(entry, &q->queue_head) { |
434 | struct request *pos = list_entry_rq(entry); | 429 | struct request *pos = list_entry_rq(entry); |
435 | 430 | ||
@@ -519,6 +514,40 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) | |||
519 | return ELEVATOR_NO_MERGE; | 514 | return ELEVATOR_NO_MERGE; |
520 | } | 515 | } |
521 | 516 | ||
517 | /* | ||
518 | * Attempt to do an insertion back merge. Only check for the case where | ||
519 | * we can append 'rq' to an existing request, so we can throw 'rq' away | ||
520 | * afterwards. | ||
521 | * | ||
522 | * Returns true if we merged, false otherwise | ||
523 | */ | ||
524 | static bool elv_attempt_insert_merge(struct request_queue *q, | ||
525 | struct request *rq) | ||
526 | { | ||
527 | struct request *__rq; | ||
528 | |||
529 | if (blk_queue_nomerges(q)) | ||
530 | return false; | ||
531 | |||
532 | /* | ||
533 | * First try one-hit cache. | ||
534 | */ | ||
535 | if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq)) | ||
536 | return true; | ||
537 | |||
538 | if (blk_queue_noxmerges(q)) | ||
539 | return false; | ||
540 | |||
541 | /* | ||
542 | * See if our hash lookup can find a potential backmerge. | ||
543 | */ | ||
544 | __rq = elv_rqhash_find(q, blk_rq_pos(rq)); | ||
545 | if (__rq && blk_attempt_req_merge(q, __rq, rq)) | ||
546 | return true; | ||
547 | |||
548 | return false; | ||
549 | } | ||
550 | |||
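elv_attempt_insert_merge() above tries two increasingly expensive back-merge checks: the one-hit last_merge cache first, then a hash lookup keyed by where existing requests end, so a request whose start sector lines up can be appended and thrown away. A compact userspace sketch of that two-step lookup; a tiny direct-mapped table replaces the kernel's rq hash, the merge test is reduced to "previous end == new start", and the rehash step plays the role the kernel's elv_rqhash_reposition() serves:

```c
#include <stdbool.h>
#include <stddef.h>

struct req {
	unsigned long start, nsect;	/* request covers [start, start + nsect) */
	struct req *hnext;		/* hash chain, keyed by end sector */
};

#define NBUCKETS 64
static struct req *hash[NBUCKETS];
static struct req *last_merge;		/* one-hit cache, like q->last_merge */

static unsigned long req_end(const struct req *rq)
{
	return rq->start + rq->nsect;
}

static void hash_add(struct req *rq)
{
	unsigned long b = req_end(rq) % NBUCKETS;

	rq->hnext = hash[b];
	hash[b] = rq;
}

static void hash_del(struct req *rq)
{
	struct req **pp = &hash[req_end(rq) % NBUCKETS];

	while (*pp && *pp != rq)
		pp = &(*pp)->hnext;
	if (*pp)
		*pp = rq->hnext;
}

static struct req *hash_find_end(unsigned long sector)
{
	for (struct req *rq = hash[sector % NBUCKETS]; rq; rq = rq->hnext)
		if (req_end(rq) == sector)
			return rq;
	return NULL;
}

/* Absorb 'rq' into 'prev'; the end sector changes, so rehash 'prev'. */
static void back_merge(struct req *prev, const struct req *rq)
{
	hash_del(prev);
	prev->nsect += rq->nsect;
	hash_add(prev);
	last_merge = prev;
}

/* Returns true if 'rq' was merged into an existing request and can be dropped. */
static bool attempt_insert_merge(struct req *rq)
{
	if (last_merge && req_end(last_merge) == rq->start) {
		back_merge(last_merge, rq);		/* one-hit cache */
		return true;
	}

	struct req *prev = hash_find_end(rq->start);	/* hash back-merge */
	if (prev) {
		back_merge(prev, rq);
		return true;
	}

	hash_add(rq);					/* no merge: queue as usual */
	last_merge = rq;
	return false;
}
```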
522 | void elv_merged_request(struct request_queue *q, struct request *rq, int type) | 551 | void elv_merged_request(struct request_queue *q, struct request *rq, int type) |
523 | { | 552 | { |
524 | struct elevator_queue *e = q->elevator; | 553 | struct elevator_queue *e = q->elevator; |
@@ -536,14 +565,18 @@ void elv_merge_requests(struct request_queue *q, struct request *rq, | |||
536 | struct request *next) | 565 | struct request *next) |
537 | { | 566 | { |
538 | struct elevator_queue *e = q->elevator; | 567 | struct elevator_queue *e = q->elevator; |
568 | const int next_sorted = next->cmd_flags & REQ_SORTED; | ||
539 | 569 | ||
540 | if (e->ops->elevator_merge_req_fn) | 570 | if (next_sorted && e->ops->elevator_merge_req_fn) |
541 | e->ops->elevator_merge_req_fn(q, rq, next); | 571 | e->ops->elevator_merge_req_fn(q, rq, next); |
542 | 572 | ||
543 | elv_rqhash_reposition(q, rq); | 573 | elv_rqhash_reposition(q, rq); |
544 | elv_rqhash_del(q, next); | ||
545 | 574 | ||
546 | q->nr_sorted--; | 575 | if (next_sorted) { |
576 | elv_rqhash_del(q, next); | ||
577 | q->nr_sorted--; | ||
578 | } | ||
579 | |||
547 | q->last_merge = rq; | 580 | q->last_merge = rq; |
548 | } | 581 | } |
549 | 582 | ||
@@ -570,7 +603,7 @@ void elv_requeue_request(struct request_queue *q, struct request *rq) | |||
570 | 603 | ||
571 | rq->cmd_flags &= ~REQ_STARTED; | 604 | rq->cmd_flags &= ~REQ_STARTED; |
572 | 605 | ||
573 | elv_insert(q, rq, ELEVATOR_INSERT_REQUEUE); | 606 | __elv_add_request(q, rq, ELEVATOR_INSERT_REQUEUE); |
574 | } | 607 | } |
575 | 608 | ||
576 | void elv_drain_elevator(struct request_queue *q) | 609 | void elv_drain_elevator(struct request_queue *q) |
@@ -615,20 +648,28 @@ void elv_quiesce_end(struct request_queue *q) | |||
615 | queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); | 648 | queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); |
616 | } | 649 | } |
617 | 650 | ||
618 | void elv_insert(struct request_queue *q, struct request *rq, int where) | 651 | void __elv_add_request(struct request_queue *q, struct request *rq, int where) |
619 | { | 652 | { |
620 | struct list_head *pos; | ||
621 | unsigned ordseq; | ||
622 | int unplug_it = 1; | ||
623 | |||
624 | trace_block_rq_insert(q, rq); | 653 | trace_block_rq_insert(q, rq); |
625 | 654 | ||
626 | rq->q = q; | 655 | rq->q = q; |
627 | 656 | ||
657 | if (rq->cmd_flags & REQ_SOFTBARRIER) { | ||
658 | /* barriers are scheduling boundary, update end_sector */ | ||
659 | if (rq->cmd_type == REQ_TYPE_FS || | ||
660 | (rq->cmd_flags & REQ_DISCARD)) { | ||
661 | q->end_sector = rq_end_sector(rq); | ||
662 | q->boundary_rq = rq; | ||
663 | } | ||
664 | } else if (!(rq->cmd_flags & REQ_ELVPRIV) && | ||
665 | (where == ELEVATOR_INSERT_SORT || | ||
666 | where == ELEVATOR_INSERT_SORT_MERGE)) | ||
667 | where = ELEVATOR_INSERT_BACK; | ||
668 | |||
628 | switch (where) { | 669 | switch (where) { |
670 | case ELEVATOR_INSERT_REQUEUE: | ||
629 | case ELEVATOR_INSERT_FRONT: | 671 | case ELEVATOR_INSERT_FRONT: |
630 | rq->cmd_flags |= REQ_SOFTBARRIER; | 672 | rq->cmd_flags |= REQ_SOFTBARRIER; |
631 | |||
632 | list_add(&rq->queuelist, &q->queue_head); | 673 | list_add(&rq->queuelist, &q->queue_head); |
633 | break; | 674 | break; |
634 | 675 | ||
@@ -649,6 +690,14 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) | |||
649 | __blk_run_queue(q); | 690 | __blk_run_queue(q); |
650 | break; | 691 | break; |
651 | 692 | ||
693 | case ELEVATOR_INSERT_SORT_MERGE: | ||
694 | /* | ||
695 | * If we succeed in merging this request with one in the | ||
696 | * queue already, we are done - rq has now been freed, | ||
697 | * so no need to do anything further. | ||
698 | */ | ||
699 | if (elv_attempt_insert_merge(q, rq)) | ||
700 | break; | ||
652 | case ELEVATOR_INSERT_SORT: | 701 | case ELEVATOR_INSERT_SORT: |
653 | BUG_ON(rq->cmd_type != REQ_TYPE_FS && | 702 | BUG_ON(rq->cmd_type != REQ_TYPE_FS && |
654 | !(rq->cmd_flags & REQ_DISCARD)); | 703 | !(rq->cmd_flags & REQ_DISCARD)); |
@@ -668,115 +717,28 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) | |||
668 | q->elevator->ops->elevator_add_req_fn(q, rq); | 717 | q->elevator->ops->elevator_add_req_fn(q, rq); |
669 | break; | 718 | break; |
670 | 719 | ||
671 | case ELEVATOR_INSERT_REQUEUE: | 720 | case ELEVATOR_INSERT_FLUSH: |
672 | /* | ||
673 | * If ordered flush isn't in progress, we do front | ||
674 | * insertion; otherwise, requests should be requeued | ||
675 | * in ordseq order. | ||
676 | */ | ||
677 | rq->cmd_flags |= REQ_SOFTBARRIER; | 721 | rq->cmd_flags |= REQ_SOFTBARRIER; |
678 | 722 | blk_insert_flush(rq); | |
679 | /* | ||
680 | * Most requeues happen because of a busy condition, | ||
681 | * don't force unplug of the queue for that case. | ||
682 | */ | ||
683 | unplug_it = 0; | ||
684 | |||
685 | if (q->ordseq == 0) { | ||
686 | list_add(&rq->queuelist, &q->queue_head); | ||
687 | break; | ||
688 | } | ||
689 | |||
690 | ordseq = blk_ordered_req_seq(rq); | ||
691 | |||
692 | list_for_each(pos, &q->queue_head) { | ||
693 | struct request *pos_rq = list_entry_rq(pos); | ||
694 | if (ordseq <= blk_ordered_req_seq(pos_rq)) | ||
695 | break; | ||
696 | } | ||
697 | |||
698 | list_add_tail(&rq->queuelist, pos); | ||
699 | break; | 723 | break; |
700 | |||
701 | default: | 724 | default: |
702 | printk(KERN_ERR "%s: bad insertion point %d\n", | 725 | printk(KERN_ERR "%s: bad insertion point %d\n", |
703 | __func__, where); | 726 | __func__, where); |
704 | BUG(); | 727 | BUG(); |
705 | } | 728 | } |
706 | |||
707 | if (unplug_it && blk_queue_plugged(q)) { | ||
708 | int nrq = q->rq.count[BLK_RW_SYNC] + q->rq.count[BLK_RW_ASYNC] | ||
709 | - queue_in_flight(q); | ||
710 | |||
711 | if (nrq >= q->unplug_thresh) | ||
712 | __generic_unplug_device(q); | ||
713 | } | ||
714 | } | ||
715 | |||
716 | void __elv_add_request(struct request_queue *q, struct request *rq, int where, | ||
717 | int plug) | ||
718 | { | ||
719 | if (q->ordcolor) | ||
720 | rq->cmd_flags |= REQ_ORDERED_COLOR; | ||
721 | |||
722 | if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) { | ||
723 | /* | ||
724 | * toggle ordered color | ||
725 | */ | ||
726 | if (rq->cmd_flags & REQ_HARDBARRIER) | ||
727 | q->ordcolor ^= 1; | ||
728 | |||
729 | /* | ||
730 | * barriers implicitly indicate back insertion | ||
731 | */ | ||
732 | if (where == ELEVATOR_INSERT_SORT) | ||
733 | where = ELEVATOR_INSERT_BACK; | ||
734 | |||
735 | /* | ||
736 | * this request is scheduling boundary, update | ||
737 | * end_sector | ||
738 | */ | ||
739 | if (rq->cmd_type == REQ_TYPE_FS || | ||
740 | (rq->cmd_flags & REQ_DISCARD)) { | ||
741 | q->end_sector = rq_end_sector(rq); | ||
742 | q->boundary_rq = rq; | ||
743 | } | ||
744 | } else if (!(rq->cmd_flags & REQ_ELVPRIV) && | ||
745 | where == ELEVATOR_INSERT_SORT) | ||
746 | where = ELEVATOR_INSERT_BACK; | ||
747 | |||
748 | if (plug) | ||
749 | blk_plug_device(q); | ||
750 | |||
751 | elv_insert(q, rq, where); | ||
752 | } | 729 | } |
753 | EXPORT_SYMBOL(__elv_add_request); | 730 | EXPORT_SYMBOL(__elv_add_request); |
754 | 731 | ||
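The rewritten __elv_add_request() above folds the old barrier handling into a short normalization step before the switch: a SOFTBARRIER request updates the scheduling boundary, and a request that never went through the elevator (no REQ_ELVPRIV) is demoted from a sort-style insertion to a plain back insertion. A minimal sketch of that "normalize where, then dispatch" shape (enum values and flag bits are invented for the example):

```c
#include <stdio.h>

enum insert_where { INSERT_FRONT, INSERT_BACK, INSERT_SORT, INSERT_SORT_MERGE, INSERT_FLUSH };

#define F_SOFTBARRIER	0x1	/* invented flag bits, not the kernel's REQ_* values */
#define F_ELVPRIV	0x2

static enum insert_where normalize_where(unsigned int flags, enum insert_where where,
					 unsigned long *boundary, unsigned long end_sector)
{
	if (flags & F_SOFTBARRIER) {
		/* Barriers are a scheduling boundary: remember where they end. */
		*boundary = end_sector;
	} else if (!(flags & F_ELVPRIV) &&
		   (where == INSERT_SORT || where == INSERT_SORT_MERGE)) {
		/* A request the elevator never saw cannot be sorted: send it to the back. */
		where = INSERT_BACK;
	}
	return where;
}

int main(void)
{
	unsigned long boundary = 0;
	enum insert_where w = normalize_where(0, INSERT_SORT, &boundary, 2048);

	printf("normalized: %d (INSERT_BACK is %d)\n", w, INSERT_BACK);
	return 0;
}
```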
755 | void elv_add_request(struct request_queue *q, struct request *rq, int where, | 732 | void elv_add_request(struct request_queue *q, struct request *rq, int where) |
756 | int plug) | ||
757 | { | 733 | { |
758 | unsigned long flags; | 734 | unsigned long flags; |
759 | 735 | ||
760 | spin_lock_irqsave(q->queue_lock, flags); | 736 | spin_lock_irqsave(q->queue_lock, flags); |
761 | __elv_add_request(q, rq, where, plug); | 737 | __elv_add_request(q, rq, where); |
762 | spin_unlock_irqrestore(q->queue_lock, flags); | 738 | spin_unlock_irqrestore(q->queue_lock, flags); |
763 | } | 739 | } |
764 | EXPORT_SYMBOL(elv_add_request); | 740 | EXPORT_SYMBOL(elv_add_request); |
765 | 741 | ||
766 | int elv_queue_empty(struct request_queue *q) | ||
767 | { | ||
768 | struct elevator_queue *e = q->elevator; | ||
769 | |||
770 | if (!list_empty(&q->queue_head)) | ||
771 | return 0; | ||
772 | |||
773 | if (e->ops->elevator_queue_empty_fn) | ||
774 | return e->ops->elevator_queue_empty_fn(q); | ||
775 | |||
776 | return 1; | ||
777 | } | ||
778 | EXPORT_SYMBOL(elv_queue_empty); | ||
779 | |||
780 | struct request *elv_latter_request(struct request_queue *q, struct request *rq) | 742 | struct request *elv_latter_request(struct request_queue *q, struct request *rq) |
781 | { | 743 | { |
782 | struct elevator_queue *e = q->elevator; | 744 | struct elevator_queue *e = q->elevator; |
@@ -802,7 +764,7 @@ int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) | |||
802 | if (e->ops->elevator_set_req_fn) | 764 | if (e->ops->elevator_set_req_fn) |
803 | return e->ops->elevator_set_req_fn(q, rq, gfp_mask); | 765 | return e->ops->elevator_set_req_fn(q, rq, gfp_mask); |
804 | 766 | ||
805 | rq->elevator_private = NULL; | 767 | rq->elevator_private[0] = NULL; |
806 | return 0; | 768 | return 0; |
807 | } | 769 | } |
808 | 770 | ||
@@ -828,6 +790,8 @@ void elv_abort_queue(struct request_queue *q) | |||
828 | { | 790 | { |
829 | struct request *rq; | 791 | struct request *rq; |
830 | 792 | ||
793 | blk_abort_flushes(q); | ||
794 | |||
831 | while (!list_empty(&q->queue_head)) { | 795 | while (!list_empty(&q->queue_head)) { |
832 | rq = list_entry_rq(q->queue_head.next); | 796 | rq = list_entry_rq(q->queue_head.next); |
833 | rq->cmd_flags |= REQ_QUIET; | 797 | rq->cmd_flags |= REQ_QUIET; |
@@ -855,24 +819,6 @@ void elv_completed_request(struct request_queue *q, struct request *rq) | |||
855 | e->ops->elevator_completed_req_fn) | 819 | e->ops->elevator_completed_req_fn) |
856 | e->ops->elevator_completed_req_fn(q, rq); | 820 | e->ops->elevator_completed_req_fn(q, rq); |
857 | } | 821 | } |
858 | |||
859 | /* | ||
860 | * Check if the queue is waiting for fs requests to be | ||
861 | * drained for flush sequence. | ||
862 | */ | ||
863 | if (unlikely(q->ordseq)) { | ||
864 | struct request *next = NULL; | ||
865 | |||
866 | if (!list_empty(&q->queue_head)) | ||
867 | next = list_entry_rq(q->queue_head.next); | ||
868 | |||
869 | if (!queue_in_flight(q) && | ||
870 | blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN && | ||
871 | (!next || blk_ordered_req_seq(next) > QUEUE_ORDSEQ_DRAIN)) { | ||
872 | blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0); | ||
873 | __blk_run_queue(q); | ||
874 | } | ||
875 | } | ||
876 | } | 822 | } |
877 | 823 | ||
878 | #define to_elv(atr) container_of((atr), struct elv_fs_entry, attr) | 824 | #define to_elv(atr) container_of((atr), struct elv_fs_entry, attr) |
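The elevator.c hunks above drop the ordered-sequence bookkeeping (flush requests now go through ELEVATOR_INSERT_FLUSH and blk_insert_flush()) and remove the plug argument from elv_add_request(), so callers no longer decide about plugging. A minimal sketch of a caller after this change, assuming an already-prepared queue and request; the function name example_insert_request is illustrative only:

	#include <linux/blkdev.h>
	#include <linux/elevator.h>

	/* Insert an already-prepared request at the back of the dispatch list.
	 * With the three-argument API shown above, only the insertion point is
	 * passed; elv_add_request() takes the queue lock itself. */
	static void example_insert_request(struct request_queue *q,
					   struct request *rq)
	{
		elv_add_request(q, rq, ELEVATOR_INSERT_BACK);
	}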
diff --git a/block/genhd.c b/block/genhd.c index 59a2db6fecef..3608289c8ecd 100644 --- a/block/genhd.c +++ b/block/genhd.c | |||
@@ -18,13 +18,12 @@ | |||
18 | #include <linux/buffer_head.h> | 18 | #include <linux/buffer_head.h> |
19 | #include <linux/mutex.h> | 19 | #include <linux/mutex.h> |
20 | #include <linux/idr.h> | 20 | #include <linux/idr.h> |
21 | #include <linux/log2.h> | ||
21 | 22 | ||
22 | #include "blk.h" | 23 | #include "blk.h" |
23 | 24 | ||
24 | static DEFINE_MUTEX(block_class_lock); | 25 | static DEFINE_MUTEX(block_class_lock); |
25 | #ifndef CONFIG_SYSFS_DEPRECATED | ||
26 | struct kobject *block_depr; | 26 | struct kobject *block_depr; |
27 | #endif | ||
28 | 27 | ||
29 | /* for extended dynamic devt allocation, currently only one major is used */ | 28 | /* for extended dynamic devt allocation, currently only one major is used */ |
30 | #define MAX_EXT_DEVT (1 << MINORBITS) | 29 | #define MAX_EXT_DEVT (1 << MINORBITS) |
@@ -37,6 +36,10 @@ static DEFINE_IDR(ext_devt_idr); | |||
37 | 36 | ||
38 | static struct device_type disk_type; | 37 | static struct device_type disk_type; |
39 | 38 | ||
39 | static void disk_add_events(struct gendisk *disk); | ||
40 | static void disk_del_events(struct gendisk *disk); | ||
41 | static void disk_release_events(struct gendisk *disk); | ||
42 | |||
40 | /** | 43 | /** |
41 | * disk_get_part - get partition | 44 | * disk_get_part - get partition |
42 | * @disk: disk to look partition from | 45 | * @disk: disk to look partition from |
@@ -241,7 +244,7 @@ static struct blk_major_name { | |||
241 | } *major_names[BLKDEV_MAJOR_HASH_SIZE]; | 244 | } *major_names[BLKDEV_MAJOR_HASH_SIZE]; |
242 | 245 | ||
243 | /* index in the above - for now: assume no multimajor ranges */ | 246 | /* index in the above - for now: assume no multimajor ranges */ |
244 | static inline int major_to_index(int major) | 247 | static inline int major_to_index(unsigned major) |
245 | { | 248 | { |
246 | return major % BLKDEV_MAJOR_HASH_SIZE; | 249 | return major % BLKDEV_MAJOR_HASH_SIZE; |
247 | } | 250 | } |
@@ -504,6 +507,64 @@ static int exact_lock(dev_t devt, void *data) | |||
504 | return 0; | 507 | return 0; |
505 | } | 508 | } |
506 | 509 | ||
510 | void register_disk(struct gendisk *disk) | ||
511 | { | ||
512 | struct device *ddev = disk_to_dev(disk); | ||
513 | struct block_device *bdev; | ||
514 | struct disk_part_iter piter; | ||
515 | struct hd_struct *part; | ||
516 | int err; | ||
517 | |||
518 | ddev->parent = disk->driverfs_dev; | ||
519 | |||
520 | dev_set_name(ddev, disk->disk_name); | ||
521 | |||
522 | /* delay uevents, until we scanned partition table */ | ||
523 | dev_set_uevent_suppress(ddev, 1); | ||
524 | |||
525 | if (device_add(ddev)) | ||
526 | return; | ||
527 | if (!sysfs_deprecated) { | ||
528 | err = sysfs_create_link(block_depr, &ddev->kobj, | ||
529 | kobject_name(&ddev->kobj)); | ||
530 | if (err) { | ||
531 | device_del(ddev); | ||
532 | return; | ||
533 | } | ||
534 | } | ||
535 | disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); | ||
536 | disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); | ||
537 | |||
538 | /* No minors to use for partitions */ | ||
539 | if (!disk_partitionable(disk)) | ||
540 | goto exit; | ||
541 | |||
542 | /* No such device (e.g., media were just removed) */ | ||
543 | if (!get_capacity(disk)) | ||
544 | goto exit; | ||
545 | |||
546 | bdev = bdget_disk(disk, 0); | ||
547 | if (!bdev) | ||
548 | goto exit; | ||
549 | |||
550 | bdev->bd_invalidated = 1; | ||
551 | err = blkdev_get(bdev, FMODE_READ, NULL); | ||
552 | if (err < 0) | ||
553 | goto exit; | ||
554 | blkdev_put(bdev, FMODE_READ); | ||
555 | |||
556 | exit: | ||
557 | /* announce disk after possible partitions are created */ | ||
558 | dev_set_uevent_suppress(ddev, 0); | ||
559 | kobject_uevent(&ddev->kobj, KOBJ_ADD); | ||
560 | |||
561 | /* announce possible partitions */ | ||
562 | disk_part_iter_init(&piter, disk, 0); | ||
563 | while ((part = disk_part_iter_next(&piter))) | ||
564 | kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); | ||
565 | disk_part_iter_exit(&piter); | ||
566 | } | ||
567 | |||
507 | /** | 568 | /** |
508 | * add_disk - add partitioning information to kernel list | 569 | * add_disk - add partitioning information to kernel list |
509 | * @disk: per-device partitioning information | 570 | * @disk: per-device partitioning information |
@@ -541,28 +602,60 @@ void add_disk(struct gendisk *disk) | |||
541 | disk->major = MAJOR(devt); | 602 | disk->major = MAJOR(devt); |
542 | disk->first_minor = MINOR(devt); | 603 | disk->first_minor = MINOR(devt); |
543 | 604 | ||
605 | /* Register BDI before referencing it from bdev */ | ||
606 | bdi = &disk->queue->backing_dev_info; | ||
607 | bdi_register_dev(bdi, disk_devt(disk)); | ||
608 | |||
544 | blk_register_region(disk_devt(disk), disk->minors, NULL, | 609 | blk_register_region(disk_devt(disk), disk->minors, NULL, |
545 | exact_match, exact_lock, disk); | 610 | exact_match, exact_lock, disk); |
546 | register_disk(disk); | 611 | register_disk(disk); |
547 | blk_register_queue(disk); | 612 | blk_register_queue(disk); |
548 | 613 | ||
549 | bdi = &disk->queue->backing_dev_info; | ||
550 | bdi_register_dev(bdi, disk_devt(disk)); | ||
551 | retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, | 614 | retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, |
552 | "bdi"); | 615 | "bdi"); |
553 | WARN_ON(retval); | 616 | WARN_ON(retval); |
554 | } | ||
555 | 617 | ||
618 | disk_add_events(disk); | ||
619 | } | ||
556 | EXPORT_SYMBOL(add_disk); | 620 | EXPORT_SYMBOL(add_disk); |
557 | EXPORT_SYMBOL(del_gendisk); /* in partitions/check.c */ | ||
558 | 621 | ||
559 | void unlink_gendisk(struct gendisk *disk) | 622 | void del_gendisk(struct gendisk *disk) |
560 | { | 623 | { |
624 | struct disk_part_iter piter; | ||
625 | struct hd_struct *part; | ||
626 | |||
627 | disk_del_events(disk); | ||
628 | |||
629 | /* invalidate stuff */ | ||
630 | disk_part_iter_init(&piter, disk, | ||
631 | DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); | ||
632 | while ((part = disk_part_iter_next(&piter))) { | ||
633 | invalidate_partition(disk, part->partno); | ||
634 | delete_partition(disk, part->partno); | ||
635 | } | ||
636 | disk_part_iter_exit(&piter); | ||
637 | |||
638 | invalidate_partition(disk, 0); | ||
639 | blk_free_devt(disk_to_dev(disk)->devt); | ||
640 | set_capacity(disk, 0); | ||
641 | disk->flags &= ~GENHD_FL_UP; | ||
642 | |||
561 | sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); | 643 | sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); |
562 | bdi_unregister(&disk->queue->backing_dev_info); | 644 | bdi_unregister(&disk->queue->backing_dev_info); |
563 | blk_unregister_queue(disk); | 645 | blk_unregister_queue(disk); |
564 | blk_unregister_region(disk_devt(disk), disk->minors); | 646 | blk_unregister_region(disk_devt(disk), disk->minors); |
647 | |||
648 | part_stat_set_all(&disk->part0, 0); | ||
649 | disk->part0.stamp = 0; | ||
650 | |||
651 | kobject_put(disk->part0.holder_dir); | ||
652 | kobject_put(disk->slave_dir); | ||
653 | disk->driverfs_dev = NULL; | ||
654 | if (!sysfs_deprecated) | ||
655 | sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); | ||
656 | device_del(disk_to_dev(disk)); | ||
565 | } | 657 | } |
658 | EXPORT_SYMBOL(del_gendisk); | ||
566 | 659 | ||
567 | /** | 660 | /** |
568 | * get_gendisk - get partitioning information for a given device | 661 | * get_gendisk - get partitioning information for a given device |
@@ -642,10 +735,11 @@ void __init printk_all_partitions(void) | |||
642 | struct hd_struct *part; | 735 | struct hd_struct *part; |
643 | char name_buf[BDEVNAME_SIZE]; | 736 | char name_buf[BDEVNAME_SIZE]; |
644 | char devt_buf[BDEVT_SIZE]; | 737 | char devt_buf[BDEVT_SIZE]; |
738 | u8 uuid[PARTITION_META_INFO_UUIDLTH * 2 + 1]; | ||
645 | 739 | ||
646 | /* | 740 | /* |
647 | * Don't show empty devices or things that have been | 741 | * Don't show empty devices or things that have been |
648 | * surpressed | 742 | * suppressed |
649 | */ | 743 | */ |
650 | if (get_capacity(disk) == 0 || | 744 | if (get_capacity(disk) == 0 || |
651 | (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)) | 745 | (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)) |
@@ -660,10 +754,14 @@ void __init printk_all_partitions(void) | |||
660 | while ((part = disk_part_iter_next(&piter))) { | 754 | while ((part = disk_part_iter_next(&piter))) { |
661 | bool is_part0 = part == &disk->part0; | 755 | bool is_part0 = part == &disk->part0; |
662 | 756 | ||
663 | printk("%s%s %10llu %s", is_part0 ? "" : " ", | 757 | uuid[0] = 0; |
758 | if (part->info) | ||
759 | part_unpack_uuid(part->info->uuid, uuid); | ||
760 | |||
761 | printk("%s%s %10llu %s %s", is_part0 ? "" : " ", | ||
664 | bdevt_str(part_devt(part), devt_buf), | 762 | bdevt_str(part_devt(part), devt_buf), |
665 | (unsigned long long)part->nr_sects >> 1, | 763 | (unsigned long long)part->nr_sects >> 1, |
666 | disk_name(disk, part->partno, name_buf)); | 764 | disk_name(disk, part->partno, name_buf), uuid); |
667 | if (is_part0) { | 765 | if (is_part0) { |
668 | if (disk->driverfs_dev != NULL && | 766 | if (disk->driverfs_dev != NULL && |
669 | disk->driverfs_dev->driver != NULL) | 767 | disk->driverfs_dev->driver != NULL) |
@@ -730,7 +828,7 @@ static void *show_partition_start(struct seq_file *seqf, loff_t *pos) | |||
730 | static void *p; | 828 | static void *p; |
731 | 829 | ||
732 | p = disk_seqf_start(seqf, pos); | 830 | p = disk_seqf_start(seqf, pos); |
733 | if (!IS_ERR(p) && p && !*pos) | 831 | if (!IS_ERR_OR_NULL(p) && !*pos) |
734 | seq_puts(seqf, "major minor #blocks name\n\n"); | 832 | seq_puts(seqf, "major minor #blocks name\n\n"); |
735 | return p; | 833 | return p; |
736 | } | 834 | } |
@@ -803,10 +901,9 @@ static int __init genhd_device_init(void) | |||
803 | 901 | ||
804 | register_blkdev(BLOCK_EXT_MAJOR, "blkext"); | 902 | register_blkdev(BLOCK_EXT_MAJOR, "blkext"); |
805 | 903 | ||
806 | #ifndef CONFIG_SYSFS_DEPRECATED | ||
807 | /* create top-level block dir */ | 904 | /* create top-level block dir */ |
808 | block_depr = kobject_create_and_add("block", NULL); | 905 | if (!sysfs_deprecated) |
809 | #endif | 906 | block_depr = kobject_create_and_add("block", NULL); |
810 | return 0; | 907 | return 0; |
811 | } | 908 | } |
812 | 909 | ||
@@ -1001,9 +1098,11 @@ static void disk_release(struct device *dev) | |||
1001 | { | 1098 | { |
1002 | struct gendisk *disk = dev_to_disk(dev); | 1099 | struct gendisk *disk = dev_to_disk(dev); |
1003 | 1100 | ||
1101 | disk_release_events(disk); | ||
1004 | kfree(disk->random); | 1102 | kfree(disk->random); |
1005 | disk_replace_part_tbl(disk, NULL); | 1103 | disk_replace_part_tbl(disk, NULL); |
1006 | free_part_stats(&disk->part0); | 1104 | free_part_stats(&disk->part0); |
1105 | free_part_info(&disk->part0); | ||
1007 | kfree(disk); | 1106 | kfree(disk); |
1008 | } | 1107 | } |
1009 | struct class block_class = { | 1108 | struct class block_class = { |
@@ -1059,14 +1158,14 @@ static int diskstats_show(struct seq_file *seqf, void *v) | |||
1059 | "%u %lu %lu %llu %u %u %u %u\n", | 1158 | "%u %lu %lu %llu %u %u %u %u\n", |
1060 | MAJOR(part_devt(hd)), MINOR(part_devt(hd)), | 1159 | MAJOR(part_devt(hd)), MINOR(part_devt(hd)), |
1061 | disk_name(gp, hd->partno, buf), | 1160 | disk_name(gp, hd->partno, buf), |
1062 | part_stat_read(hd, ios[0]), | 1161 | part_stat_read(hd, ios[READ]), |
1063 | part_stat_read(hd, merges[0]), | 1162 | part_stat_read(hd, merges[READ]), |
1064 | (unsigned long long)part_stat_read(hd, sectors[0]), | 1163 | (unsigned long long)part_stat_read(hd, sectors[READ]), |
1065 | jiffies_to_msecs(part_stat_read(hd, ticks[0])), | 1164 | jiffies_to_msecs(part_stat_read(hd, ticks[READ])), |
1066 | part_stat_read(hd, ios[1]), | 1165 | part_stat_read(hd, ios[WRITE]), |
1067 | part_stat_read(hd, merges[1]), | 1166 | part_stat_read(hd, merges[WRITE]), |
1068 | (unsigned long long)part_stat_read(hd, sectors[1]), | 1167 | (unsigned long long)part_stat_read(hd, sectors[WRITE]), |
1069 | jiffies_to_msecs(part_stat_read(hd, ticks[1])), | 1168 | jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])), |
1070 | part_in_flight(hd), | 1169 | part_in_flight(hd), |
1071 | jiffies_to_msecs(part_stat_read(hd, io_ticks)), | 1170 | jiffies_to_msecs(part_stat_read(hd, io_ticks)), |
1072 | jiffies_to_msecs(part_stat_read(hd, time_in_queue)) | 1171 | jiffies_to_msecs(part_stat_read(hd, time_in_queue)) |
@@ -1105,29 +1204,6 @@ static int __init proc_genhd_init(void) | |||
1105 | module_init(proc_genhd_init); | 1204 | module_init(proc_genhd_init); |
1106 | #endif /* CONFIG_PROC_FS */ | 1205 | #endif /* CONFIG_PROC_FS */ |
1107 | 1206 | ||
1108 | static void media_change_notify_thread(struct work_struct *work) | ||
1109 | { | ||
1110 | struct gendisk *gd = container_of(work, struct gendisk, async_notify); | ||
1111 | char event[] = "MEDIA_CHANGE=1"; | ||
1112 | char *envp[] = { event, NULL }; | ||
1113 | |||
1114 | /* | ||
1115 | * set environment vars to indicate which event this is for | ||
1116 | * so that user space will know to go check the media status. | ||
1117 | */ | ||
1118 | kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp); | ||
1119 | put_device(gd->driverfs_dev); | ||
1120 | } | ||
1121 | |||
1122 | #if 0 | ||
1123 | void genhd_media_change_notify(struct gendisk *disk) | ||
1124 | { | ||
1125 | get_device(disk->driverfs_dev); | ||
1126 | schedule_work(&disk->async_notify); | ||
1127 | } | ||
1128 | EXPORT_SYMBOL_GPL(genhd_media_change_notify); | ||
1129 | #endif /* 0 */ | ||
1130 | |||
1131 | dev_t blk_lookup_devt(const char *name, int partno) | 1207 | dev_t blk_lookup_devt(const char *name, int partno) |
1132 | { | 1208 | { |
1133 | dev_t devt = MKDEV(0, 0); | 1209 | dev_t devt = MKDEV(0, 0); |
@@ -1188,13 +1264,13 @@ struct gendisk *alloc_disk_node(int minors, int node_id) | |||
1188 | } | 1264 | } |
1189 | disk->part_tbl->part[0] = &disk->part0; | 1265 | disk->part_tbl->part[0] = &disk->part0; |
1190 | 1266 | ||
1267 | hd_ref_init(&disk->part0); | ||
1268 | |||
1191 | disk->minors = minors; | 1269 | disk->minors = minors; |
1192 | rand_initialize_disk(disk); | 1270 | rand_initialize_disk(disk); |
1193 | disk_to_dev(disk)->class = &block_class; | 1271 | disk_to_dev(disk)->class = &block_class; |
1194 | disk_to_dev(disk)->type = &disk_type; | 1272 | disk_to_dev(disk)->type = &disk_type; |
1195 | device_initialize(disk_to_dev(disk)); | 1273 | device_initialize(disk_to_dev(disk)); |
1196 | INIT_WORK(&disk->async_notify, | ||
1197 | media_change_notify_thread); | ||
1198 | } | 1274 | } |
1199 | return disk; | 1275 | return disk; |
1200 | } | 1276 | } |
@@ -1279,10 +1355,444 @@ int invalidate_partition(struct gendisk *disk, int partno) | |||
1279 | struct block_device *bdev = bdget_disk(disk, partno); | 1355 | struct block_device *bdev = bdget_disk(disk, partno); |
1280 | if (bdev) { | 1356 | if (bdev) { |
1281 | fsync_bdev(bdev); | 1357 | fsync_bdev(bdev); |
1282 | res = __invalidate_device(bdev); | 1358 | res = __invalidate_device(bdev, true); |
1283 | bdput(bdev); | 1359 | bdput(bdev); |
1284 | } | 1360 | } |
1285 | return res; | 1361 | return res; |
1286 | } | 1362 | } |
1287 | 1363 | ||
1288 | EXPORT_SYMBOL(invalidate_partition); | 1364 | EXPORT_SYMBOL(invalidate_partition); |
1365 | |||
1366 | /* | ||
1367 | * Disk events - monitor disk events like media change and eject request. | ||
1368 | */ | ||
1369 | struct disk_events { | ||
1370 | struct list_head node; /* all disk_event's */ | ||
1371 | struct gendisk *disk; /* the associated disk */ | ||
1372 | spinlock_t lock; | ||
1373 | |||
1374 | struct mutex block_mutex; /* protects blocking */ | ||
1375 | int block; /* event blocking depth */ | ||
1376 | unsigned int pending; /* events already sent out */ | ||
1377 | unsigned int clearing; /* events being cleared */ | ||
1378 | |||
1379 | long poll_msecs; /* interval, -1 for default */ | ||
1380 | struct delayed_work dwork; | ||
1381 | }; | ||
1382 | |||
1383 | static const char *disk_events_strs[] = { | ||
1384 | [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "media_change", | ||
1385 | [ilog2(DISK_EVENT_EJECT_REQUEST)] = "eject_request", | ||
1386 | }; | ||
1387 | |||
1388 | static char *disk_uevents[] = { | ||
1389 | [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "DISK_MEDIA_CHANGE=1", | ||
1390 | [ilog2(DISK_EVENT_EJECT_REQUEST)] = "DISK_EJECT_REQUEST=1", | ||
1391 | }; | ||
1392 | |||
1393 | /* list of all disk_events */ | ||
1394 | static DEFINE_MUTEX(disk_events_mutex); | ||
1395 | static LIST_HEAD(disk_events); | ||
1396 | |||
1397 | /* disable in-kernel polling by default */ | ||
1398 | static unsigned long disk_events_dfl_poll_msecs = 0; | ||
1399 | |||
1400 | static unsigned long disk_events_poll_jiffies(struct gendisk *disk) | ||
1401 | { | ||
1402 | struct disk_events *ev = disk->ev; | ||
1403 | long intv_msecs = 0; | ||
1404 | |||
1405 | /* | ||
1406 | * If device-specific poll interval is set, always use it. If | ||
1407 | * the default is being used, poll iff there are events which | ||
1408 | * can't be monitored asynchronously. | ||
1409 | */ | ||
1410 | if (ev->poll_msecs >= 0) | ||
1411 | intv_msecs = ev->poll_msecs; | ||
1412 | else if (disk->events & ~disk->async_events) | ||
1413 | intv_msecs = disk_events_dfl_poll_msecs; | ||
1414 | |||
1415 | return msecs_to_jiffies(intv_msecs); | ||
1416 | } | ||
1417 | |||
1418 | /** | ||
1419 | * disk_block_events - block and flush disk event checking | ||
1420 | * @disk: disk to block events for | ||
1421 | * | ||
1422 | * On return from this function, it is guaranteed that event checking | ||
1423 | * isn't in progress and won't happen until unblocked by | ||
1424 | * disk_unblock_events(). Events blocking is counted and the actual | ||
1425 | * unblocking happens after the matching number of unblocks are done. | ||
1426 | * | ||
1427 | * Note that this intentionally does not block event checking from | ||
1428 | * disk_clear_events(). | ||
1429 | * | ||
1430 | * CONTEXT: | ||
1431 | * Might sleep. | ||
1432 | */ | ||
1433 | void disk_block_events(struct gendisk *disk) | ||
1434 | { | ||
1435 | struct disk_events *ev = disk->ev; | ||
1436 | unsigned long flags; | ||
1437 | bool cancel; | ||
1438 | |||
1439 | if (!ev) | ||
1440 | return; | ||
1441 | |||
1442 | /* | ||
1443 | * Outer mutex ensures that the first blocker completes canceling | ||
1444 | * the event work before further blockers are allowed to finish. | ||
1445 | */ | ||
1446 | mutex_lock(&ev->block_mutex); | ||
1447 | |||
1448 | spin_lock_irqsave(&ev->lock, flags); | ||
1449 | cancel = !ev->block++; | ||
1450 | spin_unlock_irqrestore(&ev->lock, flags); | ||
1451 | |||
1452 | if (cancel) | ||
1453 | cancel_delayed_work_sync(&disk->ev->dwork); | ||
1454 | |||
1455 | mutex_unlock(&ev->block_mutex); | ||
1456 | } | ||
1457 | |||
1458 | static void __disk_unblock_events(struct gendisk *disk, bool check_now) | ||
1459 | { | ||
1460 | struct disk_events *ev = disk->ev; | ||
1461 | unsigned long intv; | ||
1462 | unsigned long flags; | ||
1463 | |||
1464 | spin_lock_irqsave(&ev->lock, flags); | ||
1465 | |||
1466 | if (WARN_ON_ONCE(ev->block <= 0)) | ||
1467 | goto out_unlock; | ||
1468 | |||
1469 | if (--ev->block) | ||
1470 | goto out_unlock; | ||
1471 | |||
1472 | /* | ||
1473 | * Not exactly a latency critical operation, set poll timer | ||
1474 | * slack to 25% and kick event check. | ||
1475 | */ | ||
1476 | intv = disk_events_poll_jiffies(disk); | ||
1477 | set_timer_slack(&ev->dwork.timer, intv / 4); | ||
1478 | if (check_now) | ||
1479 | queue_delayed_work(system_nrt_wq, &ev->dwork, 0); | ||
1480 | else if (intv) | ||
1481 | queue_delayed_work(system_nrt_wq, &ev->dwork, intv); | ||
1482 | out_unlock: | ||
1483 | spin_unlock_irqrestore(&ev->lock, flags); | ||
1484 | } | ||
1485 | |||
1486 | /** | ||
1487 | * disk_unblock_events - unblock disk event checking | ||
1488 | * @disk: disk to unblock events for | ||
1489 | * | ||
1490 | * Undo disk_block_events(). When the block count reaches zero, it | ||
1491 | * starts events polling if configured. | ||
1492 | * | ||
1493 | * CONTEXT: | ||
1494 | * Don't care. Safe to call from irq context. | ||
1495 | */ | ||
1496 | void disk_unblock_events(struct gendisk *disk) | ||
1497 | { | ||
1498 | if (disk->ev) | ||
1499 | __disk_unblock_events(disk, false); | ||
1500 | } | ||
1501 | |||
1502 | /** | ||
1503 | * disk_check_events - schedule immediate event checking | ||
1504 | * @disk: disk to check events for | ||
1505 | * | ||
1506 | * Schedule immediate event checking on @disk if not blocked. | ||
1507 | * | ||
1508 | * CONTEXT: | ||
1509 | * Don't care. Safe to call from irq context. | ||
1510 | */ | ||
1511 | void disk_check_events(struct gendisk *disk) | ||
1512 | { | ||
1513 | struct disk_events *ev = disk->ev; | ||
1514 | unsigned long flags; | ||
1515 | |||
1516 | if (!ev) | ||
1517 | return; | ||
1518 | |||
1519 | spin_lock_irqsave(&ev->lock, flags); | ||
1520 | if (!ev->block) { | ||
1521 | cancel_delayed_work(&ev->dwork); | ||
1522 | queue_delayed_work(system_nrt_wq, &ev->dwork, 0); | ||
1523 | } | ||
1524 | spin_unlock_irqrestore(&ev->lock, flags); | ||
1525 | } | ||
1526 | EXPORT_SYMBOL_GPL(disk_check_events); | ||
1527 | |||
1528 | /** | ||
1529 | * disk_clear_events - synchronously check, clear and return pending events | ||
1530 | * @disk: disk to fetch and clear events from | ||
1531 | * @mask: mask of events to be fetched and cleared | ||
1532 | * | ||
1533 | * Disk events are synchronously checked and pending events in @mask | ||
1534 | * are cleared and returned. This ignores the block count. | ||
1535 | * | ||
1536 | * CONTEXT: | ||
1537 | * Might sleep. | ||
1538 | */ | ||
1539 | unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) | ||
1540 | { | ||
1541 | const struct block_device_operations *bdops = disk->fops; | ||
1542 | struct disk_events *ev = disk->ev; | ||
1543 | unsigned int pending; | ||
1544 | |||
1545 | if (!ev) { | ||
1546 | /* for drivers still using the old ->media_changed method */ | ||
1547 | if ((mask & DISK_EVENT_MEDIA_CHANGE) && | ||
1548 | bdops->media_changed && bdops->media_changed(disk)) | ||
1549 | return DISK_EVENT_MEDIA_CHANGE; | ||
1550 | return 0; | ||
1551 | } | ||
1552 | |||
1553 | /* tell the workfn about the events being cleared */ | ||
1554 | spin_lock_irq(&ev->lock); | ||
1555 | ev->clearing |= mask; | ||
1556 | spin_unlock_irq(&ev->lock); | ||
1557 | |||
1558 | /* unconditionally schedule event check and wait for it to finish */ | ||
1559 | disk_block_events(disk); | ||
1560 | queue_delayed_work(system_nrt_wq, &ev->dwork, 0); | ||
1561 | flush_delayed_work(&ev->dwork); | ||
1562 | __disk_unblock_events(disk, false); | ||
1563 | |||
1564 | /* then, fetch and clear pending events */ | ||
1565 | spin_lock_irq(&ev->lock); | ||
1566 | WARN_ON_ONCE(ev->clearing & mask); /* cleared by workfn */ | ||
1567 | pending = ev->pending & mask; | ||
1568 | ev->pending &= ~mask; | ||
1569 | spin_unlock_irq(&ev->lock); | ||
1570 | |||
1571 | return pending; | ||
1572 | } | ||
1573 | |||
1574 | static void disk_events_workfn(struct work_struct *work) | ||
1575 | { | ||
1576 | struct delayed_work *dwork = to_delayed_work(work); | ||
1577 | struct disk_events *ev = container_of(dwork, struct disk_events, dwork); | ||
1578 | struct gendisk *disk = ev->disk; | ||
1579 | char *envp[ARRAY_SIZE(disk_uevents) + 1] = { }; | ||
1580 | unsigned int clearing = ev->clearing; | ||
1581 | unsigned int events; | ||
1582 | unsigned long intv; | ||
1583 | int nr_events = 0, i; | ||
1584 | |||
1585 | /* check events */ | ||
1586 | events = disk->fops->check_events(disk, clearing); | ||
1587 | |||
1588 | /* accumulate pending events and schedule next poll if necessary */ | ||
1589 | spin_lock_irq(&ev->lock); | ||
1590 | |||
1591 | events &= ~ev->pending; | ||
1592 | ev->pending |= events; | ||
1593 | ev->clearing &= ~clearing; | ||
1594 | |||
1595 | intv = disk_events_poll_jiffies(disk); | ||
1596 | if (!ev->block && intv) | ||
1597 | queue_delayed_work(system_nrt_wq, &ev->dwork, intv); | ||
1598 | |||
1599 | spin_unlock_irq(&ev->lock); | ||
1600 | |||
1601 | /* | ||
1602 | * Tell userland about new events. Only the events listed in | ||
1603 | * @disk->events are reported. Unlisted events are processed the | ||
1604 | * same internally but never get reported to userland. | ||
1605 | */ | ||
1606 | for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) | ||
1607 | if (events & disk->events & (1 << i)) | ||
1608 | envp[nr_events++] = disk_uevents[i]; | ||
1609 | |||
1610 | if (nr_events) | ||
1611 | kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); | ||
1612 | } | ||
1613 | |||
1614 | /* | ||
1615 | * A disk events enabled device has the following sysfs nodes under | ||
1616 | * its /sys/block/X/ directory. | ||
1617 | * | ||
1618 | * events : list of all supported events | ||
1619 | * events_async : list of events which can be detected w/o polling | ||
1620 | * events_poll_msecs : polling interval, 0: disable, -1: system default | ||
1621 | */ | ||
1622 | static ssize_t __disk_events_show(unsigned int events, char *buf) | ||
1623 | { | ||
1624 | const char *delim = ""; | ||
1625 | ssize_t pos = 0; | ||
1626 | int i; | ||
1627 | |||
1628 | for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++) | ||
1629 | if (events & (1 << i)) { | ||
1630 | pos += sprintf(buf + pos, "%s%s", | ||
1631 | delim, disk_events_strs[i]); | ||
1632 | delim = " "; | ||
1633 | } | ||
1634 | if (pos) | ||
1635 | pos += sprintf(buf + pos, "\n"); | ||
1636 | return pos; | ||
1637 | } | ||
1638 | |||
1639 | static ssize_t disk_events_show(struct device *dev, | ||
1640 | struct device_attribute *attr, char *buf) | ||
1641 | { | ||
1642 | struct gendisk *disk = dev_to_disk(dev); | ||
1643 | |||
1644 | return __disk_events_show(disk->events, buf); | ||
1645 | } | ||
1646 | |||
1647 | static ssize_t disk_events_async_show(struct device *dev, | ||
1648 | struct device_attribute *attr, char *buf) | ||
1649 | { | ||
1650 | struct gendisk *disk = dev_to_disk(dev); | ||
1651 | |||
1652 | return __disk_events_show(disk->async_events, buf); | ||
1653 | } | ||
1654 | |||
1655 | static ssize_t disk_events_poll_msecs_show(struct device *dev, | ||
1656 | struct device_attribute *attr, | ||
1657 | char *buf) | ||
1658 | { | ||
1659 | struct gendisk *disk = dev_to_disk(dev); | ||
1660 | |||
1661 | return sprintf(buf, "%ld\n", disk->ev->poll_msecs); | ||
1662 | } | ||
1663 | |||
1664 | static ssize_t disk_events_poll_msecs_store(struct device *dev, | ||
1665 | struct device_attribute *attr, | ||
1666 | const char *buf, size_t count) | ||
1667 | { | ||
1668 | struct gendisk *disk = dev_to_disk(dev); | ||
1669 | long intv; | ||
1670 | |||
1671 | if (!count || !sscanf(buf, "%ld", &intv)) | ||
1672 | return -EINVAL; | ||
1673 | |||
1674 | if (intv < 0 && intv != -1) | ||
1675 | return -EINVAL; | ||
1676 | |||
1677 | disk_block_events(disk); | ||
1678 | disk->ev->poll_msecs = intv; | ||
1679 | __disk_unblock_events(disk, true); | ||
1680 | |||
1681 | return count; | ||
1682 | } | ||
1683 | |||
1684 | static const DEVICE_ATTR(events, S_IRUGO, disk_events_show, NULL); | ||
1685 | static const DEVICE_ATTR(events_async, S_IRUGO, disk_events_async_show, NULL); | ||
1686 | static const DEVICE_ATTR(events_poll_msecs, S_IRUGO|S_IWUSR, | ||
1687 | disk_events_poll_msecs_show, | ||
1688 | disk_events_poll_msecs_store); | ||
1689 | |||
1690 | static const struct attribute *disk_events_attrs[] = { | ||
1691 | &dev_attr_events.attr, | ||
1692 | &dev_attr_events_async.attr, | ||
1693 | &dev_attr_events_poll_msecs.attr, | ||
1694 | NULL, | ||
1695 | }; | ||
1696 | |||
1697 | /* | ||
1698 | * The default polling interval can be specified by the kernel | ||
1699 | * parameter block.events_dfl_poll_msecs which defaults to 0 | ||
1700 | * (disable). This can also be modified at runtime by writing to | ||
1701 | * /sys/module/block/events_dfl_poll_msecs. | ||
1702 | */ | ||
1703 | static int disk_events_set_dfl_poll_msecs(const char *val, | ||
1704 | const struct kernel_param *kp) | ||
1705 | { | ||
1706 | struct disk_events *ev; | ||
1707 | int ret; | ||
1708 | |||
1709 | ret = param_set_ulong(val, kp); | ||
1710 | if (ret < 0) | ||
1711 | return ret; | ||
1712 | |||
1713 | mutex_lock(&disk_events_mutex); | ||
1714 | |||
1715 | list_for_each_entry(ev, &disk_events, node) | ||
1716 | disk_check_events(ev->disk); | ||
1717 | |||
1718 | mutex_unlock(&disk_events_mutex); | ||
1719 | |||
1720 | return 0; | ||
1721 | } | ||
1722 | |||
1723 | static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = { | ||
1724 | .set = disk_events_set_dfl_poll_msecs, | ||
1725 | .get = param_get_ulong, | ||
1726 | }; | ||
1727 | |||
1728 | #undef MODULE_PARAM_PREFIX | ||
1729 | #define MODULE_PARAM_PREFIX "block." | ||
1730 | |||
1731 | module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops, | ||
1732 | &disk_events_dfl_poll_msecs, 0644); | ||
1733 | |||
1734 | /* | ||
1735 | * disk_{add|del|release}_events - initialize and destroy disk_events. | ||
1736 | */ | ||
1737 | static void disk_add_events(struct gendisk *disk) | ||
1738 | { | ||
1739 | struct disk_events *ev; | ||
1740 | |||
1741 | if (!disk->fops->check_events) | ||
1742 | return; | ||
1743 | |||
1744 | ev = kzalloc(sizeof(*ev), GFP_KERNEL); | ||
1745 | if (!ev) { | ||
1746 | pr_warn("%s: failed to initialize events\n", disk->disk_name); | ||
1747 | return; | ||
1748 | } | ||
1749 | |||
1750 | if (sysfs_create_files(&disk_to_dev(disk)->kobj, | ||
1751 | disk_events_attrs) < 0) { | ||
1752 | pr_warn("%s: failed to create sysfs files for events\n", | ||
1753 | disk->disk_name); | ||
1754 | kfree(ev); | ||
1755 | return; | ||
1756 | } | ||
1757 | |||
1758 | disk->ev = ev; | ||
1759 | |||
1760 | INIT_LIST_HEAD(&ev->node); | ||
1761 | ev->disk = disk; | ||
1762 | spin_lock_init(&ev->lock); | ||
1763 | mutex_init(&ev->block_mutex); | ||
1764 | ev->block = 1; | ||
1765 | ev->poll_msecs = -1; | ||
1766 | INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn); | ||
1767 | |||
1768 | mutex_lock(&disk_events_mutex); | ||
1769 | list_add_tail(&ev->node, &disk_events); | ||
1770 | mutex_unlock(&disk_events_mutex); | ||
1771 | |||
1772 | /* | ||
1773 | * Block count is initialized to 1 and the following initial | ||
1774 | * unblock kicks it into action. | ||
1775 | */ | ||
1776 | __disk_unblock_events(disk, true); | ||
1777 | } | ||
1778 | |||
1779 | static void disk_del_events(struct gendisk *disk) | ||
1780 | { | ||
1781 | if (!disk->ev) | ||
1782 | return; | ||
1783 | |||
1784 | disk_block_events(disk); | ||
1785 | |||
1786 | mutex_lock(&disk_events_mutex); | ||
1787 | list_del_init(&disk->ev->node); | ||
1788 | mutex_unlock(&disk_events_mutex); | ||
1789 | |||
1790 | sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs); | ||
1791 | } | ||
1792 | |||
1793 | static void disk_release_events(struct gendisk *disk) | ||
1794 | { | ||
1795 | /* the block count should be 1 from disk_del_events() */ | ||
1796 | WARN_ON_ONCE(disk->ev && disk->ev->block != 1); | ||
1797 | kfree(disk->ev); | ||
1798 | } | ||
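The disk-events code added above replaces the old media_change_notify machinery: a driver advertises the events it supports in gendisk->events, implements the check_events() hook in its block_device_operations, and genhd polls (when a poll interval is configured) and emits DISK_MEDIA_CHANGE / DISK_EJECT_REQUEST uevents. A hedged sketch of such a hook; mydrv_media_present() is a hypothetical driver-specific helper, not a real function:

	#include <linux/genhd.h>
	#include <linux/blkdev.h>

	/* Report a media change through the new events framework instead of the
	 * legacy ->media_changed() callback.  @clearing tells the driver which
	 * events user space is about to consume. */
	static unsigned int mydrv_check_events(struct gendisk *disk,
					       unsigned int clearing)
	{
		if (!mydrv_media_present(disk))	/* hypothetical helper */
			return DISK_EVENT_MEDIA_CHANGE;
		return 0;
	}

At probe time the driver would also set disk->events = DISK_EVENT_MEDIA_CHANGE and point .check_events in its struct block_device_operations at this hook, so disk_add_events() picks it up when add_disk() runs.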
diff --git a/block/ioctl.c b/block/ioctl.c index d8052f0dabd3..1124cd297263 100644 --- a/block/ioctl.c +++ b/block/ioctl.c | |||
@@ -5,7 +5,6 @@ | |||
5 | #include <linux/hdreg.h> | 5 | #include <linux/hdreg.h> |
6 | #include <linux/backing-dev.h> | 6 | #include <linux/backing-dev.h> |
7 | #include <linux/buffer_head.h> | 7 | #include <linux/buffer_head.h> |
8 | #include <linux/smp_lock.h> | ||
9 | #include <linux/blktrace_api.h> | 8 | #include <linux/blktrace_api.h> |
10 | #include <asm/uaccess.h> | 9 | #include <asm/uaccess.h> |
11 | 10 | ||
@@ -62,7 +61,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user | |||
62 | 61 | ||
63 | /* all seems OK */ | 62 | /* all seems OK */ |
64 | part = add_partition(disk, partno, start, length, | 63 | part = add_partition(disk, partno, start, length, |
65 | ADDPART_FLAG_NONE); | 64 | ADDPART_FLAG_NONE, NULL); |
66 | mutex_unlock(&bdev->bd_mutex); | 65 | mutex_unlock(&bdev->bd_mutex); |
67 | return IS_ERR(part) ? PTR_ERR(part) : 0; | 66 | return IS_ERR(part) ? PTR_ERR(part) : 0; |
68 | case BLKPG_DEL_PARTITION: | 67 | case BLKPG_DEL_PARTITION: |
@@ -116,7 +115,7 @@ static int blkdev_reread_part(struct block_device *bdev) | |||
116 | static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, | 115 | static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, |
117 | uint64_t len, int secure) | 116 | uint64_t len, int secure) |
118 | { | 117 | { |
119 | unsigned long flags = BLKDEV_IFL_WAIT; | 118 | unsigned long flags = 0; |
120 | 119 | ||
121 | if (start & 511) | 120 | if (start & 511) |
122 | return -EINVAL; | 121 | return -EINVAL; |
@@ -125,10 +124,10 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, | |||
125 | start >>= 9; | 124 | start >>= 9; |
126 | len >>= 9; | 125 | len >>= 9; |
127 | 126 | ||
128 | if (start + len > (bdev->bd_inode->i_size >> 9)) | 127 | if (start + len > (i_size_read(bdev->bd_inode) >> 9)) |
129 | return -EINVAL; | 128 | return -EINVAL; |
130 | if (secure) | 129 | if (secure) |
131 | flags |= BLKDEV_IFL_SECURE; | 130 | flags |= BLKDEV_DISCARD_SECURE; |
132 | return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags); | 131 | return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags); |
133 | } | 132 | } |
134 | 133 | ||
@@ -242,6 +241,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, | |||
242 | * We need to set the startsect first, the driver may | 241 | * We need to set the startsect first, the driver may |
243 | * want to override it. | 242 | * want to override it. |
244 | */ | 243 | */ |
244 | memset(&geo, 0, sizeof(geo)); | ||
245 | geo.start = get_start_sect(bdev); | 245 | geo.start = get_start_sect(bdev); |
246 | ret = disk->fops->getgeo(bdev, &geo); | 246 | ret = disk->fops->getgeo(bdev, &geo); |
247 | if (ret) | 247 | if (ret) |
@@ -294,11 +294,14 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, | |||
294 | return -EINVAL; | 294 | return -EINVAL; |
295 | if (get_user(n, (int __user *) arg)) | 295 | if (get_user(n, (int __user *) arg)) |
296 | return -EFAULT; | 296 | return -EFAULT; |
297 | if (!(mode & FMODE_EXCL) && bd_claim(bdev, &bdev) < 0) | 297 | if (!(mode & FMODE_EXCL)) { |
298 | return -EBUSY; | 298 | bdgrab(bdev); |
299 | if (blkdev_get(bdev, mode | FMODE_EXCL, &bdev) < 0) | ||
300 | return -EBUSY; | ||
301 | } | ||
299 | ret = set_blocksize(bdev, n); | 302 | ret = set_blocksize(bdev, n); |
300 | if (!(mode & FMODE_EXCL)) | 303 | if (!(mode & FMODE_EXCL)) |
301 | bd_release(bdev); | 304 | blkdev_put(bdev, mode | FMODE_EXCL); |
302 | return ret; | 305 | return ret; |
303 | case BLKPG: | 306 | case BLKPG: |
304 | ret = blkpg_ioctl(bdev, (struct blkpg_ioctl_arg __user *) arg); | 307 | ret = blkpg_ioctl(bdev, (struct blkpg_ioctl_arg __user *) arg); |
@@ -307,12 +310,12 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, | |||
307 | ret = blkdev_reread_part(bdev); | 310 | ret = blkdev_reread_part(bdev); |
308 | break; | 311 | break; |
309 | case BLKGETSIZE: | 312 | case BLKGETSIZE: |
310 | size = bdev->bd_inode->i_size; | 313 | size = i_size_read(bdev->bd_inode); |
311 | if ((size >> 9) > ~0UL) | 314 | if ((size >> 9) > ~0UL) |
312 | return -EFBIG; | 315 | return -EFBIG; |
313 | return put_ulong(arg, size >> 9); | 316 | return put_ulong(arg, size >> 9); |
314 | case BLKGETSIZE64: | 317 | case BLKGETSIZE64: |
315 | return put_u64(arg, bdev->bd_inode->i_size); | 318 | return put_u64(arg, i_size_read(bdev->bd_inode)); |
316 | case BLKTRACESTART: | 319 | case BLKTRACESTART: |
317 | case BLKTRACESTOP: | 320 | case BLKTRACESTOP: |
318 | case BLKTRACESETUP: | 321 | case BLKTRACESETUP: |
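With BLKDEV_IFL_WAIT gone, blk_ioctl_discard() above starts from flags = 0 (the call always waits for completion) and requests secure erase via the new BLKDEV_DISCARD_SECURE bit. A small sketch of a kernel-side caller under those assumptions; example_discard_range is an illustrative name, and start/nr_sects are 512-byte sectors:

	#include <linux/blkdev.h>

	/* Discard a sector range, optionally asking the device for secure erase. */
	static int example_discard_range(struct block_device *bdev, sector_t start,
					 sector_t nr_sects, bool secure)
	{
		unsigned long flags = secure ? BLKDEV_DISCARD_SECURE : 0;

		return blkdev_issue_discard(bdev, start, nr_sects, GFP_KERNEL,
					    flags);
	}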
diff --git a/block/noop-iosched.c b/block/noop-iosched.c index 232c4b38cd37..06389e9ef96d 100644 --- a/block/noop-iosched.c +++ b/block/noop-iosched.c | |||
@@ -39,13 +39,6 @@ static void noop_add_request(struct request_queue *q, struct request *rq) | |||
39 | list_add_tail(&rq->queuelist, &nd->queue); | 39 | list_add_tail(&rq->queuelist, &nd->queue); |
40 | } | 40 | } |
41 | 41 | ||
42 | static int noop_queue_empty(struct request_queue *q) | ||
43 | { | ||
44 | struct noop_data *nd = q->elevator->elevator_data; | ||
45 | |||
46 | return list_empty(&nd->queue); | ||
47 | } | ||
48 | |||
49 | static struct request * | 42 | static struct request * |
50 | noop_former_request(struct request_queue *q, struct request *rq) | 43 | noop_former_request(struct request_queue *q, struct request *rq) |
51 | { | 44 | { |
@@ -90,7 +83,6 @@ static struct elevator_type elevator_noop = { | |||
90 | .elevator_merge_req_fn = noop_merged_requests, | 83 | .elevator_merge_req_fn = noop_merged_requests, |
91 | .elevator_dispatch_fn = noop_dispatch, | 84 | .elevator_dispatch_fn = noop_dispatch, |
92 | .elevator_add_req_fn = noop_add_request, | 85 | .elevator_add_req_fn = noop_add_request, |
93 | .elevator_queue_empty_fn = noop_queue_empty, | ||
94 | .elevator_former_req_fn = noop_former_request, | 86 | .elevator_former_req_fn = noop_former_request, |
95 | .elevator_latter_req_fn = noop_latter_request, | 87 | .elevator_latter_req_fn = noop_latter_request, |
96 | .elevator_init_fn = noop_init_queue, | 88 | .elevator_init_fn = noop_init_queue, |
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index a8b5a10eb5b0..4f4230b79bb6 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c | |||
@@ -321,33 +321,47 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, | |||
321 | if (hdr->iovec_count) { | 321 | if (hdr->iovec_count) { |
322 | const int size = sizeof(struct sg_iovec) * hdr->iovec_count; | 322 | const int size = sizeof(struct sg_iovec) * hdr->iovec_count; |
323 | size_t iov_data_len; | 323 | size_t iov_data_len; |
324 | struct sg_iovec *iov; | 324 | struct sg_iovec *sg_iov; |
325 | struct iovec *iov; | ||
326 | int i; | ||
325 | 327 | ||
326 | iov = kmalloc(size, GFP_KERNEL); | 328 | sg_iov = kmalloc(size, GFP_KERNEL); |
327 | if (!iov) { | 329 | if (!sg_iov) { |
328 | ret = -ENOMEM; | 330 | ret = -ENOMEM; |
329 | goto out; | 331 | goto out; |
330 | } | 332 | } |
331 | 333 | ||
332 | if (copy_from_user(iov, hdr->dxferp, size)) { | 334 | if (copy_from_user(sg_iov, hdr->dxferp, size)) { |
333 | kfree(iov); | 335 | kfree(sg_iov); |
334 | ret = -EFAULT; | 336 | ret = -EFAULT; |
335 | goto out; | 337 | goto out; |
336 | } | 338 | } |
337 | 339 | ||
340 | /* | ||
341 | * Sum up the vecs, making sure they don't overflow | ||
342 | */ | ||
343 | iov = (struct iovec *) sg_iov; | ||
344 | iov_data_len = 0; | ||
345 | for (i = 0; i < hdr->iovec_count; i++) { | ||
346 | if (iov_data_len + iov[i].iov_len < iov_data_len) { | ||
347 | kfree(sg_iov); | ||
348 | ret = -EINVAL; | ||
349 | goto out; | ||
350 | } | ||
351 | iov_data_len += iov[i].iov_len; | ||
352 | } | ||
353 | |||
338 | /* SG_IO howto says that the shorter of the two wins */ | 354 | /* SG_IO howto says that the shorter of the two wins */ |
339 | iov_data_len = iov_length((struct iovec *)iov, | ||
340 | hdr->iovec_count); | ||
341 | if (hdr->dxfer_len < iov_data_len) { | 355 | if (hdr->dxfer_len < iov_data_len) { |
342 | hdr->iovec_count = iov_shorten((struct iovec *)iov, | 356 | hdr->iovec_count = iov_shorten(iov, |
343 | hdr->iovec_count, | 357 | hdr->iovec_count, |
344 | hdr->dxfer_len); | 358 | hdr->dxfer_len); |
345 | iov_data_len = hdr->dxfer_len; | 359 | iov_data_len = hdr->dxfer_len; |
346 | } | 360 | } |
347 | 361 | ||
348 | ret = blk_rq_map_user_iov(q, rq, NULL, iov, hdr->iovec_count, | 362 | ret = blk_rq_map_user_iov(q, rq, NULL, sg_iov, hdr->iovec_count, |
349 | iov_data_len, GFP_KERNEL); | 363 | iov_data_len, GFP_KERNEL); |
350 | kfree(iov); | 364 | kfree(sg_iov); |
351 | } else if (hdr->dxfer_len) | 365 | } else if (hdr->dxfer_len) |
352 | ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len, | 366 | ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len, |
353 | GFP_KERNEL); | 367 | GFP_KERNEL); |
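The scsi_ioctl.c hunk above replaces the unchecked iov_length() call with an explicit loop that rejects iovec arrays whose summed lengths wrap around. The same check in isolation, as a minimal plain-C sketch outside the kernel; sum_iov_len is a hypothetical helper name:

	#include <stddef.h>
	#include <sys/uio.h>

	/* Sum the iovec lengths, returning -1 if the total would overflow size_t. */
	static int sum_iov_len(const struct iovec *iov, int count, size_t *total)
	{
		size_t len = 0;
		int i;

		for (i = 0; i < count; i++) {
			if (len + iov[i].iov_len < len)
				return -1;	/* unsigned wrap-around: overflow */
			len += iov[i].iov_len;
		}
		*total = len;
		return 0;
	}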