-rw-r--r--  Documentation/cgroups/blkio-controller.txt | 151
-rw-r--r--  block/Kconfig | 23
-rw-r--r--  block/Kconfig.iosched | 16
-rw-r--r--  block/Makefile | 2
-rw-r--r--  block/blk-barrier.c | 147
-rw-r--r--  block/blk-cgroup.c | 727
-rw-r--r--  block/blk-cgroup.h | 178
-rw-r--r--  block/blk-core.c | 13
-rw-r--r--  block/blk-lib.c | 233
-rw-r--r--  block/cfq-iosched.c | 80
-rw-r--r--  block/elevator.c | 9
-rw-r--r--  block/genhd.c | 1
-rw-r--r--  block/ioctl.c | 2
-rw-r--r--  drivers/block/drbd/drbd_int.h | 3
-rw-r--r--  drivers/block/drbd/drbd_receiver.c | 3
-rw-r--r--  fs/block_dev.c | 257
-rw-r--r--  fs/btrfs/extent-tree.c | 2
-rw-r--r--  fs/ext3/fsync.c | 3
-rw-r--r--  fs/ext4/fsync.c | 6
-rw-r--r--  fs/gfs2/rgrp.c | 5
-rw-r--r--  fs/jbd2/checkpoint.c | 3
-rw-r--r--  fs/jbd2/commit.c | 6
-rw-r--r--  fs/nilfs2/the_nilfs.c | 4
-rw-r--r--  fs/reiserfs/file.c | 3
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 3
-rw-r--r--  include/linux/backing-dev.h | 3
-rw-r--r--  include/linux/blkdev.h | 62
-rw-r--r--  include/linux/elevator.h | 6
-rw-r--r--  include/linux/fs.h | 1
-rw-r--r--  include/linux/writeback.h | 4
-rw-r--r--  init/Kconfig | 27
-rw-r--r--  kernel/sched_clock.c | 1
-rw-r--r--  mm/page-writeback.c | 39
-rw-r--r--  mm/swapfile.c | 9
34 files changed, 1699 insertions(+), 333 deletions(-)
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index 630879cd9a42..48e0b21b0059 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -17,6 +17,9 @@ HOWTO
 You can do a very simple testing of running two dd threads in two different
 cgroups. Here is what you can do.
 
+- Enable Block IO controller
+	CONFIG_BLK_CGROUP=y
+
 - Enable group scheduling in CFQ
 	CONFIG_CFQ_GROUP_IOSCHED=y
 
@@ -54,32 +57,52 @@ cgroups. Here is what you can do.
 
 Various user visible config options
 ===================================
-CONFIG_CFQ_GROUP_IOSCHED
-	- Enables group scheduling in CFQ. Currently only 1 level of group
-	  creation is allowed.
-
-CONFIG_DEBUG_CFQ_IOSCHED
-	- Enables some debugging messages in blktrace. Also creates extra
-	  cgroup file blkio.dequeue.
-
-Config options selected automatically
-=====================================
-These config options are not user visible and are selected/deselected
-automatically based on IO scheduler configuration.
-
 CONFIG_BLK_CGROUP
-	- Block IO controller. Selected by CONFIG_CFQ_GROUP_IOSCHED.
+	- Block IO controller.
 
 CONFIG_DEBUG_BLK_CGROUP
-	- Debug help. Selected by CONFIG_DEBUG_CFQ_IOSCHED.
+	- Debug help. Right now some additional stats files show up in the
+	  cgroup if this option is enabled.
+
+CONFIG_CFQ_GROUP_IOSCHED
+	- Enables group scheduling in CFQ. Currently only 1 level of group
+	  creation is allowed.
 
 Details of cgroup files
 =======================
 - blkio.weight
-	- Specifies per cgroup weight.
-
+	- Specifies per cgroup weight. This is the default weight of the group
+	  on all the devices until and unless overridden by a per device rule.
+	  (See blkio.weight_device).
 	  Currently allowed range of weights is from 100 to 1000.
 
+- blkio.weight_device
+	- One can specify per cgroup per device rules using this interface.
+	  These rules override the default value of group weight as specified
+	  by blkio.weight.
+
+	  Following is the format.
+
+	  # echo dev_maj:dev_minor weight > /path/to/cgroup/blkio.weight_device
+	  Configure weight=300 on /dev/sdb (8:16) in this cgroup
+	  # echo 8:16 300 > blkio.weight_device
+	  # cat blkio.weight_device
+	  dev	weight
+	  8:16	300
+
+	  Configure weight=500 on /dev/sda (8:0) in this cgroup
+	  # echo 8:0 500 > blkio.weight_device
+	  # cat blkio.weight_device
+	  dev	weight
+	  8:0	500
+	  8:16	300
+
+	  Remove specific weight for /dev/sda in this cgroup
+	  # echo 8:0 0 > blkio.weight_device
+	  # cat blkio.weight_device
+	  dev	weight
+	  8:16	300
+
 - blkio.time
 	- disk time allocated to cgroup per device in milliseconds. First
 	  two fields specify the major and minor number of the device and
@@ -92,13 +115,105 @@ Details of cgroup files
 	  third field specifies the number of sectors transferred by the
 	  group to/from the device.
 
+- blkio.io_service_bytes
+	- Number of bytes transferred to/from the disk by the group. These
+	  are further divided by the type of operation - read or write, sync
+	  or async. First two fields specify the major and minor number of the
+	  device, third field specifies the operation type and the fourth field
+	  specifies the number of bytes.
+
+- blkio.io_serviced
+	- Number of IOs completed to/from the disk by the group. These
+	  are further divided by the type of operation - read or write, sync
+	  or async. First two fields specify the major and minor number of the
+	  device, third field specifies the operation type and the fourth field
+	  specifies the number of IOs.
+
+- blkio.io_service_time
+	- Total amount of time between request dispatch and request completion
+	  for the IOs done by this cgroup. This is in nanoseconds to make it
+	  meaningful for flash devices too. For devices with queue depth of 1,
+	  this time represents the actual service time. When queue_depth > 1,
+	  that is no longer true as requests may be served out of order. This
+	  may cause the service time for a given IO to include the service time
+	  of multiple IOs when served out of order which may result in total
+	  io_service_time > actual time elapsed. This time is further divided by
+	  the type of operation - read or write, sync or async. First two fields
+	  specify the major and minor number of the device, third field
+	  specifies the operation type and the fourth field specifies the
+	  io_service_time in ns.
+
+- blkio.io_wait_time
+	- Total amount of time the IOs for this cgroup spent waiting in the
+	  scheduler queues for service. This can be greater than the total time
+	  elapsed since it is cumulative io_wait_time for all IOs. It is not a
+	  measure of total time the cgroup spent waiting but rather a measure of
+	  the wait_time for its individual IOs. For devices with queue_depth > 1
+	  this metric does not include the time spent waiting for service once
+	  the IO is dispatched to the device but till it actually gets serviced
+	  (there might be a time lag here due to re-ordering of requests by the
+	  device). This is in nanoseconds to make it meaningful for flash
+	  devices too. This time is further divided by the type of operation -
+	  read or write, sync or async. First two fields specify the major and
+	  minor number of the device, third field specifies the operation type
+	  and the fourth field specifies the io_wait_time in ns.
+
+- blkio.io_merged
+	- Total number of bios/requests merged into requests belonging to this
+	  cgroup. This is further divided by the type of operation - read or
+	  write, sync or async.
+
+- blkio.io_queued
+	- Total number of requests queued up at any given instant for this
+	  cgroup. This is further divided by the type of operation - read or
+	  write, sync or async.
+
+- blkio.avg_queue_size
+	- Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
+	  The average queue size for this cgroup over the entire time of this
+	  cgroup's existence. Queue size samples are taken each time one of the
+	  queues of this cgroup gets a timeslice.
+
+- blkio.group_wait_time
+	- Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
+	  This is the amount of time the cgroup had to wait since it became busy
+	  (i.e., went from 0 to 1 request queued) to get a timeslice for one of
+	  its queues. This is different from the io_wait_time which is the
+	  cumulative total of the amount of time spent by each IO in that cgroup
+	  waiting in the scheduler queue. This is in nanoseconds. If this is
+	  read when the cgroup is in a waiting (for timeslice) state, the stat
+	  will only report the group_wait_time accumulated till the last time it
+	  got a timeslice and will not include the current delta.
+
+- blkio.empty_time
+	- Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
+	  This is the amount of time a cgroup spends without any pending
+	  requests when not being served, i.e., it does not include any time
+	  spent idling for one of the queues of the cgroup. This is in
+	  nanoseconds. If this is read when the cgroup is in an empty state,
+	  the stat will only report the empty_time accumulated till the last
+	  time it had a pending request and will not include the current delta.
+
+- blkio.idle_time
+	- Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
+	  This is the amount of time spent by the IO scheduler idling for a
+	  given cgroup in anticipation of a better request than the existing ones
+	  from other queues/cgroups. This is in nanoseconds. If this is read
+	  when the cgroup is in an idling state, the stat will only report the
+	  idle_time accumulated till the last idle period and will not include
+	  the current delta.
+
 - blkio.dequeue
-	- Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. This
+	- Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. This
 	  gives the statistics about how many times a group was dequeued
 	  from service tree of the device. First two fields specify the major
 	  and minor number of the device and third field specifies the number
 	  of times a group was dequeued from a particular device.
 
+- blkio.reset_stats
+	- Writing an int to this file will result in resetting all the stats
+	  for that cgroup.
+
 CFQ sysfs tunable
 =================
 /sys/block/<disk>/queue/iosched/group_isolation
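
The per-type stat files documented above (blkio.io_service_bytes, io_serviced, io_service_time, io_wait_time, io_merged, io_queued) all use the key format produced by blkio_get_key_name()/blkio_get_stat() in the block/blk-cgroup.c hunk further below: "major:minor <Read|Write|Sync|Async|Total> value", plus a cgroup-wide "Total" line. A rough illustration in the same style as the weight_device examples above; the device 8:16 and the counts are made up:

	# cat blkio.io_serviced
	8:16 Read 180
	8:16 Write 35
	8:16 Sync 190
	8:16 Async 25
	8:16 Total 215
	Total 215

	# echo 1 > blkio.reset_stats	(writing any integer clears the stats for this cgroup)
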
diff --git a/block/Kconfig b/block/Kconfig
index f9e89f4d94bb..9be0b56eaee1 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -77,29 +77,6 @@ config BLK_DEV_INTEGRITY
 	T10/SCSI Data Integrity Field or the T13/ATA External Path
 	Protection.  If in doubt, say N.
 
-config BLK_CGROUP
-	tristate "Block cgroup support"
-	depends on CGROUPS
-	depends on CFQ_GROUP_IOSCHED
-	default n
-	---help---
-	Generic block IO controller cgroup interface. This is the common
-	cgroup interface which should be used by various IO controlling
-	policies.
-
-	Currently, CFQ IO scheduler uses it to recognize task groups and
-	control disk bandwidth allocation (proportional time slice allocation)
-	to such task groups.
-
-config DEBUG_BLK_CGROUP
-	bool
-	depends on BLK_CGROUP
-	default n
-	---help---
-	Enable some debugging help. Currently it stores the cgroup path
-	in the blk group which can be used by cfq for tracing various
-	group related activity.
-
 endif # BLOCK
 
 config BLOCK_COMPAT
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index fc71cf071fb2..3199b76f795d 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -23,7 +23,8 @@ config IOSCHED_DEADLINE
 
 config IOSCHED_CFQ
 	tristate "CFQ I/O scheduler"
-	select BLK_CGROUP if CFQ_GROUP_IOSCHED
+	# If BLK_CGROUP is a module, CFQ has to be built as module.
+	depends on (BLK_CGROUP=m && m) || !BLK_CGROUP || BLK_CGROUP=y
 	default y
 	---help---
 	  The CFQ I/O scheduler tries to distribute bandwidth equally
@@ -33,22 +34,15 @@ config IOSCHED_CFQ
 
 	  This is the default I/O scheduler.
 
+	  Note: If BLK_CGROUP=m, then CFQ can be built only as module.
+
 config CFQ_GROUP_IOSCHED
 	bool "CFQ Group Scheduling support"
-	depends on IOSCHED_CFQ && CGROUPS
+	depends on IOSCHED_CFQ && BLK_CGROUP
 	default n
 	---help---
 	  Enable group IO scheduling in CFQ.
 
-config DEBUG_CFQ_IOSCHED
-	bool "Debug CFQ Scheduling"
-	depends on CFQ_GROUP_IOSCHED
-	select DEBUG_BLK_CGROUP
-	default n
-	---help---
-	  Enable CFQ IO scheduling debugging in CFQ. Currently it makes
-	  blktrace output more verbose.
-
 choice
 	prompt "Default I/O scheduler"
 	default DEFAULT_CFQ
diff --git a/block/Makefile b/block/Makefile
index cb2d515ebd6e..0bb499a739cd 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,7 +5,7 @@
 obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
 			blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \
 			blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
-			blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o
+			blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o
 
 obj-$(CONFIG_BLK_DEV_BSG)	+= bsg.o
 obj-$(CONFIG_BLK_CGROUP)	+= blk-cgroup.o
diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 6d88544b677f..0d710c9d403b 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -286,26 +286,31 @@ static void bio_end_empty_barrier(struct bio *bio, int err)
 			set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
 		clear_bit(BIO_UPTODATE, &bio->bi_flags);
 	}
-
+	if (bio->bi_private)
 		complete(bio->bi_private);
+	bio_put(bio);
 }
 
 /**
  * blkdev_issue_flush - queue a flush
  * @bdev:	blockdev to issue flush for
+ * @gfp_mask:	memory allocation flags (for bio_alloc)
  * @error_sector:	error sector
+ * @flags:	BLKDEV_IFL_* flags to control behaviour
  *
  * Description:
  *    Issue a flush for the block device in question. Caller can supply
  *    room for storing the error offset in case of a flush error, if they
- *    wish to.
+ *    wish to. If WAIT flag is not passed then caller may check only what
+ *    request was pushed in some internal queue for later handling.
  */
-int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
+int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
+		sector_t *error_sector, unsigned long flags)
 {
 	DECLARE_COMPLETION_ONSTACK(wait);
 	struct request_queue *q;
 	struct bio *bio;
-	int ret;
+	int ret = 0;
 
 	if (bdev->bd_disk == NULL)
 		return -ENXIO;
@@ -314,23 +319,25 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
 	if (!q)
 		return -ENXIO;
 
-	bio = bio_alloc(GFP_KERNEL, 0);
+	bio = bio_alloc(gfp_mask, 0);
 	bio->bi_end_io = bio_end_empty_barrier;
-	bio->bi_private = &wait;
 	bio->bi_bdev = bdev;
-	submit_bio(WRITE_BARRIER, bio);
-
-	wait_for_completion(&wait);
+	if (test_bit(BLKDEV_WAIT, &flags))
+		bio->bi_private = &wait;
 
-	/*
-	 * The driver must store the error location in ->bi_sector, if
-	 * it supports it. For non-stacked drivers, this should be copied
-	 * from blk_rq_pos(rq).
-	 */
-	if (error_sector)
-		*error_sector = bio->bi_sector;
+	bio_get(bio);
+	submit_bio(WRITE_BARRIER, bio);
+	if (test_bit(BLKDEV_WAIT, &flags)) {
+		wait_for_completion(&wait);
+		/*
+		 * The driver must store the error location in ->bi_sector, if
+		 * it supports it. For non-stacked drivers, this should be
+		 * copied from blk_rq_pos(rq).
+		 */
+		if (error_sector)
+			*error_sector = bio->bi_sector;
+	}
 
-	ret = 0;
 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
 		ret = -EOPNOTSUPP;
 	else if (!bio_flagged(bio, BIO_UPTODATE))
@@ -340,107 +347,3 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
 	return ret;
 }
 EXPORT_SYMBOL(blkdev_issue_flush);
-
-static void blkdev_discard_end_io(struct bio *bio, int err)
-{
-	if (err) {
-		if (err == -EOPNOTSUPP)
-			set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
-		clear_bit(BIO_UPTODATE, &bio->bi_flags);
-	}
-
-	if (bio->bi_private)
-		complete(bio->bi_private);
-	__free_page(bio_page(bio));
-
-	bio_put(bio);
-}
-
-/**
- * blkdev_issue_discard - queue a discard
- * @bdev:	blockdev to issue discard for
- * @sector:	start sector
- * @nr_sects:	number of sectors to discard
- * @gfp_mask:	memory allocation flags (for bio_alloc)
- * @flags:	DISCARD_FL_* flags to control behaviour
- *
- * Description:
- *    Issue a discard request for the sectors in question.
- */
-int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
-			 sector_t nr_sects, gfp_t gfp_mask, int flags)
-{
-	DECLARE_COMPLETION_ONSTACK(wait);
-	struct request_queue *q = bdev_get_queue(bdev);
-	int type = flags & DISCARD_FL_BARRIER ?
-		DISCARD_BARRIER : DISCARD_NOBARRIER;
-	struct bio *bio;
-	struct page *page;
-	int ret = 0;
-
-	if (!q)
-		return -ENXIO;
-
-	if (!blk_queue_discard(q))
-		return -EOPNOTSUPP;
-
-	while (nr_sects && !ret) {
-		unsigned int sector_size = q->limits.logical_block_size;
-		unsigned int max_discard_sectors =
-			min(q->limits.max_discard_sectors, UINT_MAX >> 9);
-
-		bio = bio_alloc(gfp_mask, 1);
-		if (!bio)
-			goto out;
-		bio->bi_sector = sector;
-		bio->bi_end_io = blkdev_discard_end_io;
-		bio->bi_bdev = bdev;
-		if (flags & DISCARD_FL_WAIT)
-			bio->bi_private = &wait;
-
-		/*
-		 * Add a zeroed one-sector payload as that's what
-		 * our current implementations need.  If we'll ever need
-		 * more the interface will need revisiting.
-		 */
-		page = alloc_page(gfp_mask | __GFP_ZERO);
-		if (!page)
-			goto out_free_bio;
-		if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size)
-			goto out_free_page;
-
-		/*
-		 * And override the bio size - the way discard works we
-		 * touch many more blocks on disk than the actual payload
-		 * length.
-		 */
-		if (nr_sects > max_discard_sectors) {
-			bio->bi_size = max_discard_sectors << 9;
-			nr_sects -= max_discard_sectors;
-			sector += max_discard_sectors;
-		} else {
-			bio->bi_size = nr_sects << 9;
-			nr_sects = 0;
-		}
-
-		bio_get(bio);
-		submit_bio(type, bio);
-
-		if (flags & DISCARD_FL_WAIT)
-			wait_for_completion(&wait);
-
-		if (bio_flagged(bio, BIO_EOPNOTSUPP))
-			ret = -EOPNOTSUPP;
-		else if (!bio_flagged(bio, BIO_UPTODATE))
-			ret = -EIO;
-		bio_put(bio);
-	}
-	return ret;
-out_free_page:
-	__free_page(page);
-out_free_bio:
-	bio_put(bio);
-out:
-	return -ENOMEM;
-}
-EXPORT_SYMBOL(blkdev_issue_discard);
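
blkdev_issue_discard() is not deleted outright here: per the Makefile hunk and the diffstat, it moves in generalized form to the new block/blk-lib.c, whose hunk is not reproduced in this excerpt. On the flush side, the filesystem callers listed in the diffstat (fs/ext3/fsync.c, fs/ext4/fsync.c, fs/jbd2/commit.c, etc.) have to adopt the new four-argument signature. A minimal sketch of such a caller, assuming the BLKDEV_IFL_WAIT flag referred to by the new @flags kernel-doc is defined in include/linux/blkdev.h as part of this series (that hunk is not shown):

	/* Sketch only: flush the device's write cache and wait for it,
	 * tolerating devices without barrier/flush support. */
	static int example_sync_flush(struct block_device *bdev)
	{
		int err;

		/* GFP_KERNEL: process context; NULL: the error sector is
		 * not of interest to this caller. */
		err = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);
		if (err == -EOPNOTSUPP)
			err = 0;	/* no cache flush support, nothing to do */
		return err;
	}
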
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 5fe03def34b2..d02bbf88de13 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -15,8 +15,12 @@
15#include <linux/kdev_t.h> 15#include <linux/kdev_t.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/err.h> 17#include <linux/err.h>
18#include <linux/blkdev.h>
18#include <linux/slab.h> 19#include <linux/slab.h>
19#include "blk-cgroup.h" 20#include "blk-cgroup.h"
21#include <linux/genhd.h>
22
23#define MAX_KEY_LEN 100
20 24
21static DEFINE_SPINLOCK(blkio_list_lock); 25static DEFINE_SPINLOCK(blkio_list_lock);
22static LIST_HEAD(blkio_list); 26static LIST_HEAD(blkio_list);
@@ -49,6 +53,32 @@ struct cgroup_subsys blkio_subsys = {
49}; 53};
50EXPORT_SYMBOL_GPL(blkio_subsys); 54EXPORT_SYMBOL_GPL(blkio_subsys);
51 55
56static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
57 struct blkio_policy_node *pn)
58{
59 list_add(&pn->node, &blkcg->policy_list);
60}
61
62/* Must be called with blkcg->lock held */
63static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
64{
65 list_del(&pn->node);
66}
67
68/* Must be called with blkcg->lock held */
69static struct blkio_policy_node *
70blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev)
71{
72 struct blkio_policy_node *pn;
73
74 list_for_each_entry(pn, &blkcg->policy_list, node) {
75 if (pn->dev == dev)
76 return pn;
77 }
78
79 return NULL;
80}
81
52struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) 82struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
53{ 83{
54 return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), 84 return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
@@ -56,13 +86,259 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
56} 86}
57EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); 87EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
58 88
59void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, 89/*
60 unsigned long time, unsigned long sectors) 90 * Add to the appropriate stat variable depending on the request type.
91 * This should be called with the blkg->stats_lock held.
92 */
93static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
94 bool sync)
95{
96 if (direction)
97 stat[BLKIO_STAT_WRITE] += add;
98 else
99 stat[BLKIO_STAT_READ] += add;
100 if (sync)
101 stat[BLKIO_STAT_SYNC] += add;
102 else
103 stat[BLKIO_STAT_ASYNC] += add;
104}
105
106/*
107 * Decrements the appropriate stat variable if non-zero depending on the
108 * request type. Panics on value being zero.
109 * This should be called with the blkg->stats_lock held.
110 */
111static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
112{
113 if (direction) {
114 BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
115 stat[BLKIO_STAT_WRITE]--;
116 } else {
117 BUG_ON(stat[BLKIO_STAT_READ] == 0);
118 stat[BLKIO_STAT_READ]--;
119 }
120 if (sync) {
121 BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
122 stat[BLKIO_STAT_SYNC]--;
123 } else {
124 BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
125 stat[BLKIO_STAT_ASYNC]--;
126 }
127}
128
129#ifdef CONFIG_DEBUG_BLK_CGROUP
130/* This should be called with the blkg->stats_lock held. */
131static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
132 struct blkio_group *curr_blkg)
133{
134 if (blkio_blkg_waiting(&blkg->stats))
135 return;
136 if (blkg == curr_blkg)
137 return;
138 blkg->stats.start_group_wait_time = sched_clock();
139 blkio_mark_blkg_waiting(&blkg->stats);
140}
141
142/* This should be called with the blkg->stats_lock held. */
143static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
144{
145 unsigned long long now;
146
147 if (!blkio_blkg_waiting(stats))
148 return;
149
150 now = sched_clock();
151 if (time_after64(now, stats->start_group_wait_time))
152 stats->group_wait_time += now - stats->start_group_wait_time;
153 blkio_clear_blkg_waiting(stats);
154}
155
156/* This should be called with the blkg->stats_lock held. */
157static void blkio_end_empty_time(struct blkio_group_stats *stats)
158{
159 unsigned long long now;
160
161 if (!blkio_blkg_empty(stats))
162 return;
163
164 now = sched_clock();
165 if (time_after64(now, stats->start_empty_time))
166 stats->empty_time += now - stats->start_empty_time;
167 blkio_clear_blkg_empty(stats);
168}
169
170void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
171{
172 unsigned long flags;
173
174 spin_lock_irqsave(&blkg->stats_lock, flags);
175 BUG_ON(blkio_blkg_idling(&blkg->stats));
176 blkg->stats.start_idle_time = sched_clock();
177 blkio_mark_blkg_idling(&blkg->stats);
178 spin_unlock_irqrestore(&blkg->stats_lock, flags);
179}
180EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);
181
182void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
183{
184 unsigned long flags;
185 unsigned long long now;
186 struct blkio_group_stats *stats;
187
188 spin_lock_irqsave(&blkg->stats_lock, flags);
189 stats = &blkg->stats;
190 if (blkio_blkg_idling(stats)) {
191 now = sched_clock();
192 if (time_after64(now, stats->start_idle_time))
193 stats->idle_time += now - stats->start_idle_time;
194 blkio_clear_blkg_idling(stats);
195 }
196 spin_unlock_irqrestore(&blkg->stats_lock, flags);
197}
198EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);
199
200void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
201{
202 unsigned long flags;
203 struct blkio_group_stats *stats;
204
205 spin_lock_irqsave(&blkg->stats_lock, flags);
206 stats = &blkg->stats;
207 stats->avg_queue_size_sum +=
208 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
209 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
210 stats->avg_queue_size_samples++;
211 blkio_update_group_wait_time(stats);
212 spin_unlock_irqrestore(&blkg->stats_lock, flags);
213}
214EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);
215
216void blkiocg_set_start_empty_time(struct blkio_group *blkg)
217{
218 unsigned long flags;
219 struct blkio_group_stats *stats;
220
221 spin_lock_irqsave(&blkg->stats_lock, flags);
222 stats = &blkg->stats;
223
224 if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
225 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
226 spin_unlock_irqrestore(&blkg->stats_lock, flags);
227 return;
228 }
229
230 /*
231 * group is already marked empty. This can happen if cfqq got new
232 * request in parent group and moved to this group while being added
233 * to service tree. Just ignore the event and move on.
234 */
235 if(blkio_blkg_empty(stats)) {
236 spin_unlock_irqrestore(&blkg->stats_lock, flags);
237 return;
238 }
239
240 stats->start_empty_time = sched_clock();
241 blkio_mark_blkg_empty(stats);
242 spin_unlock_irqrestore(&blkg->stats_lock, flags);
243}
244EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);
245
246void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
247 unsigned long dequeue)
248{
249 blkg->stats.dequeue += dequeue;
250}
251EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
252#else
253static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
254 struct blkio_group *curr_blkg) {}
255static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {}
256#endif
257
258void blkiocg_update_io_add_stats(struct blkio_group *blkg,
259 struct blkio_group *curr_blkg, bool direction,
260 bool sync)
261{
262 unsigned long flags;
263
264 spin_lock_irqsave(&blkg->stats_lock, flags);
265 blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
266 sync);
267 blkio_end_empty_time(&blkg->stats);
268 blkio_set_start_group_wait_time(blkg, curr_blkg);
269 spin_unlock_irqrestore(&blkg->stats_lock, flags);
270}
271EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);
272
273void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
274 bool direction, bool sync)
275{
276 unsigned long flags;
277
278 spin_lock_irqsave(&blkg->stats_lock, flags);
279 blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED],
280 direction, sync);
281 spin_unlock_irqrestore(&blkg->stats_lock, flags);
282}
283EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
284
285void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time)
286{
287 unsigned long flags;
288
289 spin_lock_irqsave(&blkg->stats_lock, flags);
290 blkg->stats.time += time;
291 spin_unlock_irqrestore(&blkg->stats_lock, flags);
292}
293EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
294
295void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
296 uint64_t bytes, bool direction, bool sync)
297{
298 struct blkio_group_stats *stats;
299 unsigned long flags;
300
301 spin_lock_irqsave(&blkg->stats_lock, flags);
302 stats = &blkg->stats;
303 stats->sectors += bytes >> 9;
304 blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction,
305 sync);
306 blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes,
307 direction, sync);
308 spin_unlock_irqrestore(&blkg->stats_lock, flags);
309}
310EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
311
312void blkiocg_update_completion_stats(struct blkio_group *blkg,
313 uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
314{
315 struct blkio_group_stats *stats;
316 unsigned long flags;
317 unsigned long long now = sched_clock();
318
319 spin_lock_irqsave(&blkg->stats_lock, flags);
320 stats = &blkg->stats;
321 if (time_after64(now, io_start_time))
322 blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
323 now - io_start_time, direction, sync);
324 if (time_after64(io_start_time, start_time))
325 blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
326 io_start_time - start_time, direction, sync);
327 spin_unlock_irqrestore(&blkg->stats_lock, flags);
328}
329EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
330
331void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
332 bool sync)
61{ 333{
62 blkg->time += time; 334 unsigned long flags;
63 blkg->sectors += sectors; 335
336 spin_lock_irqsave(&blkg->stats_lock, flags);
337 blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction,
338 sync);
339 spin_unlock_irqrestore(&blkg->stats_lock, flags);
64} 340}
65EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_stats); 341EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
66 342
67void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 343void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
68 struct blkio_group *blkg, void *key, dev_t dev) 344 struct blkio_group *blkg, void *key, dev_t dev)
@@ -70,14 +346,13 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
70 unsigned long flags; 346 unsigned long flags;
71 347
72 spin_lock_irqsave(&blkcg->lock, flags); 348 spin_lock_irqsave(&blkcg->lock, flags);
349 spin_lock_init(&blkg->stats_lock);
73 rcu_assign_pointer(blkg->key, key); 350 rcu_assign_pointer(blkg->key, key);
74 blkg->blkcg_id = css_id(&blkcg->css); 351 blkg->blkcg_id = css_id(&blkcg->css);
75 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); 352 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
76 spin_unlock_irqrestore(&blkcg->lock, flags); 353 spin_unlock_irqrestore(&blkcg->lock, flags);
77#ifdef CONFIG_DEBUG_BLK_CGROUP
78 /* Need to take css reference ? */ 354 /* Need to take css reference ? */
79 cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); 355 cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
80#endif
81 blkg->dev = dev; 356 blkg->dev = dev;
82} 357}
83EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group); 358EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);
@@ -154,6 +429,7 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
154 struct blkio_group *blkg; 429 struct blkio_group *blkg;
155 struct hlist_node *n; 430 struct hlist_node *n;
156 struct blkio_policy_type *blkiop; 431 struct blkio_policy_type *blkiop;
432 struct blkio_policy_node *pn;
157 433
158 if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) 434 if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
159 return -EINVAL; 435 return -EINVAL;
@@ -162,7 +438,13 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
162 spin_lock(&blkio_list_lock); 438 spin_lock(&blkio_list_lock);
163 spin_lock_irq(&blkcg->lock); 439 spin_lock_irq(&blkcg->lock);
164 blkcg->weight = (unsigned int)val; 440 blkcg->weight = (unsigned int)val;
441
165 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { 442 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
443 pn = blkio_policy_search_node(blkcg, blkg->dev);
444
445 if (pn)
446 continue;
447
166 list_for_each_entry(blkiop, &blkio_list, list) 448 list_for_each_entry(blkiop, &blkio_list, list)
167 blkiop->ops.blkio_update_group_weight_fn(blkg, 449 blkiop->ops.blkio_update_group_weight_fn(blkg,
168 blkcg->weight); 450 blkcg->weight);
@@ -172,13 +454,154 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
172 return 0; 454 return 0;
173} 455}
174 456
175#define SHOW_FUNCTION_PER_GROUP(__VAR) \ 457static int
458blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
459{
460 struct blkio_cgroup *blkcg;
461 struct blkio_group *blkg;
462 struct blkio_group_stats *stats;
463 struct hlist_node *n;
464 uint64_t queued[BLKIO_STAT_TOTAL];
465 int i;
466#ifdef CONFIG_DEBUG_BLK_CGROUP
467 bool idling, waiting, empty;
468 unsigned long long now = sched_clock();
469#endif
470
471 blkcg = cgroup_to_blkio_cgroup(cgroup);
472 spin_lock_irq(&blkcg->lock);
473 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
474 spin_lock(&blkg->stats_lock);
475 stats = &blkg->stats;
476#ifdef CONFIG_DEBUG_BLK_CGROUP
477 idling = blkio_blkg_idling(stats);
478 waiting = blkio_blkg_waiting(stats);
479 empty = blkio_blkg_empty(stats);
480#endif
481 for (i = 0; i < BLKIO_STAT_TOTAL; i++)
482 queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
483 memset(stats, 0, sizeof(struct blkio_group_stats));
484 for (i = 0; i < BLKIO_STAT_TOTAL; i++)
485 stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
486#ifdef CONFIG_DEBUG_BLK_CGROUP
487 if (idling) {
488 blkio_mark_blkg_idling(stats);
489 stats->start_idle_time = now;
490 }
491 if (waiting) {
492 blkio_mark_blkg_waiting(stats);
493 stats->start_group_wait_time = now;
494 }
495 if (empty) {
496 blkio_mark_blkg_empty(stats);
497 stats->start_empty_time = now;
498 }
499#endif
500 spin_unlock(&blkg->stats_lock);
501 }
502 spin_unlock_irq(&blkcg->lock);
503 return 0;
504}
505
506static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str,
507 int chars_left, bool diskname_only)
508{
509 snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev));
510 chars_left -= strlen(str);
511 if (chars_left <= 0) {
512 printk(KERN_WARNING
513 "Possibly incorrect cgroup stat display format");
514 return;
515 }
516 if (diskname_only)
517 return;
518 switch (type) {
519 case BLKIO_STAT_READ:
520 strlcat(str, " Read", chars_left);
521 break;
522 case BLKIO_STAT_WRITE:
523 strlcat(str, " Write", chars_left);
524 break;
525 case BLKIO_STAT_SYNC:
526 strlcat(str, " Sync", chars_left);
527 break;
528 case BLKIO_STAT_ASYNC:
529 strlcat(str, " Async", chars_left);
530 break;
531 case BLKIO_STAT_TOTAL:
532 strlcat(str, " Total", chars_left);
533 break;
534 default:
535 strlcat(str, " Invalid", chars_left);
536 }
537}
538
539static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
540 struct cgroup_map_cb *cb, dev_t dev)
541{
542 blkio_get_key_name(0, dev, str, chars_left, true);
543 cb->fill(cb, str, val);
544 return val;
545}
546
547/* This should be called with blkg->stats_lock held */
548static uint64_t blkio_get_stat(struct blkio_group *blkg,
549 struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
550{
551 uint64_t disk_total;
552 char key_str[MAX_KEY_LEN];
553 enum stat_sub_type sub_type;
554
555 if (type == BLKIO_STAT_TIME)
556 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
557 blkg->stats.time, cb, dev);
558 if (type == BLKIO_STAT_SECTORS)
559 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
560 blkg->stats.sectors, cb, dev);
561#ifdef CONFIG_DEBUG_BLK_CGROUP
562 if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
563 uint64_t sum = blkg->stats.avg_queue_size_sum;
564 uint64_t samples = blkg->stats.avg_queue_size_samples;
565 if (samples)
566 do_div(sum, samples);
567 else
568 sum = 0;
569 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev);
570 }
571 if (type == BLKIO_STAT_GROUP_WAIT_TIME)
572 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
573 blkg->stats.group_wait_time, cb, dev);
574 if (type == BLKIO_STAT_IDLE_TIME)
575 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
576 blkg->stats.idle_time, cb, dev);
577 if (type == BLKIO_STAT_EMPTY_TIME)
578 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
579 blkg->stats.empty_time, cb, dev);
580 if (type == BLKIO_STAT_DEQUEUE)
581 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
582 blkg->stats.dequeue, cb, dev);
583#endif
584
585 for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
586 sub_type++) {
587 blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
588 cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
589 }
590 disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
591 blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
592 blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
593 cb->fill(cb, key_str, disk_total);
594 return disk_total;
595}
596
597#define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total) \
176static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \ 598static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \
177 struct cftype *cftype, struct seq_file *m) \ 599 struct cftype *cftype, struct cgroup_map_cb *cb) \
178{ \ 600{ \
179 struct blkio_cgroup *blkcg; \ 601 struct blkio_cgroup *blkcg; \
180 struct blkio_group *blkg; \ 602 struct blkio_group *blkg; \
181 struct hlist_node *n; \ 603 struct hlist_node *n; \
604 uint64_t cgroup_total = 0; \
182 \ 605 \
183 if (!cgroup_lock_live_group(cgroup)) \ 606 if (!cgroup_lock_live_group(cgroup)) \
184 return -ENODEV; \ 607 return -ENODEV; \
@@ -186,50 +609,295 @@ static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \
186 blkcg = cgroup_to_blkio_cgroup(cgroup); \ 609 blkcg = cgroup_to_blkio_cgroup(cgroup); \
187 rcu_read_lock(); \ 610 rcu_read_lock(); \
188 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\ 611 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\
189 if (blkg->dev) \ 612 if (blkg->dev) { \
190 seq_printf(m, "%u:%u %lu\n", MAJOR(blkg->dev), \ 613 spin_lock_irq(&blkg->stats_lock); \
191 MINOR(blkg->dev), blkg->__VAR); \ 614 cgroup_total += blkio_get_stat(blkg, cb, \
615 blkg->dev, type); \
616 spin_unlock_irq(&blkg->stats_lock); \
617 } \
192 } \ 618 } \
619 if (show_total) \
620 cb->fill(cb, "Total", cgroup_total); \
193 rcu_read_unlock(); \ 621 rcu_read_unlock(); \
194 cgroup_unlock(); \ 622 cgroup_unlock(); \
195 return 0; \ 623 return 0; \
196} 624}
197 625
198SHOW_FUNCTION_PER_GROUP(time); 626SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0);
199SHOW_FUNCTION_PER_GROUP(sectors); 627SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0);
628SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1);
629SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1);
630SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1);
631SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1);
632SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1);
633SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1);
200#ifdef CONFIG_DEBUG_BLK_CGROUP 634#ifdef CONFIG_DEBUG_BLK_CGROUP
201SHOW_FUNCTION_PER_GROUP(dequeue); 635SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0);
636SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0);
637SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0);
638SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0);
639SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0);
202#endif 640#endif
203#undef SHOW_FUNCTION_PER_GROUP 641#undef SHOW_FUNCTION_PER_GROUP
204 642
205#ifdef CONFIG_DEBUG_BLK_CGROUP 643static int blkio_check_dev_num(dev_t dev)
206void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
207 unsigned long dequeue)
208{ 644{
209 blkg->dequeue += dequeue; 645 int part = 0;
646 struct gendisk *disk;
647
648 disk = get_gendisk(dev, &part);
649 if (!disk || part)
650 return -ENODEV;
651
652 return 0;
653}
654
655static int blkio_policy_parse_and_set(char *buf,
656 struct blkio_policy_node *newpn)
657{
658 char *s[4], *p, *major_s = NULL, *minor_s = NULL;
659 int ret;
660 unsigned long major, minor, temp;
661 int i = 0;
662 dev_t dev;
663
664 memset(s, 0, sizeof(s));
665
666 while ((p = strsep(&buf, " ")) != NULL) {
667 if (!*p)
668 continue;
669
670 s[i++] = p;
671
672		/* Prevent inputting too many things */
673 if (i == 3)
674 break;
675 }
676
677 if (i != 2)
678 return -EINVAL;
679
680 p = strsep(&s[0], ":");
681 if (p != NULL)
682 major_s = p;
683 else
684 return -EINVAL;
685
686 minor_s = s[0];
687 if (!minor_s)
688 return -EINVAL;
689
690 ret = strict_strtoul(major_s, 10, &major);
691 if (ret)
692 return -EINVAL;
693
694 ret = strict_strtoul(minor_s, 10, &minor);
695 if (ret)
696 return -EINVAL;
697
698 dev = MKDEV(major, minor);
699
700 ret = blkio_check_dev_num(dev);
701 if (ret)
702 return ret;
703
704 newpn->dev = dev;
705
706 if (s[1] == NULL)
707 return -EINVAL;
708
709 ret = strict_strtoul(s[1], 10, &temp);
710 if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) ||
711 temp > BLKIO_WEIGHT_MAX)
712 return -EINVAL;
713
714 newpn->weight = temp;
715
716 return 0;
717}
718
719unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
720 dev_t dev)
721{
722 struct blkio_policy_node *pn;
723
724 pn = blkio_policy_search_node(blkcg, dev);
725 if (pn)
726 return pn->weight;
727 else
728 return blkcg->weight;
729}
730EXPORT_SYMBOL_GPL(blkcg_get_weight);
731
732
733static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
734 const char *buffer)
735{
736 int ret = 0;
737 char *buf;
738 struct blkio_policy_node *newpn, *pn;
739 struct blkio_cgroup *blkcg;
740 struct blkio_group *blkg;
741 int keep_newpn = 0;
742 struct hlist_node *n;
743 struct blkio_policy_type *blkiop;
744
745 buf = kstrdup(buffer, GFP_KERNEL);
746 if (!buf)
747 return -ENOMEM;
748
749 newpn = kzalloc(sizeof(*newpn), GFP_KERNEL);
750 if (!newpn) {
751 ret = -ENOMEM;
752 goto free_buf;
753 }
754
755 ret = blkio_policy_parse_and_set(buf, newpn);
756 if (ret)
757 goto free_newpn;
758
759 blkcg = cgroup_to_blkio_cgroup(cgrp);
760
761 spin_lock_irq(&blkcg->lock);
762
763 pn = blkio_policy_search_node(blkcg, newpn->dev);
764 if (!pn) {
765 if (newpn->weight != 0) {
766 blkio_policy_insert_node(blkcg, newpn);
767 keep_newpn = 1;
768 }
769 spin_unlock_irq(&blkcg->lock);
770 goto update_io_group;
771 }
772
773 if (newpn->weight == 0) {
774		/* weight == 0 means deleting a specific weight */
775 blkio_policy_delete_node(pn);
776 spin_unlock_irq(&blkcg->lock);
777 goto update_io_group;
778 }
779 spin_unlock_irq(&blkcg->lock);
780
781 pn->weight = newpn->weight;
782
783update_io_group:
784 /* update weight for each cfqg */
785 spin_lock(&blkio_list_lock);
786 spin_lock_irq(&blkcg->lock);
787
788 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
789 if (newpn->dev == blkg->dev) {
790 list_for_each_entry(blkiop, &blkio_list, list)
791 blkiop->ops.blkio_update_group_weight_fn(blkg,
792 newpn->weight ?
793 newpn->weight :
794 blkcg->weight);
795 }
796 }
797
798 spin_unlock_irq(&blkcg->lock);
799 spin_unlock(&blkio_list_lock);
800
801free_newpn:
802 if (!keep_newpn)
803 kfree(newpn);
804free_buf:
805 kfree(buf);
806 return ret;
807}
808
809static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft,
810 struct seq_file *m)
811{
812 struct blkio_cgroup *blkcg;
813 struct blkio_policy_node *pn;
814
815 seq_printf(m, "dev\tweight\n");
816
817 blkcg = cgroup_to_blkio_cgroup(cgrp);
818 if (list_empty(&blkcg->policy_list))
819 goto out;
820
821 spin_lock_irq(&blkcg->lock);
822 list_for_each_entry(pn, &blkcg->policy_list, node) {
823 seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
824 MINOR(pn->dev), pn->weight);
825 }
826 spin_unlock_irq(&blkcg->lock);
827
828out:
829 return 0;
210} 830}
211EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_dequeue_stats);
212#endif
213 831
214struct cftype blkio_files[] = { 832struct cftype blkio_files[] = {
215 { 833 {
834 .name = "weight_device",
835 .read_seq_string = blkiocg_weight_device_read,
836 .write_string = blkiocg_weight_device_write,
837 .max_write_len = 256,
838 },
839 {
216 .name = "weight", 840 .name = "weight",
217 .read_u64 = blkiocg_weight_read, 841 .read_u64 = blkiocg_weight_read,
218 .write_u64 = blkiocg_weight_write, 842 .write_u64 = blkiocg_weight_write,
219 }, 843 },
220 { 844 {
221 .name = "time", 845 .name = "time",
222 .read_seq_string = blkiocg_time_read, 846 .read_map = blkiocg_time_read,
223 }, 847 },
224 { 848 {
225 .name = "sectors", 849 .name = "sectors",
226 .read_seq_string = blkiocg_sectors_read, 850 .read_map = blkiocg_sectors_read,
851 },
852 {
853 .name = "io_service_bytes",
854 .read_map = blkiocg_io_service_bytes_read,
855 },
856 {
857 .name = "io_serviced",
858 .read_map = blkiocg_io_serviced_read,
859 },
860 {
861 .name = "io_service_time",
862 .read_map = blkiocg_io_service_time_read,
863 },
864 {
865 .name = "io_wait_time",
866 .read_map = blkiocg_io_wait_time_read,
867 },
868 {
869 .name = "io_merged",
870 .read_map = blkiocg_io_merged_read,
871 },
872 {
873 .name = "io_queued",
874 .read_map = blkiocg_io_queued_read,
875 },
876 {
877 .name = "reset_stats",
878 .write_u64 = blkiocg_reset_stats,
227 }, 879 },
228#ifdef CONFIG_DEBUG_BLK_CGROUP 880#ifdef CONFIG_DEBUG_BLK_CGROUP
229 { 881 {
882 .name = "avg_queue_size",
883 .read_map = blkiocg_avg_queue_size_read,
884 },
885 {
886 .name = "group_wait_time",
887 .read_map = blkiocg_group_wait_time_read,
888 },
889 {
890 .name = "idle_time",
891 .read_map = blkiocg_idle_time_read,
892 },
893 {
894 .name = "empty_time",
895 .read_map = blkiocg_empty_time_read,
896 },
897 {
230 .name = "dequeue", 898 .name = "dequeue",
231 .read_seq_string = blkiocg_dequeue_read, 899 .read_map = blkiocg_dequeue_read,
232 }, 900 },
233#endif 901#endif
234}; 902};
235 903
@@ -246,6 +914,7 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
246 struct blkio_group *blkg; 914 struct blkio_group *blkg;
247 void *key; 915 void *key;
248 struct blkio_policy_type *blkiop; 916 struct blkio_policy_type *blkiop;
917 struct blkio_policy_node *pn, *pntmp;
249 918
250 rcu_read_lock(); 919 rcu_read_lock();
251remove_entry: 920remove_entry:
@@ -276,7 +945,12 @@ remove_entry:
276 blkiop->ops.blkio_unlink_group_fn(key, blkg); 945 blkiop->ops.blkio_unlink_group_fn(key, blkg);
277 spin_unlock(&blkio_list_lock); 946 spin_unlock(&blkio_list_lock);
278 goto remove_entry; 947 goto remove_entry;
948
279done: 949done:
950 list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) {
951 blkio_policy_delete_node(pn);
952 kfree(pn);
953 }
280 free_css_id(&blkio_subsys, &blkcg->css); 954 free_css_id(&blkio_subsys, &blkcg->css);
281 rcu_read_unlock(); 955 rcu_read_unlock();
282 if (blkcg != &blkio_root_cgroup) 956 if (blkcg != &blkio_root_cgroup)
@@ -307,6 +981,7 @@ done:
307 spin_lock_init(&blkcg->lock); 981 spin_lock_init(&blkcg->lock);
308 INIT_HLIST_HEAD(&blkcg->blkg_list); 982 INIT_HLIST_HEAD(&blkcg->blkg_list);
309 983
984 INIT_LIST_HEAD(&blkcg->policy_list);
310 return &blkcg->css; 985 return &blkcg->css;
311} 986}
312 987
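
The blkiocg_update_*() hooks exported above are meant to be driven by the IO scheduler; the cfq-iosched.c hunk that wires them up is listed in the diffstat but not reproduced here. A rough timeline sketch of the intended call pattern follows. The rq_start_time_ns()/rq_io_start_time_ns() accessors are assumed to come from the include/linux/blkdev.h part of this series (not shown), and example_account_request() is purely illustrative:

	/* Sketch: when each stat hook fires over the life of one request. */
	static void example_account_request(struct blkio_group *blkg,
					    struct blkio_group *curr_blkg,
					    struct request *rq)
	{
		bool direction = rq_data_dir(rq);	/* non-zero for writes */
		bool sync = rq_is_sync(rq);

		/* 1. request enters the scheduler queue (io_queued, group wait) */
		blkiocg_update_io_add_stats(blkg, curr_blkg, direction, sync);

		/* 2. request is dispatched to the driver (io_serviced,
		 *    io_service_bytes, sectors) */
		blkiocg_update_io_remove_stats(blkg, direction, sync);
		blkiocg_update_dispatch_stats(blkg, blk_rq_bytes(rq), direction, sync);

		/* 3. request completes: io_wait_time = dispatch - queue,
		 *    io_service_time = completion - dispatch */
		blkiocg_update_completion_stats(blkg, rq_start_time_ns(rq),
						rq_io_start_time_ns(rq),
						direction, sync);
	}
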
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 8ccc20464dae..2b866ec1dcea 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -23,11 +23,84 @@ extern struct cgroup_subsys blkio_subsys;
23#define blkio_subsys_id blkio_subsys.subsys_id 23#define blkio_subsys_id blkio_subsys.subsys_id
24#endif 24#endif
25 25
26enum stat_type {
27 /* Total time spent (in ns) between request dispatch to the driver and
28	 * request completion for IOs done by this cgroup. This may not be
29 * accurate when NCQ is turned on. */
30 BLKIO_STAT_SERVICE_TIME = 0,
31 /* Total bytes transferred */
32 BLKIO_STAT_SERVICE_BYTES,
33 /* Total IOs serviced, post merge */
34 BLKIO_STAT_SERVICED,
35 /* Total time spent waiting in scheduler queue in ns */
36 BLKIO_STAT_WAIT_TIME,
37 /* Number of IOs merged */
38 BLKIO_STAT_MERGED,
39 /* Number of IOs queued up */
40 BLKIO_STAT_QUEUED,
41 /* All the single valued stats go below this */
42 BLKIO_STAT_TIME,
43 BLKIO_STAT_SECTORS,
44#ifdef CONFIG_DEBUG_BLK_CGROUP
45 BLKIO_STAT_AVG_QUEUE_SIZE,
46 BLKIO_STAT_IDLE_TIME,
47 BLKIO_STAT_EMPTY_TIME,
48 BLKIO_STAT_GROUP_WAIT_TIME,
49 BLKIO_STAT_DEQUEUE
50#endif
51};
52
53enum stat_sub_type {
54 BLKIO_STAT_READ = 0,
55 BLKIO_STAT_WRITE,
56 BLKIO_STAT_SYNC,
57 BLKIO_STAT_ASYNC,
58 BLKIO_STAT_TOTAL
59};
60
61/* blkg state flags */
62enum blkg_state_flags {
63 BLKG_waiting = 0,
64 BLKG_idling,
65 BLKG_empty,
66};
67
26struct blkio_cgroup { 68struct blkio_cgroup {
27 struct cgroup_subsys_state css; 69 struct cgroup_subsys_state css;
28 unsigned int weight; 70 unsigned int weight;
29 spinlock_t lock; 71 spinlock_t lock;
30 struct hlist_head blkg_list; 72 struct hlist_head blkg_list;
73 struct list_head policy_list; /* list of blkio_policy_node */
74};
75
76struct blkio_group_stats {
77 /* total disk time and nr sectors dispatched by this group */
78 uint64_t time;
79 uint64_t sectors;
80 uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL];
81#ifdef CONFIG_DEBUG_BLK_CGROUP
82 /* Sum of number of IOs queued across all samples */
83 uint64_t avg_queue_size_sum;
84 /* Count of samples taken for average */
85 uint64_t avg_queue_size_samples;
86 /* How many times this group has been removed from service tree */
87 unsigned long dequeue;
88
89 /* Total time spent waiting for it to be assigned a timeslice. */
90 uint64_t group_wait_time;
91 uint64_t start_group_wait_time;
92
93 /* Time spent idling for this blkio_group */
94 uint64_t idle_time;
95 uint64_t start_idle_time;
96 /*
97 * Total time when we have requests queued and do not contain the
98 * current active queue.
99 */
100 uint64_t empty_time;
101 uint64_t start_empty_time;
102 uint16_t flags;
103#endif
31}; 104};
32 105
33struct blkio_group { 106struct blkio_group {
@@ -35,20 +108,25 @@ struct blkio_group {
35 void *key; 108 void *key;
36 struct hlist_node blkcg_node; 109 struct hlist_node blkcg_node;
37 unsigned short blkcg_id; 110 unsigned short blkcg_id;
38#ifdef CONFIG_DEBUG_BLK_CGROUP
39 /* Store cgroup path */ 111 /* Store cgroup path */
40 char path[128]; 112 char path[128];
41 /* How many times this group has been removed from service tree */
42 unsigned long dequeue;
43#endif
44 /* The device MKDEV(major, minor), this group has been created for */ 113 /* The device MKDEV(major, minor), this group has been created for */
45 dev_t dev; 114 dev_t dev;
46 115
47 /* total disk time and nr sectors dispatched by this group */ 116 /* Need to serialize the stats in the case of reset/update */
48 unsigned long time; 117 spinlock_t stats_lock;
49 unsigned long sectors; 118 struct blkio_group_stats stats;
50}; 119};
51 120
121struct blkio_policy_node {
122 struct list_head node;
123 dev_t dev;
124 unsigned int weight;
125};
126
127extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
128 dev_t dev);
129
52typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); 130typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg);
53typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg, 131typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg,
54 unsigned int weight); 132 unsigned int weight);
@@ -67,6 +145,11 @@ struct blkio_policy_type {
67extern void blkio_policy_register(struct blkio_policy_type *); 145extern void blkio_policy_register(struct blkio_policy_type *);
68extern void blkio_policy_unregister(struct blkio_policy_type *); 146extern void blkio_policy_unregister(struct blkio_policy_type *);
69 147
148static inline char *blkg_path(struct blkio_group *blkg)
149{
150 return blkg->path;
151}
152
70#else 153#else
71 154
72struct blkio_group { 155struct blkio_group {
@@ -78,6 +161,8 @@ struct blkio_policy_type {
78static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { } 161static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { }
79static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { } 162static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { }
80 163
164static inline char *blkg_path(struct blkio_group *blkg) { return NULL; }
165
81#endif 166#endif
82 167
83#define BLKIO_WEIGHT_MIN 100 168#define BLKIO_WEIGHT_MIN 100
@@ -85,16 +170,42 @@ static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { }
85#define BLKIO_WEIGHT_DEFAULT 500 170#define BLKIO_WEIGHT_DEFAULT 500
86 171
87#ifdef CONFIG_DEBUG_BLK_CGROUP 172#ifdef CONFIG_DEBUG_BLK_CGROUP
88static inline char *blkg_path(struct blkio_group *blkg) 173void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg);
89{ 174void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
90 return blkg->path;
91}
92void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
93 unsigned long dequeue); 175 unsigned long dequeue);
176void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg);
177void blkiocg_update_idle_time_stats(struct blkio_group *blkg);
178void blkiocg_set_start_empty_time(struct blkio_group *blkg);
179
180#define BLKG_FLAG_FNS(name) \
181static inline void blkio_mark_blkg_##name( \
182 struct blkio_group_stats *stats) \
183{ \
184 stats->flags |= (1 << BLKG_##name); \
185} \
186static inline void blkio_clear_blkg_##name( \
187 struct blkio_group_stats *stats) \
188{ \
189 stats->flags &= ~(1 << BLKG_##name); \
190} \
191static inline int blkio_blkg_##name(struct blkio_group_stats *stats) \
192{ \
193 return (stats->flags & (1 << BLKG_##name)) != 0; \
194} \
195
196BLKG_FLAG_FNS(waiting)
197BLKG_FLAG_FNS(idling)
198BLKG_FLAG_FNS(empty)
199#undef BLKG_FLAG_FNS
94#else 200#else
95static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } 201static inline void blkiocg_update_avg_queue_size_stats(
96static inline void blkiocg_update_blkio_group_dequeue_stats( 202 struct blkio_group *blkg) {}
97 struct blkio_group *blkg, unsigned long dequeue) {} 203static inline void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
204 unsigned long dequeue) {}
205static inline void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
206{}
207static inline void blkiocg_update_idle_time_stats(struct blkio_group *blkg) {}
208static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {}
98#endif 209#endif
99 210
100#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) 211#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
@@ -105,26 +216,43 @@ extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
105extern int blkiocg_del_blkio_group(struct blkio_group *blkg); 216extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
106extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, 217extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
107 void *key); 218 void *key);
108void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, 219void blkiocg_update_timeslice_used(struct blkio_group *blkg,
109 unsigned long time, unsigned long sectors); 220 unsigned long time);
221void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes,
222 bool direction, bool sync);
223void blkiocg_update_completion_stats(struct blkio_group *blkg,
224 uint64_t start_time, uint64_t io_start_time, bool direction, bool sync);
225void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
226 bool sync);
227void blkiocg_update_io_add_stats(struct blkio_group *blkg,
228 struct blkio_group *curr_blkg, bool direction, bool sync);
229void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
230 bool direction, bool sync);
110#else 231#else
111struct cgroup; 232struct cgroup;
112static inline struct blkio_cgroup * 233static inline struct blkio_cgroup *
113cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } 234cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
114 235
115static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 236static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
116 struct blkio_group *blkg, void *key, dev_t dev) 237 struct blkio_group *blkg, void *key, dev_t dev) {}
117{
118}
119 238
120static inline int 239static inline int
121blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } 240blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
122 241
123static inline struct blkio_group * 242static inline struct blkio_group *
124blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } 243blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; }
125static inline void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, 244static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg,
126 unsigned long time, unsigned long sectors) 245 unsigned long time) {}
127{ 246static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
128} 247 uint64_t bytes, bool direction, bool sync) {}
248static inline void blkiocg_update_completion_stats(struct blkio_group *blkg,
249 uint64_t start_time, uint64_t io_start_time, bool direction,
250 bool sync) {}
251static inline void blkiocg_update_io_merged_stats(struct blkio_group *blkg,
252 bool direction, bool sync) {}
253static inline void blkiocg_update_io_add_stats(struct blkio_group *blkg,
254 struct blkio_group *curr_blkg, bool direction, bool sync) {}
255static inline void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
256 bool direction, bool sync) {}
129#endif 257#endif
130#endif /* _BLK_CGROUP_H */ 258#endif /* _BLK_CGROUP_H */
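
The BLKG_FLAG_FNS() macro above stamps out a set/clear/test helper triple for every per-group stat flag instead of hand-writing nine nearly identical functions, keeping the helpers in lockstep with the matching BLKG_* flag enum (not shown in this hunk). A minimal userspace sketch of the same token-pasting pattern, with illustrative names rather than the kernel's, that compiles standalone:

#include <stdio.h>

struct group_stats {
	unsigned long flags;
};

enum { STAT_waiting, STAT_idling, STAT_empty };

/* Stamp out mark/clear/test helpers for one flag bit, kernel-style. */
#define STAT_FLAG_FNS(name)						\
static inline void mark_##name(struct group_stats *stats)		\
{									\
	stats->flags |= (1UL << STAT_##name);				\
}									\
static inline void clear_##name(struct group_stats *stats)		\
{									\
	stats->flags &= ~(1UL << STAT_##name);				\
}									\
static inline int is_##name(struct group_stats *stats)			\
{									\
	return (stats->flags & (1UL << STAT_##name)) != 0;		\
}

STAT_FLAG_FNS(waiting)
STAT_FLAG_FNS(idling)
STAT_FLAG_FNS(empty)
#undef STAT_FLAG_FNS

int main(void)
{
	struct group_stats stats = { 0 };

	mark_waiting(&stats);
	printf("waiting=%d idling=%d\n", is_waiting(&stats), is_idling(&stats));
	clear_waiting(&stats);
	printf("waiting=%d\n", is_waiting(&stats));
	return 0;
}

Each STAT_FLAG_FNS(x) invocation expands the same way, which is exactly why the header can add or drop a flag by touching two lines.
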
diff --git a/block/blk-core.c b/block/blk-core.c
index 9fe174dc74d1..e9a5ae25db8c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -127,6 +127,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
127 rq->tag = -1; 127 rq->tag = -1;
128 rq->ref_count = 1; 128 rq->ref_count = 1;
129 rq->start_time = jiffies; 129 rq->start_time = jiffies;
130 set_start_time_ns(rq);
130} 131}
131EXPORT_SYMBOL(blk_rq_init); 132EXPORT_SYMBOL(blk_rq_init);
132 133
@@ -450,6 +451,7 @@ void blk_cleanup_queue(struct request_queue *q)
450 */ 451 */
451 blk_sync_queue(q); 452 blk_sync_queue(q);
452 453
454 del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
453 mutex_lock(&q->sysfs_lock); 455 mutex_lock(&q->sysfs_lock);
454 queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q); 456 queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
455 mutex_unlock(&q->sysfs_lock); 457 mutex_unlock(&q->sysfs_lock);
@@ -510,6 +512,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
510 return NULL; 512 return NULL;
511 } 513 }
512 514
515 setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
516 laptop_mode_timer_fn, (unsigned long) q);
513 init_timer(&q->unplug_timer); 517 init_timer(&q->unplug_timer);
514 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); 518 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
515 INIT_LIST_HEAD(&q->timeout_list); 519 INIT_LIST_HEAD(&q->timeout_list);
@@ -1198,6 +1202,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
1198 if (!blk_rq_cpu_valid(req)) 1202 if (!blk_rq_cpu_valid(req))
1199 req->cpu = bio->bi_comp_cpu; 1203 req->cpu = bio->bi_comp_cpu;
1200 drive_stat_acct(req, 0); 1204 drive_stat_acct(req, 0);
1205 elv_bio_merged(q, req, bio);
1201 if (!attempt_back_merge(q, req)) 1206 if (!attempt_back_merge(q, req))
1202 elv_merged_request(q, req, el_ret); 1207 elv_merged_request(q, req, el_ret);
1203 goto out; 1208 goto out;
@@ -1231,6 +1236,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
1231 if (!blk_rq_cpu_valid(req)) 1236 if (!blk_rq_cpu_valid(req))
1232 req->cpu = bio->bi_comp_cpu; 1237 req->cpu = bio->bi_comp_cpu;
1233 drive_stat_acct(req, 0); 1238 drive_stat_acct(req, 0);
1239 elv_bio_merged(q, req, bio);
1234 if (!attempt_front_merge(q, req)) 1240 if (!attempt_front_merge(q, req))
1235 elv_merged_request(q, req, el_ret); 1241 elv_merged_request(q, req, el_ret);
1236 goto out; 1242 goto out;
@@ -1855,8 +1861,10 @@ void blk_dequeue_request(struct request *rq)
1855 * and to it is freed is accounted as io that is in progress at 1861 * and to it is freed is accounted as io that is in progress at
1856 * the driver side. 1862 * the driver side.
1857 */ 1863 */
1858 if (blk_account_rq(rq)) 1864 if (blk_account_rq(rq)) {
1859 q->in_flight[rq_is_sync(rq)]++; 1865 q->in_flight[rq_is_sync(rq)]++;
1866 set_io_start_time_ns(rq);
1867 }
1860} 1868}
1861 1869
1862/** 1870/**
@@ -2098,7 +2106,7 @@ static void blk_finish_request(struct request *req, int error)
2098 BUG_ON(blk_queued_rq(req)); 2106 BUG_ON(blk_queued_rq(req));
2099 2107
2100 if (unlikely(laptop_mode) && blk_fs_request(req)) 2108 if (unlikely(laptop_mode) && blk_fs_request(req))
2101 laptop_io_completion(); 2109 laptop_io_completion(&req->q->backing_dev_info);
2102 2110
2103 blk_delete_timer(req); 2111 blk_delete_timer(req);
2104 2112
@@ -2517,4 +2525,3 @@ int __init blk_dev_init(void)
2517 2525
2518 return 0; 2526 return 0;
2519} 2527}
2520
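
The blk-core.c hunks stamp each request with two nanosecond timestamps: start_time_ns when the request is initialised and io_start_time_ns when blk_dequeue_request() hands it to the driver, so the cgroup stats can later split total latency into queue wait and service time. A rough userspace analogue of that two-timestamp split, using CLOCK_MONOTONIC in place of the kernel's sched_clock() (the struct, names, and sleep lengths are illustrative):

#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec;
}

struct fake_request {
	uint64_t start_time_ns;		/* when the request was created */
	uint64_t io_start_time_ns;	/* when it was handed to "hardware" */
};

int main(void)
{
	struct fake_request rq;
	uint64_t end;

	rq.start_time_ns = now_ns();	/* blk_rq_init() analogue */
	usleep(2000);			/* queued, waiting for dispatch */
	rq.io_start_time_ns = now_ns();	/* blk_dequeue_request() analogue */
	usleep(3000);			/* the "device" services the request */
	end = now_ns();

	printf("wait time:    %llu ns\n",
	       (unsigned long long)(rq.io_start_time_ns - rq.start_time_ns));
	printf("service time: %llu ns\n",
	       (unsigned long long)(end - rq.io_start_time_ns));
	return 0;
}
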
diff --git a/block/blk-lib.c b/block/blk-lib.c
new file mode 100644
index 000000000000..d0216b9f22d4
--- /dev/null
+++ b/block/blk-lib.c
@@ -0,0 +1,233 @@
1/*
2 * Functions related to generic helper functions
3 */
4#include <linux/kernel.h>
5#include <linux/module.h>
6#include <linux/bio.h>
7#include <linux/blkdev.h>
8#include <linux/scatterlist.h>
9
10#include "blk.h"
11
12static void blkdev_discard_end_io(struct bio *bio, int err)
13{
14 if (err) {
15 if (err == -EOPNOTSUPP)
16 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
17 clear_bit(BIO_UPTODATE, &bio->bi_flags);
18 }
19
20 if (bio->bi_private)
21 complete(bio->bi_private);
22 __free_page(bio_page(bio));
23
24 bio_put(bio);
25}
26
27/**
28 * blkdev_issue_discard - queue a discard
29 * @bdev: blockdev to issue discard for
30 * @sector: start sector
31 * @nr_sects: number of sectors to discard
32 * @gfp_mask: memory allocation flags (for bio_alloc)
33 * @flags: BLKDEV_IFL_* flags to control behaviour
34 *
35 * Description:
36 * Issue a discard request for the sectors in question.
37 */
38int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
39 sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
40{
41 DECLARE_COMPLETION_ONSTACK(wait);
42 struct request_queue *q = bdev_get_queue(bdev);
43 int type = flags & BLKDEV_IFL_BARRIER ?
44 DISCARD_BARRIER : DISCARD_NOBARRIER;
45 struct bio *bio;
46 struct page *page;
47 int ret = 0;
48
49 if (!q)
50 return -ENXIO;
51
52 if (!blk_queue_discard(q))
53 return -EOPNOTSUPP;
54
55 while (nr_sects && !ret) {
56 unsigned int sector_size = q->limits.logical_block_size;
57 unsigned int max_discard_sectors =
58 min(q->limits.max_discard_sectors, UINT_MAX >> 9);
59
60 bio = bio_alloc(gfp_mask, 1);
61 if (!bio)
62 goto out;
63 bio->bi_sector = sector;
64 bio->bi_end_io = blkdev_discard_end_io;
65 bio->bi_bdev = bdev;
66 if (flags & BLKDEV_IFL_WAIT)
67 bio->bi_private = &wait;
68
69 /*
70 * Add a zeroed one-sector payload as that's what
71 * our current implementations need. If we'll ever need
72 * more the interface will need revisiting.
73 */
74 page = alloc_page(gfp_mask | __GFP_ZERO);
75 if (!page)
76 goto out_free_bio;
77 if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size)
78 goto out_free_page;
79
80 /*
81 * And override the bio size - the way discard works we
82 * touch many more blocks on disk than the actual payload
83 * length.
84 */
85 if (nr_sects > max_discard_sectors) {
86 bio->bi_size = max_discard_sectors << 9;
87 nr_sects -= max_discard_sectors;
88 sector += max_discard_sectors;
89 } else {
90 bio->bi_size = nr_sects << 9;
91 nr_sects = 0;
92 }
93
94 bio_get(bio);
95 submit_bio(type, bio);
96
97 if (flags & BLKDEV_IFL_WAIT)
98 wait_for_completion(&wait);
99
100 if (bio_flagged(bio, BIO_EOPNOTSUPP))
101 ret = -EOPNOTSUPP;
102 else if (!bio_flagged(bio, BIO_UPTODATE))
103 ret = -EIO;
104 bio_put(bio);
105 }
106 return ret;
107out_free_page:
108 __free_page(page);
109out_free_bio:
110 bio_put(bio);
111out:
112 return -ENOMEM;
113}
114EXPORT_SYMBOL(blkdev_issue_discard);
115
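
blkdev_issue_discard() above walks the range in chunks of at most max_discard_sectors, itself clamped to UINT_MAX >> 9 so the byte count still fits in the 32-bit bi_size field after the << 9 shift. The chunking arithmetic in isolation, with made-up limits and 512-byte sectors assumed:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t sector = 0;
	uint64_t nr_sects = 10000;		 /* range to discard */
	unsigned int max_discard_sectors = 4096; /* per-bio limit (example) */

	while (nr_sects) {
		uint64_t this_chunk = nr_sects > max_discard_sectors ?
				      max_discard_sectors : nr_sects;

		/* one bio would cover [sector, sector + this_chunk) */
		printf("discard bio: sector=%llu sectors=%llu bytes=%llu\n",
		       (unsigned long long)sector,
		       (unsigned long long)this_chunk,
		       (unsigned long long)(this_chunk << 9));

		sector += this_chunk;
		nr_sects -= this_chunk;
	}
	return 0;
}
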
116struct bio_batch
117{
118 atomic_t done;
119 unsigned long flags;
120 struct completion *wait;
121 bio_end_io_t *end_io;
122};
123
124static void bio_batch_end_io(struct bio *bio, int err)
125{
126 struct bio_batch *bb = bio->bi_private;
127
128 if (err) {
129 if (err == -EOPNOTSUPP)
130 set_bit(BIO_EOPNOTSUPP, &bb->flags);
131 else
132 clear_bit(BIO_UPTODATE, &bb->flags);
133 }
134 if (bb) {
135 if (bb->end_io)
136 bb->end_io(bio, err);
137 atomic_inc(&bb->done);
138 complete(bb->wait);
139 }
140 bio_put(bio);
141}
142
143/**
144 * blkdev_issue_zeroout - generate a number of zero-filled write bios
145 * @bdev: blockdev to issue
146 * @sector: start sector
147 * @nr_sects: number of sectors to write
148 * @gfp_mask: memory allocation flags (for bio_alloc)
149 * @flags: BLKDEV_IFL_* flags to control behaviour
150 *
151 * Description:
152 * Generate and issue a number of bios with zero-filled pages.
153 * Send a barrier at the beginning and at the end if requested; this guarantees
154 * correct request ordering. An empty barrier allows us to avoid a post-queue flush.
155 */
156
157int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
158 sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
159{
160 int ret = 0;
161 struct bio *bio;
162 struct bio_batch bb;
163 unsigned int sz, issued = 0;
164 DECLARE_COMPLETION_ONSTACK(wait);
165
166 atomic_set(&bb.done, 0);
167 bb.flags = 1 << BIO_UPTODATE;
168 bb.wait = &wait;
169 bb.end_io = NULL;
170
171 if (flags & BLKDEV_IFL_BARRIER) {
172 /* issue async barrier before the data */
173 ret = blkdev_issue_flush(bdev, gfp_mask, NULL, 0);
174 if (ret)
175 return ret;
176 }
177submit:
178 while (nr_sects != 0) {
179 bio = bio_alloc(gfp_mask,
180 min(nr_sects, (sector_t)BIO_MAX_PAGES));
181 if (!bio)
182 break;
183
184 bio->bi_sector = sector;
185 bio->bi_bdev = bdev;
186 bio->bi_end_io = bio_batch_end_io;
187 if (flags & BLKDEV_IFL_WAIT)
188 bio->bi_private = &bb;
189
190 while (nr_sects != 0) {
191 sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects);
192 if (sz == 0)
193 /* bio has maximum size possible */
194 break;
195 ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0);
196 nr_sects -= ret >> 9;
197 sector += ret >> 9;
198 if (ret < (sz << 9))
199 break;
200 }
201 issued++;
202 submit_bio(WRITE, bio);
203 }
204 /*
205	 * When all data bios are in flight, send the final barrier if requested.
206 */
207 if (nr_sects == 0 && flags & BLKDEV_IFL_BARRIER)
208 ret = blkdev_issue_flush(bdev, gfp_mask, NULL,
209 flags & BLKDEV_IFL_WAIT);
210
211
212 if (flags & BLKDEV_IFL_WAIT)
213 /* Wait for bios in-flight */
214 while ( issued != atomic_read(&bb.done))
215 wait_for_completion(&wait);
216
217 if (!test_bit(BIO_UPTODATE, &bb.flags))
218		/* One of the bios in the batch completed with an error. */
219 ret = -EIO;
220
221 if (ret)
222 goto out;
223
224 if (test_bit(BIO_EOPNOTSUPP, &bb.flags)) {
225 ret = -EOPNOTSUPP;
226 goto out;
227 }
228 if (nr_sects != 0)
229 goto submit;
230out:
231 return ret;
232}
233EXPORT_SYMBOL(blkdev_issue_zeroout);
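
struct bio_batch gives blkdev_issue_zeroout() a simple way to wait for a variable number of asynchronous bios: count how many were issued, have each completion bump an atomic counter and signal, and loop on the completion until the two numbers meet. A userspace analogue of that issued/done pattern built on pthreads and C11 atomics (purely illustrative; compile with -pthread):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

/* Userspace analogue of struct bio_batch: count completions of async work. */
struct batch {
	atomic_int done;
	pthread_mutex_t lock;
	pthread_cond_t wait;
};

static void *fake_bio(void *arg)
{
	struct batch *bb = arg;

	usleep(1000);			/* pretend to write zeroes somewhere */
	atomic_fetch_add(&bb->done, 1);	/* like atomic_inc(&bb->done) */
	pthread_mutex_lock(&bb->lock);
	pthread_cond_signal(&bb->wait);	/* like complete(bb->wait) */
	pthread_mutex_unlock(&bb->lock);
	return NULL;
}

int main(void)
{
	enum { ISSUED = 4 };
	struct batch bb;
	pthread_t tid[ISSUED];
	int i;

	atomic_init(&bb.done, 0);
	pthread_mutex_init(&bb.lock, NULL);
	pthread_cond_init(&bb.wait, NULL);

	for (i = 0; i < ISSUED; i++)	/* "submit" the bios */
		pthread_create(&tid[i], NULL, fake_bio, &bb);

	/* Wait until every issued unit of work has signalled completion. */
	pthread_mutex_lock(&bb.lock);
	while (atomic_load(&bb.done) != ISSUED)
		pthread_cond_wait(&bb.wait, &bb.lock);
	pthread_mutex_unlock(&bb.lock);

	for (i = 0; i < ISSUED; i++)
		pthread_join(tid[i], NULL);

	printf("all %d completions received\n", ISSUED);
	return 0;
}
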
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 838834be115b..0f3eb70f9ce1 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -55,6 +55,7 @@ static const int cfq_hist_divisor = 4;
55#define RQ_CIC(rq) \ 55#define RQ_CIC(rq) \
56 ((struct cfq_io_context *) (rq)->elevator_private) 56 ((struct cfq_io_context *) (rq)->elevator_private)
57#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) 57#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2)
58#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private3)
58 59
59static struct kmem_cache *cfq_pool; 60static struct kmem_cache *cfq_pool;
60static struct kmem_cache *cfq_ioc_pool; 61static struct kmem_cache *cfq_ioc_pool;
@@ -143,8 +144,6 @@ struct cfq_queue {
143 struct cfq_queue *new_cfqq; 144 struct cfq_queue *new_cfqq;
144 struct cfq_group *cfqg; 145 struct cfq_group *cfqg;
145 struct cfq_group *orig_cfqg; 146 struct cfq_group *orig_cfqg;
146 /* Sectors dispatched in current dispatch round */
147 unsigned long nr_sectors;
148}; 147};
149 148
150/* 149/*
@@ -346,7 +345,7 @@ CFQ_CFQQ_FNS(deep);
346CFQ_CFQQ_FNS(wait_busy); 345CFQ_CFQQ_FNS(wait_busy);
347#undef CFQ_CFQQ_FNS 346#undef CFQ_CFQQ_FNS
348 347
349#ifdef CONFIG_DEBUG_CFQ_IOSCHED 348#ifdef CONFIG_CFQ_GROUP_IOSCHED
350#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 349#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
351 blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ 350 blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
352 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ 351 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
@@ -858,7 +857,7 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
858 if (!RB_EMPTY_NODE(&cfqg->rb_node)) 857 if (!RB_EMPTY_NODE(&cfqg->rb_node))
859 cfq_rb_erase(&cfqg->rb_node, st); 858 cfq_rb_erase(&cfqg->rb_node, st);
860 cfqg->saved_workload_slice = 0; 859 cfqg->saved_workload_slice = 0;
861 blkiocg_update_blkio_group_dequeue_stats(&cfqg->blkg, 1); 860 blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
862} 861}
863 862
864static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) 863static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
@@ -884,8 +883,7 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
884 slice_used = cfqq->allocated_slice; 883 slice_used = cfqq->allocated_slice;
885 } 884 }
886 885
887 cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u sect=%lu", slice_used, 886 cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u", slice_used);
888 cfqq->nr_sectors);
889 return slice_used; 887 return slice_used;
890} 888}
891 889
@@ -919,8 +917,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
919 917
920 cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, 918 cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
921 st->min_vdisktime); 919 st->min_vdisktime);
922 blkiocg_update_blkio_group_stats(&cfqg->blkg, used_sl, 920 blkiocg_update_timeslice_used(&cfqg->blkg, used_sl);
923 cfqq->nr_sectors); 921 blkiocg_set_start_empty_time(&cfqg->blkg);
924} 922}
925 923
926#ifdef CONFIG_CFQ_GROUP_IOSCHED 924#ifdef CONFIG_CFQ_GROUP_IOSCHED
@@ -961,7 +959,6 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
961 if (!cfqg) 959 if (!cfqg)
962 goto done; 960 goto done;
963 961
964 cfqg->weight = blkcg->weight;
965 for_each_cfqg_st(cfqg, i, j, st) 962 for_each_cfqg_st(cfqg, i, j, st)
966 *st = CFQ_RB_ROOT; 963 *st = CFQ_RB_ROOT;
967 RB_CLEAR_NODE(&cfqg->rb_node); 964 RB_CLEAR_NODE(&cfqg->rb_node);
@@ -978,6 +975,7 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
978 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); 975 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
979 blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, 976 blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
980 MKDEV(major, minor)); 977 MKDEV(major, minor));
978 cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
981 979
982 /* Add group on cfqd list */ 980 /* Add group on cfqd list */
983 hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); 981 hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
@@ -1004,6 +1002,12 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
1004 return cfqg; 1002 return cfqg;
1005} 1003}
1006 1004
1005static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
1006{
1007 atomic_inc(&cfqg->ref);
1008 return cfqg;
1009}
1010
1007static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) 1011static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
1008{ 1012{
1009 /* Currently, all async queues are mapped to root group */ 1013 /* Currently, all async queues are mapped to root group */
@@ -1087,6 +1091,12 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
1087{ 1091{
1088 return &cfqd->root_group; 1092 return &cfqd->root_group;
1089} 1093}
1094
1095static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
1096{
1097 return cfqg;
1098}
1099
1090static inline void 1100static inline void
1091cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { 1101cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
1092 cfqq->cfqg = cfqg; 1102 cfqq->cfqg = cfqg;
@@ -1389,7 +1399,12 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
1389{ 1399{
1390 elv_rb_del(&cfqq->sort_list, rq); 1400 elv_rb_del(&cfqq->sort_list, rq);
1391 cfqq->queued[rq_is_sync(rq)]--; 1401 cfqq->queued[rq_is_sync(rq)]--;
1402 blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(rq),
1403 rq_is_sync(rq));
1392 cfq_add_rq_rb(rq); 1404 cfq_add_rq_rb(rq);
1405 blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
1406 &cfqq->cfqd->serving_group->blkg, rq_data_dir(rq),
1407 rq_is_sync(rq));
1393} 1408}
1394 1409
1395static struct request * 1410static struct request *
@@ -1445,6 +1460,8 @@ static void cfq_remove_request(struct request *rq)
1445 cfq_del_rq_rb(rq); 1460 cfq_del_rq_rb(rq);
1446 1461
1447 cfqq->cfqd->rq_queued--; 1462 cfqq->cfqd->rq_queued--;
1463 blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(rq),
1464 rq_is_sync(rq));
1448 if (rq_is_meta(rq)) { 1465 if (rq_is_meta(rq)) {
1449 WARN_ON(!cfqq->meta_pending); 1466 WARN_ON(!cfqq->meta_pending);
1450 cfqq->meta_pending--; 1467 cfqq->meta_pending--;
@@ -1476,6 +1493,13 @@ static void cfq_merged_request(struct request_queue *q, struct request *req,
1476 } 1493 }
1477} 1494}
1478 1495
1496static void cfq_bio_merged(struct request_queue *q, struct request *req,
1497 struct bio *bio)
1498{
1499 blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg, bio_data_dir(bio),
1500 cfq_bio_sync(bio));
1501}
1502
1479static void 1503static void
1480cfq_merged_requests(struct request_queue *q, struct request *rq, 1504cfq_merged_requests(struct request_queue *q, struct request *rq,
1481 struct request *next) 1505 struct request *next)
@@ -1493,6 +1517,8 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
1493 if (cfqq->next_rq == next) 1517 if (cfqq->next_rq == next)
1494 cfqq->next_rq = rq; 1518 cfqq->next_rq = rq;
1495 cfq_remove_request(next); 1519 cfq_remove_request(next);
1520 blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(next),
1521 rq_is_sync(next));
1496} 1522}
1497 1523
1498static int cfq_allow_merge(struct request_queue *q, struct request *rq, 1524static int cfq_allow_merge(struct request_queue *q, struct request *rq,
@@ -1520,18 +1546,24 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq,
1520 return cfqq == RQ_CFQQ(rq); 1546 return cfqq == RQ_CFQQ(rq);
1521} 1547}
1522 1548
1549static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1550{
1551 del_timer(&cfqd->idle_slice_timer);
1552 blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg);
1553}
1554
1523static void __cfq_set_active_queue(struct cfq_data *cfqd, 1555static void __cfq_set_active_queue(struct cfq_data *cfqd,
1524 struct cfq_queue *cfqq) 1556 struct cfq_queue *cfqq)
1525{ 1557{
1526 if (cfqq) { 1558 if (cfqq) {
1527 cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d", 1559 cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
1528 cfqd->serving_prio, cfqd->serving_type); 1560 cfqd->serving_prio, cfqd->serving_type);
1561 blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg);
1529 cfqq->slice_start = 0; 1562 cfqq->slice_start = 0;
1530 cfqq->dispatch_start = jiffies; 1563 cfqq->dispatch_start = jiffies;
1531 cfqq->allocated_slice = 0; 1564 cfqq->allocated_slice = 0;
1532 cfqq->slice_end = 0; 1565 cfqq->slice_end = 0;
1533 cfqq->slice_dispatch = 0; 1566 cfqq->slice_dispatch = 0;
1534 cfqq->nr_sectors = 0;
1535 1567
1536 cfq_clear_cfqq_wait_request(cfqq); 1568 cfq_clear_cfqq_wait_request(cfqq);
1537 cfq_clear_cfqq_must_dispatch(cfqq); 1569 cfq_clear_cfqq_must_dispatch(cfqq);
@@ -1539,7 +1571,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
1539 cfq_clear_cfqq_fifo_expire(cfqq); 1571 cfq_clear_cfqq_fifo_expire(cfqq);
1540 cfq_mark_cfqq_slice_new(cfqq); 1572 cfq_mark_cfqq_slice_new(cfqq);
1541 1573
1542 del_timer(&cfqd->idle_slice_timer); 1574 cfq_del_timer(cfqd, cfqq);
1543 } 1575 }
1544 1576
1545 cfqd->active_queue = cfqq; 1577 cfqd->active_queue = cfqq;
@@ -1555,7 +1587,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1555 cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out); 1587 cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
1556 1588
1557 if (cfq_cfqq_wait_request(cfqq)) 1589 if (cfq_cfqq_wait_request(cfqq))
1558 del_timer(&cfqd->idle_slice_timer); 1590 cfq_del_timer(cfqd, cfqq);
1559 1591
1560 cfq_clear_cfqq_wait_request(cfqq); 1592 cfq_clear_cfqq_wait_request(cfqq);
1561 cfq_clear_cfqq_wait_busy(cfqq); 1593 cfq_clear_cfqq_wait_busy(cfqq);
@@ -1857,6 +1889,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
1857 sl = cfqd->cfq_slice_idle; 1889 sl = cfqd->cfq_slice_idle;
1858 1890
1859 mod_timer(&cfqd->idle_slice_timer, jiffies + sl); 1891 mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
1892 blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg);
1860 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); 1893 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl);
1861} 1894}
1862 1895
@@ -1876,7 +1909,8 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
1876 elv_dispatch_sort(q, rq); 1909 elv_dispatch_sort(q, rq);
1877 1910
1878 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; 1911 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
1879 cfqq->nr_sectors += blk_rq_sectors(rq); 1912 blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq),
1913 rq_data_dir(rq), rq_is_sync(rq));
1880} 1914}
1881 1915
1882/* 1916/*
@@ -3185,11 +3219,14 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3185 if (cfq_cfqq_wait_request(cfqq)) { 3219 if (cfq_cfqq_wait_request(cfqq)) {
3186 if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || 3220 if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
3187 cfqd->busy_queues > 1) { 3221 cfqd->busy_queues > 1) {
3188 del_timer(&cfqd->idle_slice_timer); 3222 cfq_del_timer(cfqd, cfqq);
3189 cfq_clear_cfqq_wait_request(cfqq); 3223 cfq_clear_cfqq_wait_request(cfqq);
3190 __blk_run_queue(cfqd->queue); 3224 __blk_run_queue(cfqd->queue);
3191 } else 3225 } else {
3226 blkiocg_update_idle_time_stats(
3227 &cfqq->cfqg->blkg);
3192 cfq_mark_cfqq_must_dispatch(cfqq); 3228 cfq_mark_cfqq_must_dispatch(cfqq);
3229 }
3193 } 3230 }
3194 } else if (cfq_should_preempt(cfqd, cfqq, rq)) { 3231 } else if (cfq_should_preempt(cfqd, cfqq, rq)) {
3195 /* 3232 /*
@@ -3214,7 +3251,9 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
3214 rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); 3251 rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
3215 list_add_tail(&rq->queuelist, &cfqq->fifo); 3252 list_add_tail(&rq->queuelist, &cfqq->fifo);
3216 cfq_add_rq_rb(rq); 3253 cfq_add_rq_rb(rq);
3217 3254 blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
3255 &cfqd->serving_group->blkg, rq_data_dir(rq),
3256 rq_is_sync(rq));
3218 cfq_rq_enqueued(cfqd, cfqq, rq); 3257 cfq_rq_enqueued(cfqd, cfqq, rq);
3219} 3258}
3220 3259
@@ -3300,6 +3339,9 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
3300 WARN_ON(!cfqq->dispatched); 3339 WARN_ON(!cfqq->dispatched);
3301 cfqd->rq_in_driver--; 3340 cfqd->rq_in_driver--;
3302 cfqq->dispatched--; 3341 cfqq->dispatched--;
3342 blkiocg_update_completion_stats(&cfqq->cfqg->blkg, rq_start_time_ns(rq),
3343 rq_io_start_time_ns(rq), rq_data_dir(rq),
3344 rq_is_sync(rq));
3303 3345
3304 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; 3346 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
3305 3347
@@ -3440,6 +3482,10 @@ static void cfq_put_request(struct request *rq)
3440 rq->elevator_private = NULL; 3482 rq->elevator_private = NULL;
3441 rq->elevator_private2 = NULL; 3483 rq->elevator_private2 = NULL;
3442 3484
3485 /* Put down rq reference on cfqg */
3486 cfq_put_cfqg(RQ_CFQG(rq));
3487 rq->elevator_private3 = NULL;
3488
3443 cfq_put_queue(cfqq); 3489 cfq_put_queue(cfqq);
3444 } 3490 }
3445} 3491}
@@ -3528,6 +3574,7 @@ new_queue:
3528 3574
3529 rq->elevator_private = cic; 3575 rq->elevator_private = cic;
3530 rq->elevator_private2 = cfqq; 3576 rq->elevator_private2 = cfqq;
3577 rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg);
3531 return 0; 3578 return 0;
3532 3579
3533queue_fail: 3580queue_fail:
@@ -3870,6 +3917,7 @@ static struct elevator_type iosched_cfq = {
3870 .elevator_merged_fn = cfq_merged_request, 3917 .elevator_merged_fn = cfq_merged_request,
3871 .elevator_merge_req_fn = cfq_merged_requests, 3918 .elevator_merge_req_fn = cfq_merged_requests,
3872 .elevator_allow_merge_fn = cfq_allow_merge, 3919 .elevator_allow_merge_fn = cfq_allow_merge,
3920 .elevator_bio_merged_fn = cfq_bio_merged,
3873 .elevator_dispatch_fn = cfq_dispatch_requests, 3921 .elevator_dispatch_fn = cfq_dispatch_requests,
3874 .elevator_add_req_fn = cfq_insert_request, 3922 .elevator_add_req_fn = cfq_insert_request,
3875 .elevator_activate_req_fn = cfq_activate_request, 3923 .elevator_activate_req_fn = cfq_activate_request,
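
The new elevator_private3 pointer lets each request pin its cfq_group: cfq_ref_get_cfqg() takes a reference when the request is set up, and cfq_put_request() drops it again via cfq_put_cfqg(), so the group cannot be freed while requests still point at it. The underlying get/put refcount idiom, reduced to standalone C with illustrative names:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct group {
	atomic_int ref;
	int weight;
};

static struct group *group_get(struct group *grp)
{
	atomic_fetch_add(&grp->ref, 1);	/* like atomic_inc(&cfqg->ref) */
	return grp;
}

static void group_put(struct group *grp)
{
	/* Free only when the last reference is dropped. */
	if (atomic_fetch_sub(&grp->ref, 1) == 1) {
		printf("last reference dropped, freeing group\n");
		free(grp);
	}
}

int main(void)
{
	struct group *grp = malloc(sizeof(*grp));

	atomic_init(&grp->ref, 1);	/* creator's reference */
	grp->weight = 500;

	group_get(grp);			/* a "request" pins the group */
	group_put(grp);			/* the request completes */
	group_put(grp);			/* the creator drops its reference */
	return 0;
}
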
diff --git a/block/elevator.c b/block/elevator.c
index 76e3702d5381..5e734592bb40 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -539,6 +539,15 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
539 q->last_merge = rq; 539 q->last_merge = rq;
540} 540}
541 541
542void elv_bio_merged(struct request_queue *q, struct request *rq,
543 struct bio *bio)
544{
545 struct elevator_queue *e = q->elevator;
546
547 if (e->ops->elevator_bio_merged_fn)
548 e->ops->elevator_bio_merged_fn(q, rq, bio);
549}
550
542void elv_requeue_request(struct request_queue *q, struct request *rq) 551void elv_requeue_request(struct request_queue *q, struct request *rq)
543{ 552{
544 /* 553 /*
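
elv_bio_merged() follows the usual elevator-hook shape: the block core checks whether the scheduler registered an elevator_bio_merged_fn and only then calls it, so schedulers that do not track per-group merge statistics need no stub. The optional-callback idiom on its own, with simplified signatures that are not the kernel's:

#include <stdio.h>

struct elevator_like_ops {
	/* Optional hook: schedulers that don't care leave it NULL. */
	void (*bio_merged)(int rq_id, int bio_id);
};

static void cfq_like_bio_merged(int rq_id, int bio_id)
{
	printf("account merge of bio %d into request %d\n", bio_id, rq_id);
}

static void core_bio_merged(const struct elevator_like_ops *ops,
			    int rq_id, int bio_id)
{
	if (ops->bio_merged)		/* call only if the hook is registered */
		ops->bio_merged(rq_id, bio_id);
}

int main(void)
{
	struct elevator_like_ops with_hook = { .bio_merged = cfq_like_bio_merged };
	struct elevator_like_ops without_hook = { 0 };

	core_bio_merged(&with_hook, 1, 7);
	core_bio_merged(&without_hook, 1, 7);	/* silently skipped */
	return 0;
}
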
diff --git a/block/genhd.c b/block/genhd.c
index d13ba76a169c..154b5f80b3ab 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -596,6 +596,7 @@ struct gendisk *get_gendisk(dev_t devt, int *partno)
596 596
597 return disk; 597 return disk;
598} 598}
599EXPORT_SYMBOL(get_gendisk);
599 600
600/** 601/**
601 * bdget_disk - do bdget() by gendisk and partition number 602 * bdget_disk - do bdget() by gendisk and partition number
diff --git a/block/ioctl.c b/block/ioctl.c
index 8905d2a2a717..e8eb679f2f9b 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -126,7 +126,7 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
126 if (start + len > (bdev->bd_inode->i_size >> 9)) 126 if (start + len > (bdev->bd_inode->i_size >> 9))
127 return -EINVAL; 127 return -EINVAL;
128 return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, 128 return blkdev_issue_discard(bdev, start, len, GFP_KERNEL,
129 DISCARD_FL_WAIT); 129 BLKDEV_IFL_WAIT);
130} 130}
131 131
132static int put_ushort(unsigned long arg, unsigned short val) 132static int put_ushort(unsigned long arg, unsigned short val)
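
blk_ioctl_discard() is what the BLKDISCARD ioctl lands on; the hunk above only renames the wait flag it passes down to blkdev_issue_discard(). A minimal userspace caller passing the {start, length} byte range the ioctl expects; the device path is a placeholder and the call destroys data, so point it only at a disposable test device that supports discard:

#include <fcntl.h>
#include <linux/fs.h>		/* BLKDISCARD */
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	/* Placeholder path: use only a disposable test device. */
	const char *dev = "/dev/XXX";
	uint64_t range[2] = { 0, 1024 * 1024 };	/* start, length in bytes */
	int fd = open(dev, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, BLKDISCARD, &range) < 0)
		perror("BLKDISCARD");
	else
		printf("discarded %llu bytes at offset %llu\n",
		       (unsigned long long)range[1],
		       (unsigned long long)range[0]);
	close(fd);
	return 0;
}
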
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index e5e86a781820..d6f1ae342b1d 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -2251,7 +2251,8 @@ static inline void drbd_md_flush(struct drbd_conf *mdev)
2251 if (test_bit(MD_NO_BARRIER, &mdev->flags)) 2251 if (test_bit(MD_NO_BARRIER, &mdev->flags))
2252 return; 2252 return;
2253 2253
2254 r = blkdev_issue_flush(mdev->ldev->md_bdev, NULL); 2254 r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL,
2255 BLKDEV_IFL_WAIT);
2255 if (r) { 2256 if (r) {
2256 set_bit(MD_NO_BARRIER, &mdev->flags); 2257 set_bit(MD_NO_BARRIER, &mdev->flags);
2257 dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r); 2258 dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r);
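
Beyond the new flush signature, drbd_md_flush() illustrates a degrade-once pattern: if a metadata flush ever fails, MD_NO_BARRIER is set and every later call returns early instead of failing again. The same remember-the-failure shape as a tiny standalone sketch (all names here are made up):

#include <stdbool.h>
#include <stdio.h>

struct dev_state {
	bool no_flush;		/* set after the first failed flush */
};

/* Stand-in flush that always fails, as if barriers were unsupported. */
static int issue_flush(void)
{
	return -1;
}

static void md_flush(struct dev_state *dev)
{
	if (dev->no_flush)
		return;			/* learned earlier, don't retry */

	if (issue_flush() != 0) {
		dev->no_flush = true;	/* degrade once, remember it */
		fprintf(stderr, "flush failed, disabling future flushes\n");
	}
}

int main(void)
{
	struct dev_state dev = { .no_flush = false };

	md_flush(&dev);		/* fails and flips the flag */
	md_flush(&dev);		/* silently skipped from now on */
	return 0;
}
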
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 3f096e7959b4..c786023001d2 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -946,7 +946,8 @@ static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct d
946 int rv; 946 int rv;
947 947
948 if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) { 948 if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
949 rv = blkdev_issue_flush(mdev->ldev->backing_bdev, NULL); 949 rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL,
950 NULL, BLKDEV_IFL_WAIT);
950 if (rv) { 951 if (rv) {
951 dev_err(DEV, "local disk flush failed with status %d\n", rv); 952 dev_err(DEV, "local disk flush failed with status %d\n", rv);
952 /* would rather check on EOPNOTSUPP, but that is not reliable. 953 /* would rather check on EOPNOTSUPP, but that is not reliable.
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 6dcee88c2e5d..55dcb7884f4d 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -417,7 +417,7 @@ int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync)
417 */ 417 */
418 mutex_unlock(&bd_inode->i_mutex); 418 mutex_unlock(&bd_inode->i_mutex);
419 419
420 error = blkdev_issue_flush(bdev, NULL); 420 error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);
421 if (error == -EOPNOTSUPP) 421 if (error == -EOPNOTSUPP)
422 error = 0; 422 error = 0;
423 423
@@ -668,41 +668,209 @@ void bd_forget(struct inode *inode)
668 iput(bdev->bd_inode); 668 iput(bdev->bd_inode);
669} 669}
670 670
671int bd_claim(struct block_device *bdev, void *holder) 671/**
672 * bd_may_claim - test whether a block device can be claimed
673 * @bdev: block device of interest
674 * @whole: whole block device containing @bdev, may equal @bdev
675 * @holder: holder trying to claim @bdev
676 *
677 * Test whether @bdev can be claimed by @holder.
678 *
679 * CONTEXT:
680 * spin_lock(&bdev_lock).
681 *
682 * RETURNS:
683 * %true if @bdev can be claimed, %false otherwise.
684 */
685static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
686 void *holder)
672{ 687{
673 int res;
674 spin_lock(&bdev_lock);
675
676 /* first decide result */
677 if (bdev->bd_holder == holder) 688 if (bdev->bd_holder == holder)
678 res = 0; /* already a holder */ 689 return true; /* already a holder */
679 else if (bdev->bd_holder != NULL) 690 else if (bdev->bd_holder != NULL)
680 res = -EBUSY; /* held by someone else */ 691 return false; /* held by someone else */
681 else if (bdev->bd_contains == bdev) 692 else if (bdev->bd_contains == bdev)
682 res = 0; /* is a whole device which isn't held */ 693 return true; /* is a whole device which isn't held */
683 694
684 else if (bdev->bd_contains->bd_holder == bd_claim) 695 else if (whole->bd_holder == bd_claim)
685 res = 0; /* is a partition of a device that is being partitioned */ 696 return true; /* is a partition of a device that is being partitioned */
686 else if (bdev->bd_contains->bd_holder != NULL) 697 else if (whole->bd_holder != NULL)
687 res = -EBUSY; /* is a partition of a held device */ 698 return false; /* is a partition of a held device */
688 else 699 else
689 res = 0; /* is a partition of an un-held device */ 700 return true; /* is a partition of an un-held device */
701}
702
703/**
704 * bd_prepare_to_claim - prepare to claim a block device
705 * @bdev: block device of interest
706 * @whole: the whole device containing @bdev, may equal @bdev
707 * @holder: holder trying to claim @bdev
708 *
709 * Prepare to claim @bdev. This function fails if @bdev is already
710 * claimed by another holder and waits if another claiming is in
711 * progress. This function doesn't actually claim. On successful
712 * return, the caller has ownership of bd_claiming and bd_holder[s].
713 *
714 * CONTEXT:
715 * spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab
716 * it multiple times.
717 *
718 * RETURNS:
719 * 0 if @bdev can be claimed, -EBUSY otherwise.
720 */
721static int bd_prepare_to_claim(struct block_device *bdev,
722 struct block_device *whole, void *holder)
723{
724retry:
725 /* if someone else claimed, fail */
726 if (!bd_may_claim(bdev, whole, holder))
727 return -EBUSY;
728
729 /* if someone else is claiming, wait for it to finish */
730 if (whole->bd_claiming && whole->bd_claiming != holder) {
731 wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
732 DEFINE_WAIT(wait);
733
734 prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
735 spin_unlock(&bdev_lock);
736 schedule();
737 finish_wait(wq, &wait);
738 spin_lock(&bdev_lock);
739 goto retry;
740 }
741
742 /* yay, all mine */
743 return 0;
744}
745
746/**
747 * bd_start_claiming - start claiming a block device
748 * @bdev: block device of interest
749 * @holder: holder trying to claim @bdev
750 *
751 * @bdev is about to be opened exclusively. Check @bdev can be opened
752 * exclusively and mark that an exclusive open is in progress. Each
753 * successful call to this function must be matched with a call to
754 * either bd_claim() or bd_abort_claiming(). If this function
755 * succeeds, the matching bd_claim() is guaranteed to succeed.
756 *
757 * CONTEXT:
758 * Might sleep.
759 *
760 * RETURNS:
761 * Pointer to the block device containing @bdev on success, ERR_PTR()
762 * value on failure.
763 */
764static struct block_device *bd_start_claiming(struct block_device *bdev,
765 void *holder)
766{
767 struct gendisk *disk;
768 struct block_device *whole;
769 int partno, err;
770
771 might_sleep();
772
773 /*
774 * @bdev might not have been initialized properly yet, look up
775 * and grab the outer block device the hard way.
776 */
777 disk = get_gendisk(bdev->bd_dev, &partno);
778 if (!disk)
779 return ERR_PTR(-ENXIO);
780
781 whole = bdget_disk(disk, 0);
782 put_disk(disk);
783 if (!whole)
784 return ERR_PTR(-ENOMEM);
785
786 /* prepare to claim, if successful, mark claiming in progress */
787 spin_lock(&bdev_lock);
788
789 err = bd_prepare_to_claim(bdev, whole, holder);
790 if (err == 0) {
791 whole->bd_claiming = holder;
792 spin_unlock(&bdev_lock);
793 return whole;
794 } else {
795 spin_unlock(&bdev_lock);
796 bdput(whole);
797 return ERR_PTR(err);
798 }
799}
690 800
691 /* now impose change */ 801/* releases bdev_lock */
692 if (res==0) { 802static void __bd_abort_claiming(struct block_device *whole, void *holder)
803{
804 BUG_ON(whole->bd_claiming != holder);
805 whole->bd_claiming = NULL;
806 wake_up_bit(&whole->bd_claiming, 0);
807
808 spin_unlock(&bdev_lock);
809 bdput(whole);
810}
811
812/**
813 * bd_abort_claiming - abort claiming a block device
814 * @whole: whole block device returned by bd_start_claiming()
815 * @holder: holder trying to claim @bdev
816 *
817 * Abort a claiming block started by bd_start_claiming(). Note that
818 * @whole is not the block device to be claimed but the whole device
819 * returned by bd_start_claiming().
820 *
821 * CONTEXT:
822 * Grabs and releases bdev_lock.
823 */
824static void bd_abort_claiming(struct block_device *whole, void *holder)
825{
826 spin_lock(&bdev_lock);
827 __bd_abort_claiming(whole, holder); /* releases bdev_lock */
828}
829
830/**
831 * bd_claim - claim a block device
832 * @bdev: block device to claim
833 * @holder: holder trying to claim @bdev
834 *
835 * Try to claim @bdev which must have been opened successfully. This
836 * function may be called with or without preceding
837 * bd_start_claiming(). In the former case, this function is always
838 * successful and terminates the claiming block.
839 *
840 * CONTEXT:
841 * Might sleep.
842 *
843 * RETURNS:
844 * 0 if successful, -EBUSY if @bdev is already claimed.
845 */
846int bd_claim(struct block_device *bdev, void *holder)
847{
848 struct block_device *whole = bdev->bd_contains;
849 int res;
850
851 might_sleep();
852
853 spin_lock(&bdev_lock);
854
855 res = bd_prepare_to_claim(bdev, whole, holder);
856 if (res == 0) {
693 /* note that for a whole device bd_holders 857 /* note that for a whole device bd_holders
694 * will be incremented twice, and bd_holder will 858 * will be incremented twice, and bd_holder will
695 * be set to bd_claim before being set to holder 859 * be set to bd_claim before being set to holder
696 */ 860 */
697 bdev->bd_contains->bd_holders ++; 861 whole->bd_holders++;
698 bdev->bd_contains->bd_holder = bd_claim; 862 whole->bd_holder = bd_claim;
699 bdev->bd_holders++; 863 bdev->bd_holders++;
700 bdev->bd_holder = holder; 864 bdev->bd_holder = holder;
701 } 865 }
702 spin_unlock(&bdev_lock); 866
867 if (whole->bd_claiming)
868 __bd_abort_claiming(whole, holder); /* releases bdev_lock */
869 else
870 spin_unlock(&bdev_lock);
871
703 return res; 872 return res;
704} 873}
705
706EXPORT_SYMBOL(bd_claim); 874EXPORT_SYMBOL(bd_claim);
707 875
708void bd_release(struct block_device *bdev) 876void bd_release(struct block_device *bdev)
@@ -1316,6 +1484,7 @@ EXPORT_SYMBOL(blkdev_get);
1316 1484
1317static int blkdev_open(struct inode * inode, struct file * filp) 1485static int blkdev_open(struct inode * inode, struct file * filp)
1318{ 1486{
1487 struct block_device *whole = NULL;
1319 struct block_device *bdev; 1488 struct block_device *bdev;
1320 int res; 1489 int res;
1321 1490
@@ -1338,22 +1507,25 @@ static int blkdev_open(struct inode * inode, struct file * filp)
1338 if (bdev == NULL) 1507 if (bdev == NULL)
1339 return -ENOMEM; 1508 return -ENOMEM;
1340 1509
1510 if (filp->f_mode & FMODE_EXCL) {
1511 whole = bd_start_claiming(bdev, filp);
1512 if (IS_ERR(whole)) {
1513 bdput(bdev);
1514 return PTR_ERR(whole);
1515 }
1516 }
1517
1341 filp->f_mapping = bdev->bd_inode->i_mapping; 1518 filp->f_mapping = bdev->bd_inode->i_mapping;
1342 1519
1343 res = blkdev_get(bdev, filp->f_mode); 1520 res = blkdev_get(bdev, filp->f_mode);
1344 if (res)
1345 return res;
1346 1521
1347 if (filp->f_mode & FMODE_EXCL) { 1522 if (whole) {
1348 res = bd_claim(bdev, filp); 1523 if (res == 0)
1349 if (res) 1524 BUG_ON(bd_claim(bdev, filp) != 0);
1350 goto out_blkdev_put; 1525 else
1526 bd_abort_claiming(whole, filp);
1351 } 1527 }
1352 1528
1353 return 0;
1354
1355 out_blkdev_put:
1356 blkdev_put(bdev, filp->f_mode);
1357 return res; 1529 return res;
1358} 1530}
1359 1531
@@ -1564,27 +1736,34 @@ EXPORT_SYMBOL(lookup_bdev);
1564 */ 1736 */
1565struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder) 1737struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder)
1566{ 1738{
1567 struct block_device *bdev; 1739 struct block_device *bdev, *whole;
1568 int error = 0; 1740 int error;
1569 1741
1570 bdev = lookup_bdev(path); 1742 bdev = lookup_bdev(path);
1571 if (IS_ERR(bdev)) 1743 if (IS_ERR(bdev))
1572 return bdev; 1744 return bdev;
1573 1745
1746 whole = bd_start_claiming(bdev, holder);
1747 if (IS_ERR(whole)) {
1748 bdput(bdev);
1749 return whole;
1750 }
1751
1574 error = blkdev_get(bdev, mode); 1752 error = blkdev_get(bdev, mode);
1575 if (error) 1753 if (error)
1576 return ERR_PTR(error); 1754 goto out_abort_claiming;
1755
1577 error = -EACCES; 1756 error = -EACCES;
1578 if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) 1757 if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
1579 goto blkdev_put; 1758 goto out_blkdev_put;
1580 error = bd_claim(bdev, holder);
1581 if (error)
1582 goto blkdev_put;
1583 1759
1760 BUG_ON(bd_claim(bdev, holder) != 0);
1584 return bdev; 1761 return bdev;
1585 1762
1586blkdev_put: 1763out_blkdev_put:
1587 blkdev_put(bdev, mode); 1764 blkdev_put(bdev, mode);
1765out_abort_claiming:
1766 bd_abort_claiming(whole, holder);
1588 return ERR_PTR(error); 1767 return ERR_PTR(error);
1589} 1768}
1590 1769
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b34d32fdaaec..c6a4f459ad76 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1589,7 +1589,7 @@ static void btrfs_issue_discard(struct block_device *bdev,
1589 u64 start, u64 len) 1589 u64 start, u64 len)
1590{ 1590{
1591 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 1591 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
1592 DISCARD_FL_BARRIER); 1592 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
1593} 1593}
1594 1594
1595static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, 1595static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index 8209f266e9ad..9492f6003ef9 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -91,7 +91,8 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
91 * storage 91 * storage
92 */ 92 */
93 if (test_opt(inode->i_sb, BARRIER)) 93 if (test_opt(inode->i_sb, BARRIER))
94 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 94 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
95 BLKDEV_IFL_WAIT);
95out: 96out:
96 return ret; 97 return ret;
97} 98}
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 0d0c3239c1cd..ef3d980e67cb 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -100,9 +100,11 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
100 if (ext4_should_writeback_data(inode) && 100 if (ext4_should_writeback_data(inode) &&
101 (journal->j_fs_dev != journal->j_dev) && 101 (journal->j_fs_dev != journal->j_dev) &&
102 (journal->j_flags & JBD2_BARRIER)) 102 (journal->j_flags & JBD2_BARRIER))
103 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 103 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
104 NULL, BLKDEV_IFL_WAIT);
104 jbd2_log_wait_commit(journal, commit_tid); 105 jbd2_log_wait_commit(journal, commit_tid);
105 } else if (journal->j_flags & JBD2_BARRIER) 106 } else if (journal->j_flags & JBD2_BARRIER)
106 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 107 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
108 BLKDEV_IFL_WAIT);
107 return ret; 109 return ret;
108} 110}
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 503b842f3ba2..bf011dc63471 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -854,7 +854,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
854 if ((start + nr_sects) != blk) { 854 if ((start + nr_sects) != blk) {
855 rv = blkdev_issue_discard(bdev, start, 855 rv = blkdev_issue_discard(bdev, start,
856 nr_sects, GFP_NOFS, 856 nr_sects, GFP_NOFS,
857 DISCARD_FL_BARRIER); 857 BLKDEV_IFL_WAIT |
858 BLKDEV_IFL_BARRIER);
858 if (rv) 859 if (rv)
859 goto fail; 860 goto fail;
860 nr_sects = 0; 861 nr_sects = 0;
@@ -869,7 +870,7 @@ start_new_extent:
869 } 870 }
870 if (nr_sects) { 871 if (nr_sects) {
871 rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 872 rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
872 DISCARD_FL_BARRIER); 873 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
873 if (rv) 874 if (rv)
874 goto fail; 875 goto fail;
875 } 876 }
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 30beb11ef928..076d1cc44f95 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -530,7 +530,8 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
530 */ 530 */
531 if ((journal->j_fs_dev != journal->j_dev) && 531 if ((journal->j_fs_dev != journal->j_dev) &&
532 (journal->j_flags & JBD2_BARRIER)) 532 (journal->j_flags & JBD2_BARRIER))
533 blkdev_issue_flush(journal->j_fs_dev, NULL); 533 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
534 BLKDEV_IFL_WAIT);
534 if (!(journal->j_flags & JBD2_ABORT)) 535 if (!(journal->j_flags & JBD2_ABORT))
535 jbd2_journal_update_superblock(journal, 1); 536 jbd2_journal_update_superblock(journal, 1);
536 return 0; 537 return 0;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 671da7fb7ffd..75716d3d2be0 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -717,7 +717,8 @@ start_journal_io:
717 if (commit_transaction->t_flushed_data_blocks && 717 if (commit_transaction->t_flushed_data_blocks &&
718 (journal->j_fs_dev != journal->j_dev) && 718 (journal->j_fs_dev != journal->j_dev) &&
719 (journal->j_flags & JBD2_BARRIER)) 719 (journal->j_flags & JBD2_BARRIER))
720 blkdev_issue_flush(journal->j_fs_dev, NULL); 720 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
721 BLKDEV_IFL_WAIT);
721 722
722 /* Done it all: now write the commit record asynchronously. */ 723 /* Done it all: now write the commit record asynchronously. */
723 if (JBD2_HAS_INCOMPAT_FEATURE(journal, 724 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
@@ -727,7 +728,8 @@ start_journal_io:
727 if (err) 728 if (err)
728 __jbd2_journal_abort_hard(journal); 729 __jbd2_journal_abort_hard(journal);
729 if (journal->j_flags & JBD2_BARRIER) 730 if (journal->j_flags & JBD2_BARRIER)
730 blkdev_issue_flush(journal->j_dev, NULL); 731 blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
732 BLKDEV_IFL_WAIT);
731 } 733 }
732 734
733 err = journal_finish_inode_data_buffers(journal, commit_transaction); 735 err = journal_finish_inode_data_buffers(journal, commit_transaction);
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 33871f7e4f01..7ffcf2b8b1f4 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -670,7 +670,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
670 start * sects_per_block, 670 start * sects_per_block,
671 nblocks * sects_per_block, 671 nblocks * sects_per_block,
672 GFP_NOFS, 672 GFP_NOFS,
673 DISCARD_FL_BARRIER); 673 BLKDEV_IFL_BARRIER);
674 if (ret < 0) 674 if (ret < 0)
675 return ret; 675 return ret;
676 nblocks = 0; 676 nblocks = 0;
@@ -680,7 +680,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
680 ret = blkdev_issue_discard(nilfs->ns_bdev, 680 ret = blkdev_issue_discard(nilfs->ns_bdev,
681 start * sects_per_block, 681 start * sects_per_block,
682 nblocks * sects_per_block, 682 nblocks * sects_per_block,
683 GFP_NOFS, DISCARD_FL_BARRIER); 683 GFP_NOFS, BLKDEV_IFL_BARRIER);
684 return ret; 684 return ret;
685} 685}
686 686
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 1d9c12714c5c..9977df9f3a54 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -147,7 +147,8 @@ static int reiserfs_sync_file(struct file *filp,
147 barrier_done = reiserfs_commit_for_inode(inode); 147 barrier_done = reiserfs_commit_for_inode(inode);
148 reiserfs_write_unlock(inode->i_sb); 148 reiserfs_write_unlock(inode->i_sb);
149 if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) 149 if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
150 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 150 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
151 BLKDEV_IFL_WAIT);
151 if (barrier_done < 0) 152 if (barrier_done < 0)
152 return barrier_done; 153 return barrier_done;
153 return (err < 0) ? -EIO : 0; 154 return (err < 0) ? -EIO : 0;
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 52e06b487ced..2b177c778ba7 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -725,7 +725,8 @@ void
725xfs_blkdev_issue_flush( 725xfs_blkdev_issue_flush(
726 xfs_buftarg_t *buftarg) 726 xfs_buftarg_t *buftarg)
727{ 727{
728 blkdev_issue_flush(buftarg->bt_bdev, NULL); 728 blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL,
729 BLKDEV_IFL_WAIT);
729} 730}
730 731
731STATIC void 732STATIC void
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index bd0e3c6f323f..7534979d83bd 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -14,6 +14,7 @@
14#include <linux/kernel.h> 14#include <linux/kernel.h>
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/timer.h>
17#include <linux/writeback.h> 18#include <linux/writeback.h>
18#include <asm/atomic.h> 19#include <asm/atomic.h>
19 20
@@ -88,6 +89,8 @@ struct backing_dev_info {
88 89
89 struct device *dev; 90 struct device *dev;
90 91
92 struct timer_list laptop_mode_wb_timer;
93
91#ifdef CONFIG_DEBUG_FS 94#ifdef CONFIG_DEBUG_FS
92 struct dentry *debug_dir; 95 struct dentry *debug_dir;
93 struct dentry *debug_stats; 96 struct dentry *debug_stats;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6690e8bae7bb..3ac2bd2fc485 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -186,15 +186,19 @@ struct request {
186 }; 186 };
187 187
188 /* 188 /*
189 * two pointers are available for the IO schedulers, if they need 189 * Three pointers are available for the IO schedulers, if they need
190 * more they have to dynamically allocate it. 190 * more they have to dynamically allocate it.
191 */ 191 */
192 void *elevator_private; 192 void *elevator_private;
193 void *elevator_private2; 193 void *elevator_private2;
194 void *elevator_private3;
194 195
195 struct gendisk *rq_disk; 196 struct gendisk *rq_disk;
196 unsigned long start_time; 197 unsigned long start_time;
197 198#ifdef CONFIG_BLK_CGROUP
199 unsigned long long start_time_ns;
200 unsigned long long io_start_time_ns; /* when passed to hardware */
201#endif
198 /* Number of scatter-gather DMA addr+len pairs after 202 /* Number of scatter-gather DMA addr+len pairs after
199 * physical address coalescing is performed. 203 * physical address coalescing is performed.
200 */ 204 */
@@ -994,20 +998,25 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
994 return NULL; 998 return NULL;
995 return bqt->tag_index[tag]; 999 return bqt->tag_index[tag];
996} 1000}
997 1001enum{
998extern int blkdev_issue_flush(struct block_device *, sector_t *); 1002 BLKDEV_WAIT, /* wait for completion */
999#define DISCARD_FL_WAIT 0x01 /* wait for completion */ 1003 BLKDEV_BARRIER, /*issue request with barrier */
1000#define DISCARD_FL_BARRIER 0x02 /* issue DISCARD_BARRIER request */ 1004};
1001extern int blkdev_issue_discard(struct block_device *, sector_t sector, 1005#define BLKDEV_IFL_WAIT (1 << BLKDEV_WAIT)
1002 sector_t nr_sects, gfp_t, int flags); 1006#define BLKDEV_IFL_BARRIER (1 << BLKDEV_BARRIER)
1003 1007extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *,
1008 unsigned long);
1009extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
1010 sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
1011extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
1012 sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
1004static inline int sb_issue_discard(struct super_block *sb, 1013static inline int sb_issue_discard(struct super_block *sb,
1005 sector_t block, sector_t nr_blocks) 1014 sector_t block, sector_t nr_blocks)
1006{ 1015{
1007 block <<= (sb->s_blocksize_bits - 9); 1016 block <<= (sb->s_blocksize_bits - 9);
1008 nr_blocks <<= (sb->s_blocksize_bits - 9); 1017 nr_blocks <<= (sb->s_blocksize_bits - 9);
1009 return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_KERNEL, 1018 return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_KERNEL,
1010 DISCARD_FL_BARRIER); 1019 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
1011} 1020}
1012 1021
1013extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm); 1022extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
@@ -1196,6 +1205,39 @@ static inline void put_dev_sector(Sector p)
1196struct work_struct; 1205struct work_struct;
1197int kblockd_schedule_work(struct request_queue *q, struct work_struct *work); 1206int kblockd_schedule_work(struct request_queue *q, struct work_struct *work);
1198 1207
1208#ifdef CONFIG_BLK_CGROUP
1209static inline void set_start_time_ns(struct request *req)
1210{
1211 req->start_time_ns = sched_clock();
1212}
1213
1214static inline void set_io_start_time_ns(struct request *req)
1215{
1216 req->io_start_time_ns = sched_clock();
1217}
1218
1219static inline uint64_t rq_start_time_ns(struct request *req)
1220{
1221 return req->start_time_ns;
1222}
1223
1224static inline uint64_t rq_io_start_time_ns(struct request *req)
1225{
1226 return req->io_start_time_ns;
1227}
1228#else
1229static inline void set_start_time_ns(struct request *req) {}
1230static inline void set_io_start_time_ns(struct request *req) {}
1231static inline uint64_t rq_start_time_ns(struct request *req)
1232{
1233 return 0;
1234}
1235static inline uint64_t rq_io_start_time_ns(struct request *req)
1236{
1237 return 0;
1238}
1239#endif
1240
1199#define MODULE_ALIAS_BLOCKDEV(major,minor) \ 1241#define MODULE_ALIAS_BLOCKDEV(major,minor) \
1200 MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor)) 1242 MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor))
1201#define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \ 1243#define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \
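
blkdev.h only grows the two timestamp fields when CONFIG_BLK_CGROUP is set; otherwise set_start_time_ns() and friends compile to empty inline stubs, so blk-core.c can call them unconditionally with no #ifdefs of its own. The compile-time stub pattern reduced to a standalone example, with FEATURE_TIMING standing in for the config option:

#include <stdint.h>
#include <stdio.h>

#define FEATURE_TIMING 1	/* flip to 0 to build the stub variants */

struct request {
	int id;
#if FEATURE_TIMING
	uint64_t start_time_ns;
#endif
};

#if FEATURE_TIMING
static inline void set_start_time_ns(struct request *rq, uint64_t now)
{
	rq->start_time_ns = now;
}
static inline uint64_t rq_start_time_ns(const struct request *rq)
{
	return rq->start_time_ns;
}
#else
/* Stubs: callers stay #ifdef-free and the compiler discards the calls. */
static inline void set_start_time_ns(struct request *rq, uint64_t now)
{
	(void)rq;
	(void)now;
}
static inline uint64_t rq_start_time_ns(const struct request *rq)
{
	(void)rq;
	return 0;
}
#endif

int main(void)
{
	struct request rq = { .id = 1 };

	set_start_time_ns(&rq, 12345);
	printf("request %d start_time_ns=%llu\n", rq.id,
	       (unsigned long long)rq_start_time_ns(&rq));
	return 0;
}
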
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 1cb3372e65d8..2c958f4fce1e 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -14,6 +14,9 @@ typedef void (elevator_merged_fn) (struct request_queue *, struct request *, int
14 14
15typedef int (elevator_allow_merge_fn) (struct request_queue *, struct request *, struct bio *); 15typedef int (elevator_allow_merge_fn) (struct request_queue *, struct request *, struct bio *);
16 16
17typedef void (elevator_bio_merged_fn) (struct request_queue *,
18 struct request *, struct bio *);
19
17typedef int (elevator_dispatch_fn) (struct request_queue *, int); 20typedef int (elevator_dispatch_fn) (struct request_queue *, int);
18 21
19typedef void (elevator_add_req_fn) (struct request_queue *, struct request *); 22typedef void (elevator_add_req_fn) (struct request_queue *, struct request *);
@@ -36,6 +39,7 @@ struct elevator_ops
36 elevator_merged_fn *elevator_merged_fn; 39 elevator_merged_fn *elevator_merged_fn;
37 elevator_merge_req_fn *elevator_merge_req_fn; 40 elevator_merge_req_fn *elevator_merge_req_fn;
38 elevator_allow_merge_fn *elevator_allow_merge_fn; 41 elevator_allow_merge_fn *elevator_allow_merge_fn;
42 elevator_bio_merged_fn *elevator_bio_merged_fn;
39 43
40 elevator_dispatch_fn *elevator_dispatch_fn; 44 elevator_dispatch_fn *elevator_dispatch_fn;
41 elevator_add_req_fn *elevator_add_req_fn; 45 elevator_add_req_fn *elevator_add_req_fn;
@@ -103,6 +107,8 @@ extern int elv_merge(struct request_queue *, struct request **, struct bio *);
103extern void elv_merge_requests(struct request_queue *, struct request *, 107extern void elv_merge_requests(struct request_queue *, struct request *,
104 struct request *); 108 struct request *);
105extern void elv_merged_request(struct request_queue *, struct request *, int); 109extern void elv_merged_request(struct request_queue *, struct request *, int);
110extern void elv_bio_merged(struct request_queue *q, struct request *,
111 struct bio *);
106extern void elv_requeue_request(struct request_queue *, struct request *); 112extern void elv_requeue_request(struct request_queue *, struct request *);
107extern int elv_queue_empty(struct request_queue *); 113extern int elv_queue_empty(struct request_queue *);
108extern struct request *elv_former_request(struct request_queue *, struct request *); 114extern struct request *elv_former_request(struct request_queue *, struct request *);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 44f35aea2f1f..f30970c97acf 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -651,6 +651,7 @@ struct block_device {
651 int bd_openers; 651 int bd_openers;
652 struct mutex bd_mutex; /* open/close mutex */ 652 struct mutex bd_mutex; /* open/close mutex */
653 struct list_head bd_inodes; 653 struct list_head bd_inodes;
654 void * bd_claiming;
654 void * bd_holder; 655 void * bd_holder;
655 int bd_holders; 656 int bd_holders;
656#ifdef CONFIG_SYSFS 657#ifdef CONFIG_SYSFS
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 36520ded3e06..eb38a2c645f6 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -96,8 +96,10 @@ static inline void inode_sync_wait(struct inode *inode)
96/* 96/*
97 * mm/page-writeback.c 97 * mm/page-writeback.c
98 */ 98 */
99void laptop_io_completion(void); 99void laptop_io_completion(struct backing_dev_info *info);
100void laptop_sync_completion(void); 100void laptop_sync_completion(void);
101void laptop_mode_sync(struct work_struct *work);
102void laptop_mode_timer_fn(unsigned long data);
101void throttle_vm_writeout(gfp_t gfp_mask); 103void throttle_vm_writeout(gfp_t gfp_mask);
102 104
103/* These are exported to sysctl. */ 105/* These are exported to sysctl. */
diff --git a/init/Kconfig b/init/Kconfig
index eb77e8ccde1c..087c14f3c595 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -612,6 +612,33 @@ config RT_GROUP_SCHED
 
 endif #CGROUP_SCHED
 
+config BLK_CGROUP
+	tristate "Block IO controller"
+	depends on CGROUPS && BLOCK
+	default n
+	---help---
+	Generic block IO controller cgroup interface. This is the common
+	cgroup interface which should be used by various IO controlling
+	policies.
+
+	Currently, CFQ IO scheduler uses it to recognize task groups and
+	control disk bandwidth allocation (proportional time slice allocation)
+	to such task groups.
+
+	This option only enables generic Block IO controller infrastructure.
+	One needs to also enable actual IO controlling logic in CFQ for it
+	to take effect. (CONFIG_CFQ_GROUP_IOSCHED=y).
+
+	See Documentation/cgroups/blkio-controller.txt for more information.
+
+config DEBUG_BLK_CGROUP
+	bool "Enable Block IO controller debugging"
+	depends on BLK_CGROUP
+	default n
+	---help---
+	Enable some debugging help. Currently it exports additional stat
+	files in a cgroup which can be useful for debugging.
+
 endif # CGROUPS
 
 config MM_OWNER
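
Because BLK_CGROUP is tristate and only provides infrastructure, consumers conventionally compile their group-aware paths in only when the controller is available (built-in or modular) and fall back to stubs otherwise. A hedged sketch of that guard pattern - example_account_group_io is an illustrative name, not an interface added by this patch:

#include <linux/types.h>

#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
/* real accounting, built only when the block IO controller is available */
static void example_account_group_io(u64 bytes)
{
	/* per-cgroup accounting would go here */
}
#else
/* controller disabled: keep callers compiling with a no-op stub */
static inline void example_account_group_io(u64 bytes) { }
#endif
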
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 5b496132c28a..906a0f718cb3 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -41,6 +41,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
 	return (unsigned long long)(jiffies - INITIAL_JIFFIES)
 					* (NSEC_PER_SEC / HZ);
 }
+EXPORT_SYMBOL_GPL(sched_clock);
 
 static __read_mostly int sched_clock_running;
 
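
This one-line export matters because the block IO controller can now be built as a module: its time-based statistics sample sched_clock(), which previously was not visible to modular code. A trivial, hypothetical module-side use:

#include <linux/sched.h>	/* sched_clock() */
#include <linux/types.h>

/* hypothetical helper: elapsed nanoseconds since @start_ns was sampled */
static u64 example_elapsed_ns(u64 start_ns)
{
	return sched_clock() - start_ns;
}
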
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0b19943ecf8b..d0f2b3765f8d 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -683,10 +683,6 @@ void throttle_vm_writeout(gfp_t gfp_mask)
 	}
 }
 
-static void laptop_timer_fn(unsigned long unused);
-
-static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
-
 /*
  * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
  */
@@ -697,21 +693,19 @@ int dirty_writeback_centisecs_handler(ctl_table *table, int write,
 	return 0;
 }
 
-static void do_laptop_sync(struct work_struct *work)
+void laptop_mode_timer_fn(unsigned long data)
 {
-	wakeup_flusher_threads(0);
-	kfree(work);
-}
+	struct request_queue *q = (struct request_queue *)data;
+	int nr_pages = global_page_state(NR_FILE_DIRTY) +
+		global_page_state(NR_UNSTABLE_NFS);
 
-static void laptop_timer_fn(unsigned long unused)
-{
-	struct work_struct *work;
+	/*
+	 * We want to write everything out, not just down to the dirty
+	 * threshold
+	 */
 
-	work = kmalloc(sizeof(*work), GFP_ATOMIC);
-	if (work) {
-		INIT_WORK(work, do_laptop_sync);
-		schedule_work(work);
-	}
+	if (bdi_has_dirty_io(&q->backing_dev_info))
+		bdi_start_writeback(&q->backing_dev_info, NULL, nr_pages);
 }
 
 /*
@@ -719,9 +713,9 @@ static void laptop_timer_fn(unsigned long unused)
  * of all dirty data a few seconds from now. If the flush is already scheduled
  * then push it back - the user is still using the disk.
  */
-void laptop_io_completion(void)
+void laptop_io_completion(struct backing_dev_info *info)
 {
-	mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode);
+	mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode);
 }
 
 /*
@@ -731,7 +725,14 @@ void laptop_io_completion(void)
  */
 void laptop_sync_completion(void)
 {
-	del_timer(&laptop_mode_wb_timer);
+	struct backing_dev_info *bdi;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
+		del_timer(&bdi->laptop_mode_wb_timer);
+
+	rcu_read_unlock();
 }
 
 /*
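
For completeness, the producer side of this per-device timer is the request completion path: when laptop_mode is set and a filesystem request finishes, the block core re-arms the timer of that request's own backing_dev_info. A sketch of that call - the wrapper name is illustrative; the real call site lives in block/blk-core.c and may differ in detail:

#include <linux/blkdev.h>
#include <linux/writeback.h>

static void example_request_completed(struct request *req)
{
	/* defer writeback for this device only, instead of globally */
	if (laptop_mode && blk_fs_request(req))
		laptop_io_completion(&req->q->backing_dev_info);
}
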
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6cd0a8f90dc7..eb086e0f4dcc 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -139,7 +139,8 @@ static int discard_swap(struct swap_info_struct *si)
 		nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
 		if (nr_blocks) {
 			err = blkdev_issue_discard(si->bdev, start_block,
-				nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER);
+				nr_blocks, GFP_KERNEL,
+				BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 			if (err)
 				return err;
 			cond_resched();
@@ -150,7 +151,8 @@ static int discard_swap(struct swap_info_struct *si)
 		nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
 
 		err = blkdev_issue_discard(si->bdev, start_block,
-				nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER);
+				nr_blocks, GFP_KERNEL,
+				BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 		if (err)
 			break;
 
@@ -189,7 +191,8 @@ static void discard_swap_cluster(struct swap_info_struct *si,
 		start_block <<= PAGE_SHIFT - 9;
 		nr_blocks <<= PAGE_SHIFT - 9;
 		if (blkdev_issue_discard(si->bdev, start_block,
-				nr_blocks, GFP_NOIO, DISCARD_FL_BARRIER))
+				nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT |
+				BLKDEV_IFL_BARRIER))
 			break;
 	}
 
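
These call sites track the reworked blkdev_issue_discard() (now in block/blk-lib.c), whose last argument is a BLKDEV_IFL_* bitmask: BLKDEV_IFL_WAIT requests synchronous completion and BLKDEV_IFL_BARRIER preserves the old barrier behaviour. A hedged caller-side sketch - the helper name and range are illustrative only:

#include <linux/blkdev.h>
#include <linux/fs.h>
#include <linux/gfp.h>

/*
 * Discard a sector range, wait for completion, and issue a barrier,
 * mirroring what the swap code above now does.
 */
static int example_discard_range(struct block_device *bdev,
				 sector_t start, sector_t nr_sects)
{
	return blkdev_issue_discard(bdev, start, nr_sects, GFP_KERNEL,
				    BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
}
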