34 files changed, 1699 insertions, 333 deletions
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt index 630879cd9a42..48e0b21b0059 100644 --- a/Documentation/cgroups/blkio-controller.txt +++ b/Documentation/cgroups/blkio-controller.txt | |||
| @@ -17,6 +17,9 @@ HOWTO | |||
| 17 | You can do a very simple testing of running two dd threads in two different | 17 | You can do a very simple testing of running two dd threads in two different |
| 18 | cgroups. Here is what you can do. | 18 | cgroups. Here is what you can do. |
| 19 | 19 | ||
| 20 | - Enable Block IO controller | ||
| 21 | CONFIG_BLK_CGROUP=y | ||
| 22 | |||
| 20 | - Enable group scheduling in CFQ | 23 | - Enable group scheduling in CFQ |
| 21 | CONFIG_CFQ_GROUP_IOSCHED=y | 24 | CONFIG_CFQ_GROUP_IOSCHED=y |
| 22 | 25 | ||
| @@ -54,32 +57,52 @@ cgroups. Here is what you can do. | |||
| 54 | 57 | ||
| 55 | Various user visible config options | 58 | Various user visible config options |
| 56 | =================================== | 59 | =================================== |
| 57 | CONFIG_CFQ_GROUP_IOSCHED | ||
| 58 | - Enables group scheduling in CFQ. Currently only 1 level of group | ||
| 59 | creation is allowed. | ||
| 60 | |||
| 61 | CONFIG_DEBUG_CFQ_IOSCHED | ||
| 62 | - Enables some debugging messages in blktrace. Also creates extra | ||
| 63 | cgroup file blkio.dequeue. | ||
| 64 | |||
| 65 | Config options selected automatically | ||
| 66 | ===================================== | ||
| 67 | These config options are not user visible and are selected/deselected | ||
| 68 | automatically based on IO scheduler configuration. | ||
| 69 | |||
| 70 | CONFIG_BLK_CGROUP | 60 | CONFIG_BLK_CGROUP |
| 71 | - Block IO controller. Selected by CONFIG_CFQ_GROUP_IOSCHED. | 61 | - Block IO controller. |
| 72 | 62 | ||
| 73 | CONFIG_DEBUG_BLK_CGROUP | 63 | CONFIG_DEBUG_BLK_CGROUP |
| 74 | - Debug help. Selected by CONFIG_DEBUG_CFQ_IOSCHED. | 64 | - Debug help. Right now some additional stats files show up in cgroup |
| 65 | if this option is enabled. | ||
| 66 | |||
| 67 | CONFIG_CFQ_GROUP_IOSCHED | ||
| 68 | - Enables group scheduling in CFQ. Currently only 1 level of group | ||
| 69 | creation is allowed. | ||
| 75 | 70 | ||
| 76 | Details of cgroup files | 71 | Details of cgroup files |
| 77 | ======================= | 72 | ======================= |
| 78 | - blkio.weight | 73 | - blkio.weight |
| 79 | - Specifies per cgroup weight. | 74 | - Specifies per cgroup weight. This is the default weight of the group |
| 80 | 75 | on all the devices until and unless overridden by a per-device rule. | |
| 76 | (See blkio.weight_device). | ||
| 81 | Currently allowed range of weights is from 100 to 1000. | 77 | Currently allowed range of weights is from 100 to 1000. |
| 82 | 78 | ||
| 79 | - blkio.weight_device | ||
| 80 | - One can specify per cgroup per device rules using this interface. | ||
| 81 | These rules override the default value of group weight as specified | ||
| 82 | by blkio.weight. | ||
| 83 | |||
| 84 | Following is the format. | ||
| 85 | |||
| 86 | # echo dev_maj:dev_minor weight > /path/to/cgroup/blkio.weight_device | ||
| 87 | Configure weight=300 on /dev/sdb (8:16) in this cgroup | ||
| 88 | # echo 8:16 300 > blkio.weight_device | ||
| 89 | # cat blkio.weight_device | ||
| 90 | dev weight | ||
| 91 | 8:16 300 | ||
| 92 | |||
| 93 | Configure weight=500 on /dev/sda (8:0) in this cgroup | ||
| 94 | # echo 8:0 500 > blkio.weight_device | ||
| 95 | # cat blkio.weight_device | ||
| 96 | dev weight | ||
| 97 | 8:0 500 | ||
| 98 | 8:16 300 | ||
| 99 | |||
| 100 | Remove specific weight for /dev/sda in this cgroup | ||
| 101 | # echo 8:0 0 > blkio.weight_device | ||
| 102 | # cat blkio.weight_device | ||
| 103 | dev weight | ||
| 104 | 8:16 300 | ||
| 105 | |||
| 83 | - blkio.time | 106 | - blkio.time |
| 84 | - disk time allocated to cgroup per device in milliseconds. First | 107 | - disk time allocated to cgroup per device in milliseconds. First |
| 85 | two fields specify the major and minor number of the device and | 108 | two fields specify the major and minor number of the device and |
| @@ -92,13 +115,105 @@ Details of cgroup files | |||
| 92 | third field specifies the number of sectors transferred by the | 115 | third field specifies the number of sectors transferred by the |
| 93 | group to/from the device. | 116 | group to/from the device. |
| 94 | 117 | ||
| 118 | - blkio.io_service_bytes | ||
| 119 | - Number of bytes transferred to/from the disk by the group. These | ||
| 120 | are further divided by the type of operation - read or write, sync | ||
| 121 | or async. First two fields specify the major and minor number of the | ||
| 122 | device, third field specifies the operation type and the fourth field | ||
| 123 | specifies the number of bytes. | ||
| 124 | |||
| 125 | - blkio.io_serviced | ||
| 126 | - Number of IOs completed to/from the disk by the group. These | ||
| 127 | are further divided by the type of operation - read or write, sync | ||
| 128 | or async. First two fields specify the major and minor number of the | ||
| 129 | device, third field specifies the operation type and the fourth field | ||
| 130 | specifies the number of IOs. | ||
| 131 | |||
| 132 | - blkio.io_service_time | ||
| 133 | - Total amount of time between request dispatch and request completion | ||
| 134 | for the IOs done by this cgroup. This is in nanoseconds to make it | ||
| 135 | meaningful for flash devices too. For devices with queue depth of 1, | ||
| 136 | this time represents the actual service time. When queue_depth > 1, | ||
| 137 | that is no longer true as requests may be served out of order. This | ||
| 138 | may cause the service time for a given IO to include the service time | ||
| 139 | of multiple IOs when served out of order which may result in total | ||
| 140 | io_service_time > actual time elapsed. This time is further divided by | ||
| 141 | the type of operation - read or write, sync or async. First two fields | ||
| 142 | specify the major and minor number of the device, third field | ||
| 143 | specifies the operation type and the fourth field specifies the | ||
| 144 | io_service_time in ns. | ||
| 145 | |||
| 146 | - blkio.io_wait_time | ||
| 147 | - Total amount of time the IOs for this cgroup spent waiting in the | ||
| 148 | scheduler queues for service. This can be greater than the total time | ||
| 149 | elapsed since it is cumulative io_wait_time for all IOs. It is not a | ||
| 150 | measure of total time the cgroup spent waiting but rather a measure of | ||
| 151 | the wait_time for its individual IOs. For devices with queue_depth > 1 | ||
| 152 | this metric does not include the time spent waiting for service once | ||
| 153 | the IO is dispatched to the device but before it actually gets serviced | ||
| 154 | (there might be a time lag here due to re-ordering of requests by the | ||
| 155 | device). This is in nanoseconds to make it meaningful for flash | ||
| 156 | devices too. This time is further divided by the type of operation - | ||
| 157 | read or write, sync or async. First two fields specify the major and | ||
| 158 | minor number of the device, third field specifies the operation type | ||
| 159 | and the fourth field specifies the io_wait_time in ns. | ||
| 160 | |||
| 161 | - blkio.io_merged | ||
| 162 | - Total number of bios/requests merged into requests belonging to this | ||
| 163 | cgroup. This is further divided by the type of operation - read or | ||
| 164 | write, sync or async. | ||
| 165 | |||
| 166 | - blkio.io_queued | ||
| 167 | - Total number of requests queued up at any given instant for this | ||
| 168 | cgroup. This is further divided by the type of operation - read or | ||
| 169 | write, sync or async. | ||
| 170 | |||
| 171 | - blkio.avg_queue_size | ||
| 172 | - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. | ||
| 173 | The average queue size for this cgroup over the entire time of this | ||
| 174 | cgroup's existence. Queue size samples are taken each time one of the | ||
| 175 | queues of this cgroup gets a timeslice. | ||
| 176 | |||
| 177 | - blkio.group_wait_time | ||
| 178 | - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. | ||
| 179 | This is the amount of time the cgroup had to wait since it became busy | ||
| 180 | (i.e., went from 0 to 1 request queued) to get a timeslice for one of | ||
| 181 | its queues. This is different from the io_wait_time which is the | ||
| 182 | cumulative total of the amount of time spent by each IO in that cgroup | ||
| 183 | waiting in the scheduler queue. This is in nanoseconds. If this is | ||
| 184 | read when the cgroup is in a waiting (for timeslice) state, the stat | ||
| 185 | will only report the group_wait_time accumulated till the last time it | ||
| 186 | got a timeslice and will not include the current delta. | ||
| 187 | |||
| 188 | - blkio.empty_time | ||
| 189 | - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. | ||
| 190 | This is the amount of time a cgroup spends without any pending | ||
| 191 | requests when not being served, i.e., it does not include any time | ||
| 192 | spent idling for one of the queues of the cgroup. This is in | ||
| 193 | nanoseconds. If this is read when the cgroup is in an empty state, | ||
| 194 | the stat will only report the empty_time accumulated till the last | ||
| 195 | time it had a pending request and will not include the current delta. | ||
| 196 | |||
| 197 | - blkio.idle_time | ||
| 198 | - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. | ||
| 199 | This is the amount of time spent by the IO scheduler idling for a | ||
| 200 | given cgroup in anticipation of a better request than the existing ones | ||
| 201 | from other queues/cgroups. This is in nanoseconds. If this is read | ||
| 202 | when the cgroup is in an idling state, the stat will only report the | ||
| 203 | idle_time accumulated till the last idle period and will not include | ||
| 204 | the current delta. | ||
| 205 | |||
| 95 | - blkio.dequeue | 206 | - blkio.dequeue |
| 96 | - Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. This | 207 | - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. This |
| 97 | gives the statistics about how many times a group was dequeued | 208 | gives the statistics about how many times a group was dequeued |
| 98 | from service tree of the device. First two fields specify the major | 209 | from service tree of the device. First two fields specify the major |
| 99 | and minor number of the device and third field specifies the number | 210 | and minor number of the device and third field specifies the number |
| 100 | of times a group was dequeued from a particular device. | 211 | of times a group was dequeued from a particular device. |
| 101 | 212 | ||
| 213 | - blkio.reset_stats | ||
| 214 | - Writing an int to this file will result in resetting all the stats | ||
| 215 | for that cgroup. | ||
| 216 | |||
| 102 | CFQ sysfs tunable | 217 | CFQ sysfs tunable |
| 103 | ================= | 218 | ================= |
| 104 | /sys/block/<disk>/queue/iosched/group_isolation | 219 | /sys/block/<disk>/queue/iosched/group_isolation |
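
The documentation hunk above introduces the blkio.weight_device interface and the per-device stat files. Below is a small userspace sketch (not part of the patch) that applies a per-device rule programmatically, equivalent to the "# echo 8:16 300 > blkio.weight_device" example in the text. The mount point /cgroup, the group name test1, and the 8:16 (/dev/sdb) device number are assumptions taken from the HOWTO-style setup, not values mandated by the patch.

        /* Hypothetical userspace helper: set a per-device blkio weight.
         * Paths and device numbers are assumptions, not part of the patch. */
        #include <fcntl.h>
        #include <stdio.h>
        #include <string.h>
        #include <unistd.h>

        int main(void)
        {
                const char *path = "/cgroup/test1/blkio.weight_device";
                const char *rule = "8:16 300\n";        /* dev_maj:dev_minor weight */
                int fd = open(path, O_WRONLY);

                if (fd < 0) {
                        perror("open");
                        return 1;
                }
                if (write(fd, rule, strlen(rule)) < 0)
                        perror("write");        /* e.g. EINVAL for a malformed rule */
                close(fd);
                return 0;
        }

Writing "8:16 0" through the same file removes the rule, after which the group falls back to the default blkio.weight, as described above.
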
diff --git a/block/Kconfig b/block/Kconfig index f9e89f4d94bb..9be0b56eaee1 100644 --- a/block/Kconfig +++ b/block/Kconfig | |||
| @@ -77,29 +77,6 @@ config BLK_DEV_INTEGRITY | |||
| 77 | T10/SCSI Data Integrity Field or the T13/ATA External Path | 77 | T10/SCSI Data Integrity Field or the T13/ATA External Path |
| 78 | Protection. If in doubt, say N. | 78 | Protection. If in doubt, say N. |
| 79 | 79 | ||
| 80 | config BLK_CGROUP | ||
| 81 | tristate "Block cgroup support" | ||
| 82 | depends on CGROUPS | ||
| 83 | depends on CFQ_GROUP_IOSCHED | ||
| 84 | default n | ||
| 85 | ---help--- | ||
| 86 | Generic block IO controller cgroup interface. This is the common | ||
| 87 | cgroup interface which should be used by various IO controlling | ||
| 88 | policies. | ||
| 89 | |||
| 90 | Currently, CFQ IO scheduler uses it to recognize task groups and | ||
| 91 | control disk bandwidth allocation (proportional time slice allocation) | ||
| 92 | to such task groups. | ||
| 93 | |||
| 94 | config DEBUG_BLK_CGROUP | ||
| 95 | bool | ||
| 96 | depends on BLK_CGROUP | ||
| 97 | default n | ||
| 98 | ---help--- | ||
| 99 | Enable some debugging help. Currently it stores the cgroup path | ||
| 100 | in the blk group which can be used by cfq for tracing various | ||
| 101 | group related activity. | ||
| 102 | |||
| 103 | endif # BLOCK | 80 | endif # BLOCK |
| 104 | 81 | ||
| 105 | config BLOCK_COMPAT | 82 | config BLOCK_COMPAT |
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index fc71cf071fb2..3199b76f795d 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched | |||
| @@ -23,7 +23,8 @@ config IOSCHED_DEADLINE | |||
| 23 | 23 | ||
| 24 | config IOSCHED_CFQ | 24 | config IOSCHED_CFQ |
| 25 | tristate "CFQ I/O scheduler" | 25 | tristate "CFQ I/O scheduler" |
| 26 | select BLK_CGROUP if CFQ_GROUP_IOSCHED | 26 | # If BLK_CGROUP is a module, CFQ has to be built as a module. |
| 27 | depends on (BLK_CGROUP=m && m) || !BLK_CGROUP || BLK_CGROUP=y | ||
| 27 | default y | 28 | default y |
| 28 | ---help--- | 29 | ---help--- |
| 29 | The CFQ I/O scheduler tries to distribute bandwidth equally | 30 | The CFQ I/O scheduler tries to distribute bandwidth equally |
| @@ -33,22 +34,15 @@ config IOSCHED_CFQ | |||
| 33 | 34 | ||
| 34 | This is the default I/O scheduler. | 35 | This is the default I/O scheduler. |
| 35 | 36 | ||
| 37 | Note: If BLK_CGROUP=m, then CFQ can be built only as a module. | ||
| 38 | |||
| 36 | config CFQ_GROUP_IOSCHED | 39 | config CFQ_GROUP_IOSCHED |
| 37 | bool "CFQ Group Scheduling support" | 40 | bool "CFQ Group Scheduling support" |
| 38 | depends on IOSCHED_CFQ && CGROUPS | 41 | depends on IOSCHED_CFQ && BLK_CGROUP |
| 39 | default n | 42 | default n |
| 40 | ---help--- | 43 | ---help--- |
| 41 | Enable group IO scheduling in CFQ. | 44 | Enable group IO scheduling in CFQ. |
| 42 | 45 | ||
| 43 | config DEBUG_CFQ_IOSCHED | ||
| 44 | bool "Debug CFQ Scheduling" | ||
| 45 | depends on CFQ_GROUP_IOSCHED | ||
| 46 | select DEBUG_BLK_CGROUP | ||
| 47 | default n | ||
| 48 | ---help--- | ||
| 49 | Enable CFQ IO scheduling debugging in CFQ. Currently it makes | ||
| 50 | blktrace output more verbose. | ||
| 51 | |||
| 52 | choice | 46 | choice |
| 53 | prompt "Default I/O scheduler" | 47 | prompt "Default I/O scheduler" |
| 54 | default DEFAULT_CFQ | 48 | default DEFAULT_CFQ |
diff --git a/block/Makefile b/block/Makefile index cb2d515ebd6e..0bb499a739cd 100644 --- a/block/Makefile +++ b/block/Makefile | |||
| @@ -5,7 +5,7 @@ | |||
| 5 | obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ | 5 | obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ |
| 6 | blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \ | 6 | blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \ |
| 7 | blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ | 7 | blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ |
| 8 | blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o | 8 | blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o |
| 9 | 9 | ||
| 10 | obj-$(CONFIG_BLK_DEV_BSG) += bsg.o | 10 | obj-$(CONFIG_BLK_DEV_BSG) += bsg.o |
| 11 | obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o | 11 | obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o |
diff --git a/block/blk-barrier.c b/block/blk-barrier.c index 6d88544b677f..0d710c9d403b 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c | |||
| @@ -286,26 +286,31 @@ static void bio_end_empty_barrier(struct bio *bio, int err) | |||
| 286 | set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); | 286 | set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); |
| 287 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | 287 | clear_bit(BIO_UPTODATE, &bio->bi_flags); |
| 288 | } | 288 | } |
| 289 | 289 | if (bio->bi_private) | |
| 290 | complete(bio->bi_private); | 290 | complete(bio->bi_private); |
| 291 | bio_put(bio); | ||
| 291 | } | 292 | } |
| 292 | 293 | ||
| 293 | /** | 294 | /** |
| 294 | * blkdev_issue_flush - queue a flush | 295 | * blkdev_issue_flush - queue a flush |
| 295 | * @bdev: blockdev to issue flush for | 296 | * @bdev: blockdev to issue flush for |
| 297 | * @gfp_mask: memory allocation flags (for bio_alloc) | ||
| 296 | * @error_sector: error sector | 298 | * @error_sector: error sector |
| 299 | * @flags: BLKDEV_IFL_* flags to control behaviour | ||
| 297 | * | 300 | * |
| 298 | * Description: | 301 | * Description: |
| 299 | * Issue a flush for the block device in question. Caller can supply | 302 | * Issue a flush for the block device in question. Caller can supply |
| 300 | * room for storing the error offset in case of a flush error, if they | 303 | * room for storing the error offset in case of a flush error, if they |
| 301 | * wish to. | 304 | * wish to. If WAIT flag is not passed then caller may check only what |
| 305 | * request was pushed in some internal queue for later handling. | ||
| 302 | */ | 306 | */ |
| 303 | int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) | 307 | int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, |
| 308 | sector_t *error_sector, unsigned long flags) | ||
| 304 | { | 309 | { |
| 305 | DECLARE_COMPLETION_ONSTACK(wait); | 310 | DECLARE_COMPLETION_ONSTACK(wait); |
| 306 | struct request_queue *q; | 311 | struct request_queue *q; |
| 307 | struct bio *bio; | 312 | struct bio *bio; |
| 308 | int ret; | 313 | int ret = 0; |
| 309 | 314 | ||
| 310 | if (bdev->bd_disk == NULL) | 315 | if (bdev->bd_disk == NULL) |
| 311 | return -ENXIO; | 316 | return -ENXIO; |
| @@ -314,23 +319,25 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) | |||
| 314 | if (!q) | 319 | if (!q) |
| 315 | return -ENXIO; | 320 | return -ENXIO; |
| 316 | 321 | ||
| 317 | bio = bio_alloc(GFP_KERNEL, 0); | 322 | bio = bio_alloc(gfp_mask, 0); |
| 318 | bio->bi_end_io = bio_end_empty_barrier; | 323 | bio->bi_end_io = bio_end_empty_barrier; |
| 319 | bio->bi_private = &wait; | ||
| 320 | bio->bi_bdev = bdev; | 324 | bio->bi_bdev = bdev; |
| 321 | submit_bio(WRITE_BARRIER, bio); | 325 | if (test_bit(BLKDEV_WAIT, &flags)) |
| 322 | 326 | bio->bi_private = &wait; | |
| 323 | wait_for_completion(&wait); | ||
| 324 | 327 | ||
| 325 | /* | 328 | bio_get(bio); |
| 326 | * The driver must store the error location in ->bi_sector, if | 329 | submit_bio(WRITE_BARRIER, bio); |
| 327 | * it supports it. For non-stacked drivers, this should be copied | 330 | if (test_bit(BLKDEV_WAIT, &flags)) { |
| 328 | * from blk_rq_pos(rq). | 331 | wait_for_completion(&wait); |
| 329 | */ | 332 | /* |
| 330 | if (error_sector) | 333 | * The driver must store the error location in ->bi_sector, if |
| 331 | *error_sector = bio->bi_sector; | 334 | * it supports it. For non-stacked drivers, this should be |
| 335 | * copied from blk_rq_pos(rq). | ||
| 336 | */ | ||
| 337 | if (error_sector) | ||
| 338 | *error_sector = bio->bi_sector; | ||
| 339 | } | ||
| 332 | 340 | ||
| 333 | ret = 0; | ||
| 334 | if (bio_flagged(bio, BIO_EOPNOTSUPP)) | 341 | if (bio_flagged(bio, BIO_EOPNOTSUPP)) |
| 335 | ret = -EOPNOTSUPP; | 342 | ret = -EOPNOTSUPP; |
| 336 | else if (!bio_flagged(bio, BIO_UPTODATE)) | 343 | else if (!bio_flagged(bio, BIO_UPTODATE)) |
| @@ -340,107 +347,3 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) | |||
| 340 | return ret; | 347 | return ret; |
| 341 | } | 348 | } |
| 342 | EXPORT_SYMBOL(blkdev_issue_flush); | 349 | EXPORT_SYMBOL(blkdev_issue_flush); |
| 343 | |||
| 344 | static void blkdev_discard_end_io(struct bio *bio, int err) | ||
| 345 | { | ||
| 346 | if (err) { | ||
| 347 | if (err == -EOPNOTSUPP) | ||
| 348 | set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); | ||
| 349 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | ||
| 350 | } | ||
| 351 | |||
| 352 | if (bio->bi_private) | ||
| 353 | complete(bio->bi_private); | ||
| 354 | __free_page(bio_page(bio)); | ||
| 355 | |||
| 356 | bio_put(bio); | ||
| 357 | } | ||
| 358 | |||
| 359 | /** | ||
| 360 | * blkdev_issue_discard - queue a discard | ||
| 361 | * @bdev: blockdev to issue discard for | ||
| 362 | * @sector: start sector | ||
| 363 | * @nr_sects: number of sectors to discard | ||
| 364 | * @gfp_mask: memory allocation flags (for bio_alloc) | ||
| 365 | * @flags: DISCARD_FL_* flags to control behaviour | ||
| 366 | * | ||
| 367 | * Description: | ||
| 368 | * Issue a discard request for the sectors in question. | ||
| 369 | */ | ||
| 370 | int blkdev_issue_discard(struct block_device *bdev, sector_t sector, | ||
| 371 | sector_t nr_sects, gfp_t gfp_mask, int flags) | ||
| 372 | { | ||
| 373 | DECLARE_COMPLETION_ONSTACK(wait); | ||
| 374 | struct request_queue *q = bdev_get_queue(bdev); | ||
| 375 | int type = flags & DISCARD_FL_BARRIER ? | ||
| 376 | DISCARD_BARRIER : DISCARD_NOBARRIER; | ||
| 377 | struct bio *bio; | ||
| 378 | struct page *page; | ||
| 379 | int ret = 0; | ||
| 380 | |||
| 381 | if (!q) | ||
| 382 | return -ENXIO; | ||
| 383 | |||
| 384 | if (!blk_queue_discard(q)) | ||
| 385 | return -EOPNOTSUPP; | ||
| 386 | |||
| 387 | while (nr_sects && !ret) { | ||
| 388 | unsigned int sector_size = q->limits.logical_block_size; | ||
| 389 | unsigned int max_discard_sectors = | ||
| 390 | min(q->limits.max_discard_sectors, UINT_MAX >> 9); | ||
| 391 | |||
| 392 | bio = bio_alloc(gfp_mask, 1); | ||
| 393 | if (!bio) | ||
| 394 | goto out; | ||
| 395 | bio->bi_sector = sector; | ||
| 396 | bio->bi_end_io = blkdev_discard_end_io; | ||
| 397 | bio->bi_bdev = bdev; | ||
| 398 | if (flags & DISCARD_FL_WAIT) | ||
| 399 | bio->bi_private = &wait; | ||
| 400 | |||
| 401 | /* | ||
| 402 | * Add a zeroed one-sector payload as that's what | ||
| 403 | * our current implementations need. If we'll ever need | ||
| 404 | * more the interface will need revisiting. | ||
| 405 | */ | ||
| 406 | page = alloc_page(gfp_mask | __GFP_ZERO); | ||
| 407 | if (!page) | ||
| 408 | goto out_free_bio; | ||
| 409 | if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size) | ||
| 410 | goto out_free_page; | ||
| 411 | |||
| 412 | /* | ||
| 413 | * And override the bio size - the way discard works we | ||
| 414 | * touch many more blocks on disk than the actual payload | ||
| 415 | * length. | ||
| 416 | */ | ||
| 417 | if (nr_sects > max_discard_sectors) { | ||
| 418 | bio->bi_size = max_discard_sectors << 9; | ||
| 419 | nr_sects -= max_discard_sectors; | ||
| 420 | sector += max_discard_sectors; | ||
| 421 | } else { | ||
| 422 | bio->bi_size = nr_sects << 9; | ||
| 423 | nr_sects = 0; | ||
| 424 | } | ||
| 425 | |||
| 426 | bio_get(bio); | ||
| 427 | submit_bio(type, bio); | ||
| 428 | |||
| 429 | if (flags & DISCARD_FL_WAIT) | ||
| 430 | wait_for_completion(&wait); | ||
| 431 | |||
| 432 | if (bio_flagged(bio, BIO_EOPNOTSUPP)) | ||
| 433 | ret = -EOPNOTSUPP; | ||
| 434 | else if (!bio_flagged(bio, BIO_UPTODATE)) | ||
| 435 | ret = -EIO; | ||
| 436 | bio_put(bio); | ||
| 437 | } | ||
| 438 | return ret; | ||
| 439 | out_free_page: | ||
| 440 | __free_page(page); | ||
| 441 | out_free_bio: | ||
| 442 | bio_put(bio); | ||
| 443 | out: | ||
| 444 | return -ENOMEM; | ||
| 445 | } | ||
| 446 | EXPORT_SYMBOL(blkdev_issue_discard); | ||
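
The blk-barrier.c changes above give blkdev_issue_flush() a gfp_mask and a flags argument; when the BLKDEV_WAIT bit is set in flags the caller sleeps until the empty barrier completes, otherwise the bio is only submitted and the completion handler drops the extra reference. A minimal in-kernel caller might now look like the sketch below. This is illustrative only: it assumes BLKDEV_WAIT is visible to callers via linux/blkdev.h (only its use inside the hunk is shown here), and the EOPNOTSUPP handling is one common convention rather than anything the patch requires.

        /* Hypothetical caller of the new interface (not part of the patch). */
        #include <linux/blkdev.h>
        #include <linux/gfp.h>

        static int example_sync_flush(struct block_device *bdev)
        {
                unsigned long flags = 0;
                sector_t error_sector;
                int ret;

                __set_bit(BLKDEV_WAIT, &flags);         /* block until the flush finishes */
                ret = blkdev_issue_flush(bdev, GFP_KERNEL, &error_sector, flags);
                if (ret == -EOPNOTSUPP)
                        ret = 0;        /* device has no cache to flush; often treated as non-fatal */
                return ret;
        }

Without BLKDEV_WAIT the call only queues the barrier, so per the updated kernel-doc comment the caller can at most check that the request was pushed for later handling.
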
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 5fe03def34b2..d02bbf88de13 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c | |||
| @@ -15,8 +15,12 @@ | |||
| 15 | #include <linux/kdev_t.h> | 15 | #include <linux/kdev_t.h> |
| 16 | #include <linux/module.h> | 16 | #include <linux/module.h> |
| 17 | #include <linux/err.h> | 17 | #include <linux/err.h> |
| 18 | #include <linux/blkdev.h> | ||
| 18 | #include <linux/slab.h> | 19 | #include <linux/slab.h> |
| 19 | #include "blk-cgroup.h" | 20 | #include "blk-cgroup.h" |
| 21 | #include <linux/genhd.h> | ||
| 22 | |||
| 23 | #define MAX_KEY_LEN 100 | ||
| 20 | 24 | ||
| 21 | static DEFINE_SPINLOCK(blkio_list_lock); | 25 | static DEFINE_SPINLOCK(blkio_list_lock); |
| 22 | static LIST_HEAD(blkio_list); | 26 | static LIST_HEAD(blkio_list); |
| @@ -49,6 +53,32 @@ struct cgroup_subsys blkio_subsys = { | |||
| 49 | }; | 53 | }; |
| 50 | EXPORT_SYMBOL_GPL(blkio_subsys); | 54 | EXPORT_SYMBOL_GPL(blkio_subsys); |
| 51 | 55 | ||
| 56 | static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg, | ||
| 57 | struct blkio_policy_node *pn) | ||
| 58 | { | ||
| 59 | list_add(&pn->node, &blkcg->policy_list); | ||
| 60 | } | ||
| 61 | |||
| 62 | /* Must be called with blkcg->lock held */ | ||
| 63 | static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) | ||
| 64 | { | ||
| 65 | list_del(&pn->node); | ||
| 66 | } | ||
| 67 | |||
| 68 | /* Must be called with blkcg->lock held */ | ||
| 69 | static struct blkio_policy_node * | ||
| 70 | blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev) | ||
| 71 | { | ||
| 72 | struct blkio_policy_node *pn; | ||
| 73 | |||
| 74 | list_for_each_entry(pn, &blkcg->policy_list, node) { | ||
| 75 | if (pn->dev == dev) | ||
| 76 | return pn; | ||
| 77 | } | ||
| 78 | |||
| 79 | return NULL; | ||
| 80 | } | ||
| 81 | |||
| 52 | struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) | 82 | struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) |
| 53 | { | 83 | { |
| 54 | return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), | 84 | return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), |
| @@ -56,13 +86,259 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) | |||
| 56 | } | 86 | } |
| 57 | EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); | 87 | EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); |
| 58 | 88 | ||
| 59 | void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, | 89 | /* |
| 60 | unsigned long time, unsigned long sectors) | 90 | * Add to the appropriate stat variable depending on the request type. |
| 91 | * This should be called with the blkg->stats_lock held. | ||
| 92 | */ | ||
| 93 | static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction, | ||
| 94 | bool sync) | ||
| 95 | { | ||
| 96 | if (direction) | ||
| 97 | stat[BLKIO_STAT_WRITE] += add; | ||
| 98 | else | ||
| 99 | stat[BLKIO_STAT_READ] += add; | ||
| 100 | if (sync) | ||
| 101 | stat[BLKIO_STAT_SYNC] += add; | ||
| 102 | else | ||
| 103 | stat[BLKIO_STAT_ASYNC] += add; | ||
| 104 | } | ||
| 105 | |||
| 106 | /* | ||
| 107 | * Decrements the appropriate stat variable if non-zero depending on the | ||
| 108 | * request type. Panics on value being zero. | ||
| 109 | * This should be called with the blkg->stats_lock held. | ||
| 110 | */ | ||
| 111 | static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync) | ||
| 112 | { | ||
| 113 | if (direction) { | ||
| 114 | BUG_ON(stat[BLKIO_STAT_WRITE] == 0); | ||
| 115 | stat[BLKIO_STAT_WRITE]--; | ||
| 116 | } else { | ||
| 117 | BUG_ON(stat[BLKIO_STAT_READ] == 0); | ||
| 118 | stat[BLKIO_STAT_READ]--; | ||
| 119 | } | ||
| 120 | if (sync) { | ||
| 121 | BUG_ON(stat[BLKIO_STAT_SYNC] == 0); | ||
| 122 | stat[BLKIO_STAT_SYNC]--; | ||
| 123 | } else { | ||
| 124 | BUG_ON(stat[BLKIO_STAT_ASYNC] == 0); | ||
| 125 | stat[BLKIO_STAT_ASYNC]--; | ||
| 126 | } | ||
| 127 | } | ||
| 128 | |||
| 129 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
| 130 | /* This should be called with the blkg->stats_lock held. */ | ||
| 131 | static void blkio_set_start_group_wait_time(struct blkio_group *blkg, | ||
| 132 | struct blkio_group *curr_blkg) | ||
| 133 | { | ||
| 134 | if (blkio_blkg_waiting(&blkg->stats)) | ||
| 135 | return; | ||
| 136 | if (blkg == curr_blkg) | ||
| 137 | return; | ||
| 138 | blkg->stats.start_group_wait_time = sched_clock(); | ||
| 139 | blkio_mark_blkg_waiting(&blkg->stats); | ||
| 140 | } | ||
| 141 | |||
| 142 | /* This should be called with the blkg->stats_lock held. */ | ||
| 143 | static void blkio_update_group_wait_time(struct blkio_group_stats *stats) | ||
| 144 | { | ||
| 145 | unsigned long long now; | ||
| 146 | |||
| 147 | if (!blkio_blkg_waiting(stats)) | ||
| 148 | return; | ||
| 149 | |||
| 150 | now = sched_clock(); | ||
| 151 | if (time_after64(now, stats->start_group_wait_time)) | ||
| 152 | stats->group_wait_time += now - stats->start_group_wait_time; | ||
| 153 | blkio_clear_blkg_waiting(stats); | ||
| 154 | } | ||
| 155 | |||
| 156 | /* This should be called with the blkg->stats_lock held. */ | ||
| 157 | static void blkio_end_empty_time(struct blkio_group_stats *stats) | ||
| 158 | { | ||
| 159 | unsigned long long now; | ||
| 160 | |||
| 161 | if (!blkio_blkg_empty(stats)) | ||
| 162 | return; | ||
| 163 | |||
| 164 | now = sched_clock(); | ||
| 165 | if (time_after64(now, stats->start_empty_time)) | ||
| 166 | stats->empty_time += now - stats->start_empty_time; | ||
| 167 | blkio_clear_blkg_empty(stats); | ||
| 168 | } | ||
| 169 | |||
| 170 | void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) | ||
| 171 | { | ||
| 172 | unsigned long flags; | ||
| 173 | |||
| 174 | spin_lock_irqsave(&blkg->stats_lock, flags); | ||
| 175 | BUG_ON(blkio_blkg_idling(&blkg->stats)); | ||
| 176 | blkg->stats.start_idle_time = sched_clock(); | ||
| 177 | blkio_mark_blkg_idling(&blkg->stats); | ||
| 178 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
| 179 | } | ||
| 180 | EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats); | ||
| 181 | |||
| 182 | void blkiocg_update_idle_time_stats(struct blkio_group *blkg) | ||
| 183 | { | ||
| 184 | unsigned long flags; | ||
| 185 | unsigned long long now; | ||
| 186 | struct blkio_group_stats *stats; | ||
| 187 | |||
| 188 | spin_lock_irqsave(&blkg->stats_lock, flags); | ||
| 189 | stats = &blkg->stats; | ||
| 190 | if (blkio_blkg_idling(stats)) { | ||
| 191 | now = sched_clock(); | ||
| 192 | if (time_after64(now, stats->start_idle_time)) | ||
| 193 | stats->idle_time += now - stats->start_idle_time; | ||
| 194 | blkio_clear_blkg_idling(stats); | ||
| 195 | } | ||
| 196 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
| 197 | } | ||
| 198 | EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats); | ||
| 199 | |||
| 200 | void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) | ||
| 201 | { | ||
| 202 | unsigned long flags; | ||
| 203 | struct blkio_group_stats *stats; | ||
| 204 | |||
| 205 | spin_lock_irqsave(&blkg->stats_lock, flags); | ||
| 206 | stats = &blkg->stats; | ||
| 207 | stats->avg_queue_size_sum += | ||
| 208 | stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] + | ||
| 209 | stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]; | ||
| 210 | stats->avg_queue_size_samples++; | ||
| 211 | blkio_update_group_wait_time(stats); | ||
| 212 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
| 213 | } | ||
| 214 | EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats); | ||
| 215 | |||
| 216 | void blkiocg_set_start_empty_time(struct blkio_group *blkg) | ||
| 217 | { | ||
| 218 | unsigned long flags; | ||
| 219 | struct blkio_group_stats *stats; | ||
| 220 | |||
| 221 | spin_lock_irqsave(&blkg->stats_lock, flags); | ||
| 222 | stats = &blkg->stats; | ||
| 223 | |||
| 224 | if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] || | ||
| 225 | stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) { | ||
| 226 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
| 227 | return; | ||
| 228 | } | ||
| 229 | |||
| 230 | /* | ||
| 231 | * group is already marked empty. This can happen if cfqq got new | ||
| 232 | * request in parent group and moved to this group while being added | ||
| 233 | * to service tree. Just ignore the event and move on. | ||
| 234 | */ | ||
| 235 | if(blkio_blkg_empty(stats)) { | ||
| 236 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
| 237 | return; | ||
| 238 | } | ||
| 239 | |||
| 240 | stats->start_empty_time = sched_clock(); | ||
| 241 | blkio_mark_blkg_empty(stats); | ||
| 242 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
| 243 | } | ||
| 244 | EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time); | ||
| 245 | |||
| 246 | void blkiocg_update_dequeue_stats(struct blkio_group *blkg, | ||
| 247 | unsigned long dequeue) | ||
| 248 | { | ||
| 249 | blkg->stats.dequeue += dequeue; | ||
| 250 | } | ||
| 251 | EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats); | ||
| 252 | #else | ||
| 253 | static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg, | ||
| 254 | struct blkio_group *curr_blkg) {} | ||
| 255 | static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {} | ||
| 256 | #endif | ||
| 257 | |||
| 258 | void blkiocg_update_io_add_stats(struct blkio_group *blkg, | ||
| 259 | struct blkio_group *curr_blkg, bool direction, | ||
| 260 | bool sync) | ||
| 261 | { | ||
| 262 | unsigned long flags; | ||
| 263 | |||
| 264 | spin_lock_irqsave(&blkg->stats_lock, flags); | ||
| 265 | blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction, | ||
| 266 | sync); | ||
| 267 | blkio_end_empty_time(&blkg->stats); | ||
| 268 | blkio_set_start_group_wait_time(blkg, curr_blkg); | ||
| 269 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
| 270 | } | ||
| 271 | EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats); | ||
| 272 | |||
| 273 | void blkiocg_update_io_remove_stats(struct blkio_group *blkg, | ||
| 274 | bool direction, bool sync) | ||
| 275 | { | ||
| 276 | unsigned long flags; | ||
| 277 | |||
| 278 | spin_lock_irqsave(&blkg->stats_lock, flags); | ||
| 279 | blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], | ||
| 280 | direction, sync); | ||
| 281 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
| 282 | } | ||
| 283 | EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats); | ||
| 284 | |||
| 285 | void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time) | ||
| 286 | { | ||
| 287 | unsigned long flags; | ||
| 288 | |||
| 289 | spin_lock_irqsave(&blkg->stats_lock, flags); | ||
| 290 | blkg->stats.time += time; | ||
| 291 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
| 292 | } | ||
| 293 | EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); | ||
| 294 | |||
| 295 | void blkiocg_update_dispatch_stats(struct blkio_group *blkg, | ||
| 296 | uint64_t bytes, bool direction, bool sync) | ||
| 297 | { | ||
| 298 | struct blkio_group_stats *stats; | ||
| 299 | unsigned long flags; | ||
| 300 | |||
| 301 | spin_lock_irqsave(&blkg->stats_lock, flags); | ||
| 302 | stats = &blkg->stats; | ||
| 303 | stats->sectors += bytes >> 9; | ||
| 304 | blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction, | ||
| 305 | sync); | ||
| 306 | blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes, | ||
| 307 | direction, sync); | ||
| 308 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
| 309 | } | ||
| 310 | EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats); | ||
| 311 | |||
| 312 | void blkiocg_update_completion_stats(struct blkio_group *blkg, | ||
| 313 | uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) | ||
| 314 | { | ||
| 315 | struct blkio_group_stats *stats; | ||
| 316 | unsigned long flags; | ||
| 317 | unsigned long long now = sched_clock(); | ||
| 318 | |||
| 319 | spin_lock_irqsave(&blkg->stats_lock, flags); | ||
| 320 | stats = &blkg->stats; | ||
| 321 | if (time_after64(now, io_start_time)) | ||
| 322 | blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME], | ||
| 323 | now - io_start_time, direction, sync); | ||
| 324 | if (time_after64(io_start_time, start_time)) | ||
| 325 | blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME], | ||
| 326 | io_start_time - start_time, direction, sync); | ||
| 327 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
| 328 | } | ||
| 329 | EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats); | ||
| 330 | |||
| 331 | void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, | ||
| 332 | bool sync) | ||
| 61 | { | 333 | { |
| 62 | blkg->time += time; | 334 | unsigned long flags; |
| 63 | blkg->sectors += sectors; | 335 | |
| 336 | spin_lock_irqsave(&blkg->stats_lock, flags); | ||
| 337 | blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction, | ||
| 338 | sync); | ||
| 339 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
| 64 | } | 340 | } |
| 65 | EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_stats); | 341 | EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); |
| 66 | 342 | ||
| 67 | void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, | 343 | void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, |
| 68 | struct blkio_group *blkg, void *key, dev_t dev) | 344 | struct blkio_group *blkg, void *key, dev_t dev) |
| @@ -70,14 +346,13 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, | |||
| 70 | unsigned long flags; | 346 | unsigned long flags; |
| 71 | 347 | ||
| 72 | spin_lock_irqsave(&blkcg->lock, flags); | 348 | spin_lock_irqsave(&blkcg->lock, flags); |
| 349 | spin_lock_init(&blkg->stats_lock); | ||
| 73 | rcu_assign_pointer(blkg->key, key); | 350 | rcu_assign_pointer(blkg->key, key); |
| 74 | blkg->blkcg_id = css_id(&blkcg->css); | 351 | blkg->blkcg_id = css_id(&blkcg->css); |
| 75 | hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); | 352 | hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); |
| 76 | spin_unlock_irqrestore(&blkcg->lock, flags); | 353 | spin_unlock_irqrestore(&blkcg->lock, flags); |
| 77 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
| 78 | /* Need to take css reference ? */ | 354 | /* Need to take css reference ? */ |
| 79 | cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); | 355 | cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); |
| 80 | #endif | ||
| 81 | blkg->dev = dev; | 356 | blkg->dev = dev; |
| 82 | } | 357 | } |
| 83 | EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group); | 358 | EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group); |
| @@ -154,6 +429,7 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) | |||
| 154 | struct blkio_group *blkg; | 429 | struct blkio_group *blkg; |
| 155 | struct hlist_node *n; | 430 | struct hlist_node *n; |
| 156 | struct blkio_policy_type *blkiop; | 431 | struct blkio_policy_type *blkiop; |
| 432 | struct blkio_policy_node *pn; | ||
| 157 | 433 | ||
| 158 | if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) | 434 | if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) |
| 159 | return -EINVAL; | 435 | return -EINVAL; |
| @@ -162,7 +438,13 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) | |||
| 162 | spin_lock(&blkio_list_lock); | 438 | spin_lock(&blkio_list_lock); |
| 163 | spin_lock_irq(&blkcg->lock); | 439 | spin_lock_irq(&blkcg->lock); |
| 164 | blkcg->weight = (unsigned int)val; | 440 | blkcg->weight = (unsigned int)val; |
| 441 | |||
| 165 | hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { | 442 | hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { |
| 443 | pn = blkio_policy_search_node(blkcg, blkg->dev); | ||
| 444 | |||
| 445 | if (pn) | ||
| 446 | continue; | ||
| 447 | |||
| 166 | list_for_each_entry(blkiop, &blkio_list, list) | 448 | list_for_each_entry(blkiop, &blkio_list, list) |
| 167 | blkiop->ops.blkio_update_group_weight_fn(blkg, | 449 | blkiop->ops.blkio_update_group_weight_fn(blkg, |
| 168 | blkcg->weight); | 450 | blkcg->weight); |
| @@ -172,13 +454,154 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) | |||
| 172 | return 0; | 454 | return 0; |
| 173 | } | 455 | } |
| 174 | 456 | ||
| 175 | #define SHOW_FUNCTION_PER_GROUP(__VAR) \ | 457 | static int |
| 458 | blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) | ||
| 459 | { | ||
| 460 | struct blkio_cgroup *blkcg; | ||
| 461 | struct blkio_group *blkg; | ||
| 462 | struct blkio_group_stats *stats; | ||
| 463 | struct hlist_node *n; | ||
| 464 | uint64_t queued[BLKIO_STAT_TOTAL]; | ||
| 465 | int i; | ||
| 466 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
| 467 | bool idling, waiting, empty; | ||
| 468 | unsigned long long now = sched_clock(); | ||
| 469 | #endif | ||
| 470 | |||
| 471 | blkcg = cgroup_to_blkio_cgroup(cgroup); | ||
| 472 | spin_lock_irq(&blkcg->lock); | ||
| 473 | hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { | ||
| 474 | spin_lock(&blkg->stats_lock); | ||
| 475 | stats = &blkg->stats; | ||
| 476 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
| 477 | idling = blkio_blkg_idling(stats); | ||
| 478 | waiting = blkio_blkg_waiting(stats); | ||
| 479 | empty = blkio_blkg_empty(stats); | ||
| 480 | #endif | ||
| 481 | for (i = 0; i < BLKIO_STAT_TOTAL; i++) | ||
| 482 | queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i]; | ||
| 483 | memset(stats, 0, sizeof(struct blkio_group_stats)); | ||
| 484 | for (i = 0; i < BLKIO_STAT_TOTAL; i++) | ||
| 485 | stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i]; | ||
| 486 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
| 487 | if (idling) { | ||
| 488 | blkio_mark_blkg_idling(stats); | ||
| 489 | stats->start_idle_time = now; | ||
| 490 | } | ||
| 491 | if (waiting) { | ||
| 492 | blkio_mark_blkg_waiting(stats); | ||
| 493 | stats->start_group_wait_time = now; | ||
| 494 | } | ||
| 495 | if (empty) { | ||
| 496 | blkio_mark_blkg_empty(stats); | ||
| 497 | stats->start_empty_time = now; | ||
| 498 | } | ||
| 499 | #endif | ||
| 500 | spin_unlock(&blkg->stats_lock); | ||
| 501 | } | ||
| 502 | spin_unlock_irq(&blkcg->lock); | ||
| 503 | return 0; | ||
| 504 | } | ||
| 505 | |||
| 506 | static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str, | ||
| 507 | int chars_left, bool diskname_only) | ||
| 508 | { | ||
| 509 | snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev)); | ||
| 510 | chars_left -= strlen(str); | ||
| 511 | if (chars_left <= 0) { | ||
| 512 | printk(KERN_WARNING | ||
| 513 | "Possibly incorrect cgroup stat display format"); | ||
| 514 | return; | ||
| 515 | } | ||
| 516 | if (diskname_only) | ||
| 517 | return; | ||
| 518 | switch (type) { | ||
| 519 | case BLKIO_STAT_READ: | ||
| 520 | strlcat(str, " Read", chars_left); | ||
| 521 | break; | ||
| 522 | case BLKIO_STAT_WRITE: | ||
| 523 | strlcat(str, " Write", chars_left); | ||
| 524 | break; | ||
| 525 | case BLKIO_STAT_SYNC: | ||
| 526 | strlcat(str, " Sync", chars_left); | ||
| 527 | break; | ||
| 528 | case BLKIO_STAT_ASYNC: | ||
| 529 | strlcat(str, " Async", chars_left); | ||
| 530 | break; | ||
| 531 | case BLKIO_STAT_TOTAL: | ||
| 532 | strlcat(str, " Total", chars_left); | ||
| 533 | break; | ||
| 534 | default: | ||
| 535 | strlcat(str, " Invalid", chars_left); | ||
| 536 | } | ||
| 537 | } | ||
| 538 | |||
| 539 | static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val, | ||
| 540 | struct cgroup_map_cb *cb, dev_t dev) | ||
| 541 | { | ||
| 542 | blkio_get_key_name(0, dev, str, chars_left, true); | ||
| 543 | cb->fill(cb, str, val); | ||
| 544 | return val; | ||
| 545 | } | ||
| 546 | |||
| 547 | /* This should be called with blkg->stats_lock held */ | ||
| 548 | static uint64_t blkio_get_stat(struct blkio_group *blkg, | ||
| 549 | struct cgroup_map_cb *cb, dev_t dev, enum stat_type type) | ||
| 550 | { | ||
| 551 | uint64_t disk_total; | ||
| 552 | char key_str[MAX_KEY_LEN]; | ||
| 553 | enum stat_sub_type sub_type; | ||
| 554 | |||
| 555 | if (type == BLKIO_STAT_TIME) | ||
| 556 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, | ||
| 557 | blkg->stats.time, cb, dev); | ||
| 558 | if (type == BLKIO_STAT_SECTORS) | ||
| 559 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, | ||
| 560 | blkg->stats.sectors, cb, dev); | ||
| 561 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
| 562 | if (type == BLKIO_STAT_AVG_QUEUE_SIZE) { | ||
| 563 | uint64_t sum = blkg->stats.avg_queue_size_sum; | ||
| 564 | uint64_t samples = blkg->stats.avg_queue_size_samples; | ||
| 565 | if (samples) | ||
| 566 | do_div(sum, samples); | ||
| 567 | else | ||
| 568 | sum = 0; | ||
| 569 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev); | ||
| 570 | } | ||
| 571 | if (type == BLKIO_STAT_GROUP_WAIT_TIME) | ||
| 572 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, | ||
| 573 | blkg->stats.group_wait_time, cb, dev); | ||
| 574 | if (type == BLKIO_STAT_IDLE_TIME) | ||
| 575 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, | ||
| 576 | blkg->stats.idle_time, cb, dev); | ||
| 577 | if (type == BLKIO_STAT_EMPTY_TIME) | ||
| 578 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, | ||
| 579 | blkg->stats.empty_time, cb, dev); | ||
| 580 | if (type == BLKIO_STAT_DEQUEUE) | ||
| 581 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, | ||
| 582 | blkg->stats.dequeue, cb, dev); | ||
| 583 | #endif | ||
| 584 | |||
| 585 | for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL; | ||
| 586 | sub_type++) { | ||
| 587 | blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false); | ||
| 588 | cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]); | ||
| 589 | } | ||
| 590 | disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] + | ||
| 591 | blkg->stats.stat_arr[type][BLKIO_STAT_WRITE]; | ||
| 592 | blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false); | ||
| 593 | cb->fill(cb, key_str, disk_total); | ||
| 594 | return disk_total; | ||
| 595 | } | ||
| 596 | |||
| 597 | #define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total) \ | ||
| 176 | static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \ | 598 | static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \ |
| 177 | struct cftype *cftype, struct seq_file *m) \ | 599 | struct cftype *cftype, struct cgroup_map_cb *cb) \ |
| 178 | { \ | 600 | { \ |
| 179 | struct blkio_cgroup *blkcg; \ | 601 | struct blkio_cgroup *blkcg; \ |
| 180 | struct blkio_group *blkg; \ | 602 | struct blkio_group *blkg; \ |
| 181 | struct hlist_node *n; \ | 603 | struct hlist_node *n; \ |
| 604 | uint64_t cgroup_total = 0; \ | ||
| 182 | \ | 605 | \ |
| 183 | if (!cgroup_lock_live_group(cgroup)) \ | 606 | if (!cgroup_lock_live_group(cgroup)) \ |
| 184 | return -ENODEV; \ | 607 | return -ENODEV; \ |
| @@ -186,50 +609,295 @@ static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \ | |||
| 186 | blkcg = cgroup_to_blkio_cgroup(cgroup); \ | 609 | blkcg = cgroup_to_blkio_cgroup(cgroup); \ |
| 187 | rcu_read_lock(); \ | 610 | rcu_read_lock(); \ |
| 188 | hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\ | 611 | hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\ |
| 189 | if (blkg->dev) \ | 612 | if (blkg->dev) { \ |
| 190 | seq_printf(m, "%u:%u %lu\n", MAJOR(blkg->dev), \ | 613 | spin_lock_irq(&blkg->stats_lock); \ |
| 191 | MINOR(blkg->dev), blkg->__VAR); \ | 614 | cgroup_total += blkio_get_stat(blkg, cb, \ |
| 615 | blkg->dev, type); \ | ||
| 616 | spin_unlock_irq(&blkg->stats_lock); \ | ||
| 617 | } \ | ||
| 192 | } \ | 618 | } \ |
| 619 | if (show_total) \ | ||
| 620 | cb->fill(cb, "Total", cgroup_total); \ | ||
| 193 | rcu_read_unlock(); \ | 621 | rcu_read_unlock(); \ |
| 194 | cgroup_unlock(); \ | 622 | cgroup_unlock(); \ |
| 195 | return 0; \ | 623 | return 0; \ |
| 196 | } | 624 | } |
| 197 | 625 | ||
| 198 | SHOW_FUNCTION_PER_GROUP(time); | 626 | SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0); |
| 199 | SHOW_FUNCTION_PER_GROUP(sectors); | 627 | SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0); |
| 628 | SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1); | ||
| 629 | SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1); | ||
| 630 | SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1); | ||
| 631 | SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1); | ||
| 632 | SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1); | ||
| 633 | SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1); | ||
| 200 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 634 | #ifdef CONFIG_DEBUG_BLK_CGROUP |
| 201 | SHOW_FUNCTION_PER_GROUP(dequeue); | 635 | SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0); |
| 636 | SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0); | ||
| 637 | SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0); | ||
| 638 | SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0); | ||
| 639 | SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0); | ||
| 202 | #endif | 640 | #endif |
| 203 | #undef SHOW_FUNCTION_PER_GROUP | 641 | #undef SHOW_FUNCTION_PER_GROUP |
| 204 | 642 | ||
| 205 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 643 | static int blkio_check_dev_num(dev_t dev) |
| 206 | void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg, | ||
| 207 | unsigned long dequeue) | ||
| 208 | { | 644 | { |
| 209 | blkg->dequeue += dequeue; | 645 | int part = 0; |
| 646 | struct gendisk *disk; | ||
| 647 | |||
| 648 | disk = get_gendisk(dev, &part); | ||
| 649 | if (!disk || part) | ||
| 650 | return -ENODEV; | ||
| 651 | |||
| 652 | return 0; | ||
| 653 | } | ||
| 654 | |||
| 655 | static int blkio_policy_parse_and_set(char *buf, | ||
| 656 | struct blkio_policy_node *newpn) | ||
| 657 | { | ||
| 658 | char *s[4], *p, *major_s = NULL, *minor_s = NULL; | ||
| 659 | int ret; | ||
| 660 | unsigned long major, minor, temp; | ||
| 661 | int i = 0; | ||
| 662 | dev_t dev; | ||
| 663 | |||
| 664 | memset(s, 0, sizeof(s)); | ||
| 665 | |||
| 666 | while ((p = strsep(&buf, " ")) != NULL) { | ||
| 667 | if (!*p) | ||
| 668 | continue; | ||
| 669 | |||
| 670 | s[i++] = p; | ||
| 671 | |||
| 672 | /* Prevent from inputting too many things */ | ||
| 673 | if (i == 3) | ||
| 674 | break; | ||
| 675 | } | ||
| 676 | |||
| 677 | if (i != 2) | ||
| 678 | return -EINVAL; | ||
| 679 | |||
| 680 | p = strsep(&s[0], ":"); | ||
| 681 | if (p != NULL) | ||
| 682 | major_s = p; | ||
| 683 | else | ||
| 684 | return -EINVAL; | ||
| 685 | |||
| 686 | minor_s = s[0]; | ||
| 687 | if (!minor_s) | ||
| 688 | return -EINVAL; | ||
| 689 | |||
| 690 | ret = strict_strtoul(major_s, 10, &major); | ||
| 691 | if (ret) | ||
| 692 | return -EINVAL; | ||
| 693 | |||
| 694 | ret = strict_strtoul(minor_s, 10, &minor); | ||
| 695 | if (ret) | ||
| 696 | return -EINVAL; | ||
| 697 | |||
| 698 | dev = MKDEV(major, minor); | ||
| 699 | |||
| 700 | ret = blkio_check_dev_num(dev); | ||
| 701 | if (ret) | ||
| 702 | return ret; | ||
| 703 | |||
| 704 | newpn->dev = dev; | ||
| 705 | |||
| 706 | if (s[1] == NULL) | ||
| 707 | return -EINVAL; | ||
| 708 | |||
| 709 | ret = strict_strtoul(s[1], 10, &temp); | ||
| 710 | if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) || | ||
| 711 | temp > BLKIO_WEIGHT_MAX) | ||
| 712 | return -EINVAL; | ||
| 713 | |||
| 714 | newpn->weight = temp; | ||
| 715 | |||
| 716 | return 0; | ||
| 717 | } | ||
| 718 | |||
| 719 | unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, | ||
| 720 | dev_t dev) | ||
| 721 | { | ||
| 722 | struct blkio_policy_node *pn; | ||
| 723 | |||
| 724 | pn = blkio_policy_search_node(blkcg, dev); | ||
| 725 | if (pn) | ||
| 726 | return pn->weight; | ||
| 727 | else | ||
| 728 | return blkcg->weight; | ||
| 729 | } | ||
| 730 | EXPORT_SYMBOL_GPL(blkcg_get_weight); | ||
| 731 | |||
| 732 | |||
| 733 | static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, | ||
| 734 | const char *buffer) | ||
| 735 | { | ||
| 736 | int ret = 0; | ||
| 737 | char *buf; | ||
| 738 | struct blkio_policy_node *newpn, *pn; | ||
| 739 | struct blkio_cgroup *blkcg; | ||
| 740 | struct blkio_group *blkg; | ||
| 741 | int keep_newpn = 0; | ||
| 742 | struct hlist_node *n; | ||
| 743 | struct blkio_policy_type *blkiop; | ||
| 744 | |||
| 745 | buf = kstrdup(buffer, GFP_KERNEL); | ||
| 746 | if (!buf) | ||
| 747 | return -ENOMEM; | ||
| 748 | |||
| 749 | newpn = kzalloc(sizeof(*newpn), GFP_KERNEL); | ||
| 750 | if (!newpn) { | ||
| 751 | ret = -ENOMEM; | ||
| 752 | goto free_buf; | ||
| 753 | } | ||
| 754 | |||
| 755 | ret = blkio_policy_parse_and_set(buf, newpn); | ||
| 756 | if (ret) | ||
| 757 | goto free_newpn; | ||
| 758 | |||
| 759 | blkcg = cgroup_to_blkio_cgroup(cgrp); | ||
| 760 | |||
| 761 | spin_lock_irq(&blkcg->lock); | ||
| 762 | |||
| 763 | pn = blkio_policy_search_node(blkcg, newpn->dev); | ||
| 764 | if (!pn) { | ||
| 765 | if (newpn->weight != 0) { | ||
| 766 | blkio_policy_insert_node(blkcg, newpn); | ||
| 767 | keep_newpn = 1; | ||
| 768 | } | ||
| 769 | spin_unlock_irq(&blkcg->lock); | ||
| 770 | goto update_io_group; | ||
| 771 | } | ||
| 772 | |||
| 773 | if (newpn->weight == 0) { | ||
| 774 | /* weight == 0 means deleting a specific weight */ | ||
| 775 | blkio_policy_delete_node(pn); | ||
| 776 | spin_unlock_irq(&blkcg->lock); | ||
| 777 | goto update_io_group; | ||
| 778 | } | ||
| 779 | spin_unlock_irq(&blkcg->lock); | ||
| 780 | |||
| 781 | pn->weight = newpn->weight; | ||
| 782 | |||
| 783 | update_io_group: | ||
| 784 | /* update weight for each cfqg */ | ||
| 785 | spin_lock(&blkio_list_lock); | ||
| 786 | spin_lock_irq(&blkcg->lock); | ||
| 787 | |||
| 788 | hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { | ||
| 789 | if (newpn->dev == blkg->dev) { | ||
| 790 | list_for_each_entry(blkiop, &blkio_list, list) | ||
| 791 | blkiop->ops.blkio_update_group_weight_fn(blkg, | ||
| 792 | newpn->weight ? | ||
| 793 | newpn->weight : | ||
| 794 | blkcg->weight); | ||
| 795 | } | ||
| 796 | } | ||
| 797 | |||
| 798 | spin_unlock_irq(&blkcg->lock); | ||
| 799 | spin_unlock(&blkio_list_lock); | ||
| 800 | |||
| 801 | free_newpn: | ||
| 802 | if (!keep_newpn) | ||
| 803 | kfree(newpn); | ||
| 804 | free_buf: | ||
| 805 | kfree(buf); | ||
| 806 | return ret; | ||
| 807 | } | ||
| 808 | |||
| 809 | static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft, | ||
| 810 | struct seq_file *m) | ||
| 811 | { | ||
| 812 | struct blkio_cgroup *blkcg; | ||
| 813 | struct blkio_policy_node *pn; | ||
| 814 | |||
| 815 | seq_printf(m, "dev\tweight\n"); | ||
| 816 | |||
| 817 | blkcg = cgroup_to_blkio_cgroup(cgrp); | ||
| 818 | if (list_empty(&blkcg->policy_list)) | ||
| 819 | goto out; | ||
| 820 | |||
| 821 | spin_lock_irq(&blkcg->lock); | ||
| 822 | list_for_each_entry(pn, &blkcg->policy_list, node) { | ||
| 823 | seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), | ||
| 824 | MINOR(pn->dev), pn->weight); | ||
| 825 | } | ||
| 826 | spin_unlock_irq(&blkcg->lock); | ||
| 827 | |||
| 828 | out: | ||
| 829 | return 0; | ||
| 210 | } | 830 | } |
| 211 | EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_dequeue_stats); | ||
| 212 | #endif | ||
| 213 | 831 | ||
| 214 | struct cftype blkio_files[] = { | 832 | struct cftype blkio_files[] = { |
| 215 | { | 833 | { |
| 834 | .name = "weight_device", | ||
| 835 | .read_seq_string = blkiocg_weight_device_read, | ||
| 836 | .write_string = blkiocg_weight_device_write, | ||
| 837 | .max_write_len = 256, | ||
| 838 | }, | ||
| 839 | { | ||
| 216 | .name = "weight", | 840 | .name = "weight", |
| 217 | .read_u64 = blkiocg_weight_read, | 841 | .read_u64 = blkiocg_weight_read, |
| 218 | .write_u64 = blkiocg_weight_write, | 842 | .write_u64 = blkiocg_weight_write, |
| 219 | }, | 843 | }, |
| 220 | { | 844 | { |
| 221 | .name = "time", | 845 | .name = "time", |
| 222 | .read_seq_string = blkiocg_time_read, | 846 | .read_map = blkiocg_time_read, |
| 223 | }, | 847 | }, |
| 224 | { | 848 | { |
| 225 | .name = "sectors", | 849 | .name = "sectors", |
| 226 | .read_seq_string = blkiocg_sectors_read, | 850 | .read_map = blkiocg_sectors_read, |
| 851 | }, | ||
| 852 | { | ||
| 853 | .name = "io_service_bytes", | ||
| 854 | .read_map = blkiocg_io_service_bytes_read, | ||
| 855 | }, | ||
| 856 | { | ||
| 857 | .name = "io_serviced", | ||
| 858 | .read_map = blkiocg_io_serviced_read, | ||
| 859 | }, | ||
| 860 | { | ||
| 861 | .name = "io_service_time", | ||
| 862 | .read_map = blkiocg_io_service_time_read, | ||
| 863 | }, | ||
| 864 | { | ||
| 865 | .name = "io_wait_time", | ||
| 866 | .read_map = blkiocg_io_wait_time_read, | ||
| 867 | }, | ||
| 868 | { | ||
| 869 | .name = "io_merged", | ||
| 870 | .read_map = blkiocg_io_merged_read, | ||
| 871 | }, | ||
| 872 | { | ||
| 873 | .name = "io_queued", | ||
| 874 | .read_map = blkiocg_io_queued_read, | ||
| 875 | }, | ||
| 876 | { | ||
| 877 | .name = "reset_stats", | ||
| 878 | .write_u64 = blkiocg_reset_stats, | ||
| 227 | }, | 879 | }, |
| 228 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 880 | #ifdef CONFIG_DEBUG_BLK_CGROUP |
| 229 | { | 881 | { |
| 882 | .name = "avg_queue_size", | ||
| 883 | .read_map = blkiocg_avg_queue_size_read, | ||
| 884 | }, | ||
| 885 | { | ||
| 886 | .name = "group_wait_time", | ||
| 887 | .read_map = blkiocg_group_wait_time_read, | ||
| 888 | }, | ||
| 889 | { | ||
| 890 | .name = "idle_time", | ||
| 891 | .read_map = blkiocg_idle_time_read, | ||
| 892 | }, | ||
| 893 | { | ||
| 894 | .name = "empty_time", | ||
| 895 | .read_map = blkiocg_empty_time_read, | ||
| 896 | }, | ||
| 897 | { | ||
| 230 | .name = "dequeue", | 898 | .name = "dequeue", |
| 231 | .read_seq_string = blkiocg_dequeue_read, | 899 | .read_map = blkiocg_dequeue_read, |
| 232 | }, | 900 | }, |
| 233 | #endif | 901 | #endif |
| 234 | }; | 902 | }; |
| 235 | 903 | ||
| @@ -246,6 +914,7 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) | |||
| 246 | struct blkio_group *blkg; | 914 | struct blkio_group *blkg; |
| 247 | void *key; | 915 | void *key; |
| 248 | struct blkio_policy_type *blkiop; | 916 | struct blkio_policy_type *blkiop; |
| 917 | struct blkio_policy_node *pn, *pntmp; | ||
| 249 | 918 | ||
| 250 | rcu_read_lock(); | 919 | rcu_read_lock(); |
| 251 | remove_entry: | 920 | remove_entry: |
| @@ -276,7 +945,12 @@ remove_entry: | |||
| 276 | blkiop->ops.blkio_unlink_group_fn(key, blkg); | 945 | blkiop->ops.blkio_unlink_group_fn(key, blkg); |
| 277 | spin_unlock(&blkio_list_lock); | 946 | spin_unlock(&blkio_list_lock); |
| 278 | goto remove_entry; | 947 | goto remove_entry; |
| 948 | |||
| 279 | done: | 949 | done: |
| 950 | list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) { | ||
| 951 | blkio_policy_delete_node(pn); | ||
| 952 | kfree(pn); | ||
| 953 | } | ||
| 280 | free_css_id(&blkio_subsys, &blkcg->css); | 954 | free_css_id(&blkio_subsys, &blkcg->css); |
| 281 | rcu_read_unlock(); | 955 | rcu_read_unlock(); |
| 282 | if (blkcg != &blkio_root_cgroup) | 956 | if (blkcg != &blkio_root_cgroup) |
| @@ -307,6 +981,7 @@ done: | |||
| 307 | spin_lock_init(&blkcg->lock); | 981 | spin_lock_init(&blkcg->lock); |
| 308 | INIT_HLIST_HEAD(&blkcg->blkg_list); | 982 | INIT_HLIST_HEAD(&blkcg->blkg_list); |
| 309 | 983 | ||
| 984 | INIT_LIST_HEAD(&blkcg->policy_list); | ||
| 310 | return &blkcg->css; | 985 | return &blkcg->css; |
| 311 | } | 986 | } |
| 312 | 987 | ||
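The per-device rules collected on blkcg->policy_list are consumed through blkcg_get_weight(), which the CFQ changes further down call when a group is created. The body of that helper is not part of this hunk; the following is only a plausible sketch of how it would resolve a rule, based on the blkio_policy_node and blkio_cgroup layouts declared in blk-cgroup.h below, not the committed implementation.

/* Sketch (assumed, not the committed body): look for a per-device rule,
 * fall back to the cgroup-wide weight configured via blkio.weight. */
unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, dev_t dev)
{
	struct blkio_policy_node *pn;
	unsigned int weight;

	spin_lock_irq(&blkcg->lock);
	weight = blkcg->weight;			/* default from blkio.weight */
	list_for_each_entry(pn, &blkcg->policy_list, node) {
		if (pn->dev == dev) {		/* per-device rule wins */
			weight = pn->weight;
			break;
		}
	}
	spin_unlock_irq(&blkcg->lock);
	return weight;
}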
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 8ccc20464dae..2b866ec1dcea 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h | |||
| @@ -23,11 +23,84 @@ extern struct cgroup_subsys blkio_subsys; | |||
| 23 | #define blkio_subsys_id blkio_subsys.subsys_id | 23 | #define blkio_subsys_id blkio_subsys.subsys_id |
| 24 | #endif | 24 | #endif |
| 25 | 25 | ||
| 26 | enum stat_type { | ||
| 27 | /* Total time spent (in ns) between request dispatch to the driver and | ||
| 28 | * request completion for IOs doen by this cgroup. This may not be | ||
| 29 | * accurate when NCQ is turned on. */ | ||
| 30 | BLKIO_STAT_SERVICE_TIME = 0, | ||
| 31 | /* Total bytes transferred */ | ||
| 32 | BLKIO_STAT_SERVICE_BYTES, | ||
| 33 | /* Total IOs serviced, post merge */ | ||
| 34 | BLKIO_STAT_SERVICED, | ||
| 35 | /* Total time spent waiting in scheduler queue in ns */ | ||
| 36 | BLKIO_STAT_WAIT_TIME, | ||
| 37 | /* Number of IOs merged */ | ||
| 38 | BLKIO_STAT_MERGED, | ||
| 39 | /* Number of IOs queued up */ | ||
| 40 | BLKIO_STAT_QUEUED, | ||
| 41 | /* All the single valued stats go below this */ | ||
| 42 | BLKIO_STAT_TIME, | ||
| 43 | BLKIO_STAT_SECTORS, | ||
| 44 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
| 45 | BLKIO_STAT_AVG_QUEUE_SIZE, | ||
| 46 | BLKIO_STAT_IDLE_TIME, | ||
| 47 | BLKIO_STAT_EMPTY_TIME, | ||
| 48 | BLKIO_STAT_GROUP_WAIT_TIME, | ||
| 49 | BLKIO_STAT_DEQUEUE | ||
| 50 | #endif | ||
| 51 | }; | ||
| 52 | |||
| 53 | enum stat_sub_type { | ||
| 54 | BLKIO_STAT_READ = 0, | ||
| 55 | BLKIO_STAT_WRITE, | ||
| 56 | BLKIO_STAT_SYNC, | ||
| 57 | BLKIO_STAT_ASYNC, | ||
| 58 | BLKIO_STAT_TOTAL | ||
| 59 | }; | ||
| 60 | |||
| 61 | /* blkg state flags */ | ||
| 62 | enum blkg_state_flags { | ||
| 63 | BLKG_waiting = 0, | ||
| 64 | BLKG_idling, | ||
| 65 | BLKG_empty, | ||
| 66 | }; | ||
| 67 | |||
| 26 | struct blkio_cgroup { | 68 | struct blkio_cgroup { |
| 27 | struct cgroup_subsys_state css; | 69 | struct cgroup_subsys_state css; |
| 28 | unsigned int weight; | 70 | unsigned int weight; |
| 29 | spinlock_t lock; | 71 | spinlock_t lock; |
| 30 | struct hlist_head blkg_list; | 72 | struct hlist_head blkg_list; |
| 73 | struct list_head policy_list; /* list of blkio_policy_node */ | ||
| 74 | }; | ||
| 75 | |||
| 76 | struct blkio_group_stats { | ||
| 77 | /* total disk time and nr sectors dispatched by this group */ | ||
| 78 | uint64_t time; | ||
| 79 | uint64_t sectors; | ||
| 80 | uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL]; | ||
| 81 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
| 82 | /* Sum of number of IOs queued across all samples */ | ||
| 83 | uint64_t avg_queue_size_sum; | ||
| 84 | /* Count of samples taken for average */ | ||
| 85 | uint64_t avg_queue_size_samples; | ||
| 86 | /* How many times this group has been removed from service tree */ | ||
| 87 | unsigned long dequeue; | ||
| 88 | |||
| 89 | /* Total time spent waiting for it to be assigned a timeslice. */ | ||
| 90 | uint64_t group_wait_time; | ||
| 91 | uint64_t start_group_wait_time; | ||
| 92 | |||
| 93 | /* Time spent idling for this blkio_group */ | ||
| 94 | uint64_t idle_time; | ||
| 95 | uint64_t start_idle_time; | ||
| 96 | /* | ||
| 97 | * Total time when we have requests queued and do not contain the | ||
| 98 | * current active queue. | ||
| 99 | */ | ||
| 100 | uint64_t empty_time; | ||
| 101 | uint64_t start_empty_time; | ||
| 102 | uint16_t flags; | ||
| 103 | #endif | ||
| 31 | }; | 104 | }; |
| 32 | 105 | ||
| 33 | struct blkio_group { | 106 | struct blkio_group { |
| @@ -35,20 +108,25 @@ struct blkio_group { | |||
| 35 | void *key; | 108 | void *key; |
| 36 | struct hlist_node blkcg_node; | 109 | struct hlist_node blkcg_node; |
| 37 | unsigned short blkcg_id; | 110 | unsigned short blkcg_id; |
| 38 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
| 39 | /* Store cgroup path */ | 111 | /* Store cgroup path */ |
| 40 | char path[128]; | 112 | char path[128]; |
| 41 | /* How many times this group has been removed from service tree */ | ||
| 42 | unsigned long dequeue; | ||
| 43 | #endif | ||
| 44 | /* The device MKDEV(major, minor), this group has been created for */ | 113 | /* The device MKDEV(major, minor), this group has been created for */ |
| 45 | dev_t dev; | 114 | dev_t dev; |
| 46 | 115 | ||
| 47 | /* total disk time and nr sectors dispatched by this group */ | 116 | /* Need to serialize the stats in the case of reset/update */ |
| 48 | unsigned long time; | 117 | spinlock_t stats_lock; |
| 49 | unsigned long sectors; | 118 | struct blkio_group_stats stats; |
| 50 | }; | 119 | }; |
| 51 | 120 | ||
| 121 | struct blkio_policy_node { | ||
| 122 | struct list_head node; | ||
| 123 | dev_t dev; | ||
| 124 | unsigned int weight; | ||
| 125 | }; | ||
| 126 | |||
| 127 | extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, | ||
| 128 | dev_t dev); | ||
| 129 | |||
| 52 | typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); | 130 | typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); |
| 53 | typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg, | 131 | typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg, |
| 54 | unsigned int weight); | 132 | unsigned int weight); |
| @@ -67,6 +145,11 @@ struct blkio_policy_type { | |||
| 67 | extern void blkio_policy_register(struct blkio_policy_type *); | 145 | extern void blkio_policy_register(struct blkio_policy_type *); |
| 68 | extern void blkio_policy_unregister(struct blkio_policy_type *); | 146 | extern void blkio_policy_unregister(struct blkio_policy_type *); |
| 69 | 147 | ||
| 148 | static inline char *blkg_path(struct blkio_group *blkg) | ||
| 149 | { | ||
| 150 | return blkg->path; | ||
| 151 | } | ||
| 152 | |||
| 70 | #else | 153 | #else |
| 71 | 154 | ||
| 72 | struct blkio_group { | 155 | struct blkio_group { |
| @@ -78,6 +161,8 @@ struct blkio_policy_type { | |||
| 78 | static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { } | 161 | static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { } |
| 79 | static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { } | 162 | static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { } |
| 80 | 163 | ||
| 164 | static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } | ||
| 165 | |||
| 81 | #endif | 166 | #endif |
| 82 | 167 | ||
| 83 | #define BLKIO_WEIGHT_MIN 100 | 168 | #define BLKIO_WEIGHT_MIN 100 |
| @@ -85,16 +170,42 @@ static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { } | |||
| 85 | #define BLKIO_WEIGHT_DEFAULT 500 | 170 | #define BLKIO_WEIGHT_DEFAULT 500 |
| 86 | 171 | ||
| 87 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 172 | #ifdef CONFIG_DEBUG_BLK_CGROUP |
| 88 | static inline char *blkg_path(struct blkio_group *blkg) | 173 | void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg); |
| 89 | { | 174 | void blkiocg_update_dequeue_stats(struct blkio_group *blkg, |
| 90 | return blkg->path; | ||
| 91 | } | ||
| 92 | void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg, | ||
| 93 | unsigned long dequeue); | 175 | unsigned long dequeue); |
| 176 | void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg); | ||
| 177 | void blkiocg_update_idle_time_stats(struct blkio_group *blkg); | ||
| 178 | void blkiocg_set_start_empty_time(struct blkio_group *blkg); | ||
| 179 | |||
| 180 | #define BLKG_FLAG_FNS(name) \ | ||
| 181 | static inline void blkio_mark_blkg_##name( \ | ||
| 182 | struct blkio_group_stats *stats) \ | ||
| 183 | { \ | ||
| 184 | stats->flags |= (1 << BLKG_##name); \ | ||
| 185 | } \ | ||
| 186 | static inline void blkio_clear_blkg_##name( \ | ||
| 187 | struct blkio_group_stats *stats) \ | ||
| 188 | { \ | ||
| 189 | stats->flags &= ~(1 << BLKG_##name); \ | ||
| 190 | } \ | ||
| 191 | static inline int blkio_blkg_##name(struct blkio_group_stats *stats) \ | ||
| 192 | { \ | ||
| 193 | return (stats->flags & (1 << BLKG_##name)) != 0; \ | ||
| 194 | } \ | ||
| 195 | |||
| 196 | BLKG_FLAG_FNS(waiting) | ||
| 197 | BLKG_FLAG_FNS(idling) | ||
| 198 | BLKG_FLAG_FNS(empty) | ||
| 199 | #undef BLKG_FLAG_FNS | ||
| 94 | #else | 200 | #else |
| 95 | static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } | 201 | static inline void blkiocg_update_avg_queue_size_stats( |
| 96 | static inline void blkiocg_update_blkio_group_dequeue_stats( | 202 | struct blkio_group *blkg) {} |
| 97 | struct blkio_group *blkg, unsigned long dequeue) {} | 203 | static inline void blkiocg_update_dequeue_stats(struct blkio_group *blkg, |
| 204 | unsigned long dequeue) {} | ||
| 205 | static inline void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) | ||
| 206 | {} | ||
| 207 | static inline void blkiocg_update_idle_time_stats(struct blkio_group *blkg) {} | ||
| 208 | static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {} | ||
| 98 | #endif | 209 | #endif |
| 99 | 210 | ||
| 100 | #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) | 211 | #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) |
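For readability, here is what BLKG_FLAG_FNS(waiting) above expands to (the idling and empty flags get the same trio of helpers):

static inline void blkio_mark_blkg_waiting(struct blkio_group_stats *stats)
{
	stats->flags |= (1 << BLKG_waiting);
}
static inline void blkio_clear_blkg_waiting(struct blkio_group_stats *stats)
{
	stats->flags &= ~(1 << BLKG_waiting);
}
static inline int blkio_blkg_waiting(struct blkio_group_stats *stats)
{
	return (stats->flags & (1 << BLKG_waiting)) != 0;
}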
| @@ -105,26 +216,43 @@ extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, | |||
| 105 | extern int blkiocg_del_blkio_group(struct blkio_group *blkg); | 216 | extern int blkiocg_del_blkio_group(struct blkio_group *blkg); |
| 106 | extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, | 217 | extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, |
| 107 | void *key); | 218 | void *key); |
| 108 | void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, | 219 | void blkiocg_update_timeslice_used(struct blkio_group *blkg, |
| 109 | unsigned long time, unsigned long sectors); | 220 | unsigned long time); |
| 221 | void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes, | ||
| 222 | bool direction, bool sync); | ||
| 223 | void blkiocg_update_completion_stats(struct blkio_group *blkg, | ||
| 224 | uint64_t start_time, uint64_t io_start_time, bool direction, bool sync); | ||
| 225 | void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, | ||
| 226 | bool sync); | ||
| 227 | void blkiocg_update_io_add_stats(struct blkio_group *blkg, | ||
| 228 | struct blkio_group *curr_blkg, bool direction, bool sync); | ||
| 229 | void blkiocg_update_io_remove_stats(struct blkio_group *blkg, | ||
| 230 | bool direction, bool sync); | ||
| 110 | #else | 231 | #else |
| 111 | struct cgroup; | 232 | struct cgroup; |
| 112 | static inline struct blkio_cgroup * | 233 | static inline struct blkio_cgroup * |
| 113 | cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } | 234 | cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } |
| 114 | 235 | ||
| 115 | static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, | 236 | static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, |
| 116 | struct blkio_group *blkg, void *key, dev_t dev) | 237 | struct blkio_group *blkg, void *key, dev_t dev) {} |
| 117 | { | ||
| 118 | } | ||
| 119 | 238 | ||
| 120 | static inline int | 239 | static inline int |
| 121 | blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } | 240 | blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } |
| 122 | 241 | ||
| 123 | static inline struct blkio_group * | 242 | static inline struct blkio_group * |
| 124 | blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } | 243 | blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } |
| 125 | static inline void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, | 244 | static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg, |
| 126 | unsigned long time, unsigned long sectors) | 245 | unsigned long time) {} |
| 127 | { | 246 | static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg, |
| 128 | } | 247 | uint64_t bytes, bool direction, bool sync) {} |
| 248 | static inline void blkiocg_update_completion_stats(struct blkio_group *blkg, | ||
| 249 | uint64_t start_time, uint64_t io_start_time, bool direction, | ||
| 250 | bool sync) {} | ||
| 251 | static inline void blkiocg_update_io_merged_stats(struct blkio_group *blkg, | ||
| 252 | bool direction, bool sync) {} | ||
| 253 | static inline void blkiocg_update_io_add_stats(struct blkio_group *blkg, | ||
| 254 | struct blkio_group *curr_blkg, bool direction, bool sync) {} | ||
| 255 | static inline void blkiocg_update_io_remove_stats(struct blkio_group *blkg, | ||
| 256 | bool direction, bool sync) {} | ||
| 129 | #endif | 257 | #endif |
| 130 | #endif /* _BLK_CGROUP_H */ | 258 | #endif /* _BLK_CGROUP_H */ |
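The single blkiocg_update_blkio_group_stats() export is thus replaced by one hook per event in a request's life. Below is a condensed sketch of how a scheduler is expected to call them; it simply mirrors the CFQ call sites added in the cfq-iosched.c hunks later in this patch, and the function name and argument plumbing here are illustrative only.

/* Illustrative only: the shape of the per-cgroup accounting calls around
 * one request. 'blkg' is the group owning the request, 'curr_blkg' the
 * group currently being served. */
static void example_account_request(struct blkio_group *blkg,
				    struct blkio_group *curr_blkg,
				    struct request *rq)
{
	bool dir = rq_data_dir(rq), sync = rq_is_sync(rq);

	/* when the request is added to the scheduler queue */
	blkiocg_update_io_add_stats(blkg, curr_blkg, dir, sync);

	/* when it is dispatched to the driver */
	blkiocg_update_io_remove_stats(blkg, dir, sync);
	blkiocg_update_dispatch_stats(blkg, blk_rq_bytes(rq), dir, sync);

	/* when it completes; the two timestamps are set in blk-core.c */
	blkiocg_update_completion_stats(blkg, rq_start_time_ns(rq),
					rq_io_start_time_ns(rq), dir, sync);
}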
diff --git a/block/blk-core.c b/block/blk-core.c index 9fe174dc74d1..e9a5ae25db8c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c | |||
| @@ -127,6 +127,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) | |||
| 127 | rq->tag = -1; | 127 | rq->tag = -1; |
| 128 | rq->ref_count = 1; | 128 | rq->ref_count = 1; |
| 129 | rq->start_time = jiffies; | 129 | rq->start_time = jiffies; |
| 130 | set_start_time_ns(rq); | ||
| 130 | } | 131 | } |
| 131 | EXPORT_SYMBOL(blk_rq_init); | 132 | EXPORT_SYMBOL(blk_rq_init); |
| 132 | 133 | ||
| @@ -450,6 +451,7 @@ void blk_cleanup_queue(struct request_queue *q) | |||
| 450 | */ | 451 | */ |
| 451 | blk_sync_queue(q); | 452 | blk_sync_queue(q); |
| 452 | 453 | ||
| 454 | del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); | ||
| 453 | mutex_lock(&q->sysfs_lock); | 455 | mutex_lock(&q->sysfs_lock); |
| 454 | queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q); | 456 | queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q); |
| 455 | mutex_unlock(&q->sysfs_lock); | 457 | mutex_unlock(&q->sysfs_lock); |
| @@ -510,6 +512,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) | |||
| 510 | return NULL; | 512 | return NULL; |
| 511 | } | 513 | } |
| 512 | 514 | ||
| 515 | setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, | ||
| 516 | laptop_mode_timer_fn, (unsigned long) q); | ||
| 513 | init_timer(&q->unplug_timer); | 517 | init_timer(&q->unplug_timer); |
| 514 | setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); | 518 | setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); |
| 515 | INIT_LIST_HEAD(&q->timeout_list); | 519 | INIT_LIST_HEAD(&q->timeout_list); |
| @@ -1198,6 +1202,7 @@ static int __make_request(struct request_queue *q, struct bio *bio) | |||
| 1198 | if (!blk_rq_cpu_valid(req)) | 1202 | if (!blk_rq_cpu_valid(req)) |
| 1199 | req->cpu = bio->bi_comp_cpu; | 1203 | req->cpu = bio->bi_comp_cpu; |
| 1200 | drive_stat_acct(req, 0); | 1204 | drive_stat_acct(req, 0); |
| 1205 | elv_bio_merged(q, req, bio); | ||
| 1201 | if (!attempt_back_merge(q, req)) | 1206 | if (!attempt_back_merge(q, req)) |
| 1202 | elv_merged_request(q, req, el_ret); | 1207 | elv_merged_request(q, req, el_ret); |
| 1203 | goto out; | 1208 | goto out; |
| @@ -1231,6 +1236,7 @@ static int __make_request(struct request_queue *q, struct bio *bio) | |||
| 1231 | if (!blk_rq_cpu_valid(req)) | 1236 | if (!blk_rq_cpu_valid(req)) |
| 1232 | req->cpu = bio->bi_comp_cpu; | 1237 | req->cpu = bio->bi_comp_cpu; |
| 1233 | drive_stat_acct(req, 0); | 1238 | drive_stat_acct(req, 0); |
| 1239 | elv_bio_merged(q, req, bio); | ||
| 1234 | if (!attempt_front_merge(q, req)) | 1240 | if (!attempt_front_merge(q, req)) |
| 1235 | elv_merged_request(q, req, el_ret); | 1241 | elv_merged_request(q, req, el_ret); |
| 1236 | goto out; | 1242 | goto out; |
| @@ -1855,8 +1861,10 @@ void blk_dequeue_request(struct request *rq) | |||
| 1855 | * and to it is freed is accounted as io that is in progress at | 1861 | * and to it is freed is accounted as io that is in progress at |
| 1856 | * the driver side. | 1862 | * the driver side. |
| 1857 | */ | 1863 | */ |
| 1858 | if (blk_account_rq(rq)) | 1864 | if (blk_account_rq(rq)) { |
| 1859 | q->in_flight[rq_is_sync(rq)]++; | 1865 | q->in_flight[rq_is_sync(rq)]++; |
| 1866 | set_io_start_time_ns(rq); | ||
| 1867 | } | ||
| 1860 | } | 1868 | } |
| 1861 | 1869 | ||
| 1862 | /** | 1870 | /** |
| @@ -2098,7 +2106,7 @@ static void blk_finish_request(struct request *req, int error) | |||
| 2098 | BUG_ON(blk_queued_rq(req)); | 2106 | BUG_ON(blk_queued_rq(req)); |
| 2099 | 2107 | ||
| 2100 | if (unlikely(laptop_mode) && blk_fs_request(req)) | 2108 | if (unlikely(laptop_mode) && blk_fs_request(req)) |
| 2101 | laptop_io_completion(); | 2109 | laptop_io_completion(&req->q->backing_dev_info); |
| 2102 | 2110 | ||
| 2103 | blk_delete_timer(req); | 2111 | blk_delete_timer(req); |
| 2104 | 2112 | ||
| @@ -2517,4 +2525,3 @@ int __init blk_dev_init(void) | |||
| 2517 | 2525 | ||
| 2518 | return 0; | 2526 | return 0; |
| 2519 | } | 2527 | } |
| 2520 | |||
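blk-core.c now stamps two nanosecond timestamps per request: set_start_time_ns() when the request is initialised and set_io_start_time_ns() when it is handed to the driver. The completion hook above turns them into the io_wait_time and io_service_time statistics; the arithmetic, taken in isolation, is roughly the following sketch of the intent, not the committed code.

/* Sketch: derive the two durations fed into io_service_time and
 * io_wait_time from the timestamps stamped in blk-core.c. */
static void derive_completion_times(u64 now, u64 start_time_ns,
				    u64 io_start_time_ns,
				    u64 *service_time, u64 *wait_time)
{
	/* dispatch -> completion: time the driver/device held the request */
	*service_time = now > io_start_time_ns ? now - io_start_time_ns : 0;
	/* insert -> dispatch: time spent waiting in the scheduler queue */
	*wait_time = io_start_time_ns > start_time_ns ?
			io_start_time_ns - start_time_ns : 0;
}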
diff --git a/block/blk-lib.c b/block/blk-lib.c new file mode 100644 index 000000000000..d0216b9f22d4 --- /dev/null +++ b/block/blk-lib.c | |||
| @@ -0,0 +1,233 @@ | |||
| 1 | /* | ||
| 2 | * Functions related to generic helper functions | ||
| 3 | */ | ||
| 4 | #include <linux/kernel.h> | ||
| 5 | #include <linux/module.h> | ||
| 6 | #include <linux/bio.h> | ||
| 7 | #include <linux/blkdev.h> | ||
| 8 | #include <linux/scatterlist.h> | ||
| 9 | |||
| 10 | #include "blk.h" | ||
| 11 | |||
| 12 | static void blkdev_discard_end_io(struct bio *bio, int err) | ||
| 13 | { | ||
| 14 | if (err) { | ||
| 15 | if (err == -EOPNOTSUPP) | ||
| 16 | set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); | ||
| 17 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | ||
| 18 | } | ||
| 19 | |||
| 20 | if (bio->bi_private) | ||
| 21 | complete(bio->bi_private); | ||
| 22 | __free_page(bio_page(bio)); | ||
| 23 | |||
| 24 | bio_put(bio); | ||
| 25 | } | ||
| 26 | |||
| 27 | /** | ||
| 28 | * blkdev_issue_discard - queue a discard | ||
| 29 | * @bdev: blockdev to issue discard for | ||
| 30 | * @sector: start sector | ||
| 31 | * @nr_sects: number of sectors to discard | ||
| 32 | * @gfp_mask: memory allocation flags (for bio_alloc) | ||
| 33 | * @flags: BLKDEV_IFL_* flags to control behaviour | ||
| 34 | * | ||
| 35 | * Description: | ||
| 36 | * Issue a discard request for the sectors in question. | ||
| 37 | */ | ||
| 38 | int blkdev_issue_discard(struct block_device *bdev, sector_t sector, | ||
| 39 | sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) | ||
| 40 | { | ||
| 41 | DECLARE_COMPLETION_ONSTACK(wait); | ||
| 42 | struct request_queue *q = bdev_get_queue(bdev); | ||
| 43 | int type = flags & BLKDEV_IFL_BARRIER ? | ||
| 44 | DISCARD_BARRIER : DISCARD_NOBARRIER; | ||
| 45 | struct bio *bio; | ||
| 46 | struct page *page; | ||
| 47 | int ret = 0; | ||
| 48 | |||
| 49 | if (!q) | ||
| 50 | return -ENXIO; | ||
| 51 | |||
| 52 | if (!blk_queue_discard(q)) | ||
| 53 | return -EOPNOTSUPP; | ||
| 54 | |||
| 55 | while (nr_sects && !ret) { | ||
| 56 | unsigned int sector_size = q->limits.logical_block_size; | ||
| 57 | unsigned int max_discard_sectors = | ||
| 58 | min(q->limits.max_discard_sectors, UINT_MAX >> 9); | ||
| 59 | |||
| 60 | bio = bio_alloc(gfp_mask, 1); | ||
| 61 | if (!bio) | ||
| 62 | goto out; | ||
| 63 | bio->bi_sector = sector; | ||
| 64 | bio->bi_end_io = blkdev_discard_end_io; | ||
| 65 | bio->bi_bdev = bdev; | ||
| 66 | if (flags & BLKDEV_IFL_WAIT) | ||
| 67 | bio->bi_private = &wait; | ||
| 68 | |||
| 69 | /* | ||
| 70 | * Add a zeroed one-sector payload as that's what | ||
| 71 | * our current implementations need. If we ever need | ||
| 72 | * more, the interface will need revisiting. | ||
| 73 | */ | ||
| 74 | page = alloc_page(gfp_mask | __GFP_ZERO); | ||
| 75 | if (!page) | ||
| 76 | goto out_free_bio; | ||
| 77 | if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size) | ||
| 78 | goto out_free_page; | ||
| 79 | |||
| 80 | /* | ||
| 81 | * And override the bio size - the way discard works we | ||
| 82 | * touch many more blocks on disk than the actual payload | ||
| 83 | * length. | ||
| 84 | */ | ||
| 85 | if (nr_sects > max_discard_sectors) { | ||
| 86 | bio->bi_size = max_discard_sectors << 9; | ||
| 87 | nr_sects -= max_discard_sectors; | ||
| 88 | sector += max_discard_sectors; | ||
| 89 | } else { | ||
| 90 | bio->bi_size = nr_sects << 9; | ||
| 91 | nr_sects = 0; | ||
| 92 | } | ||
| 93 | |||
| 94 | bio_get(bio); | ||
| 95 | submit_bio(type, bio); | ||
| 96 | |||
| 97 | if (flags & BLKDEV_IFL_WAIT) | ||
| 98 | wait_for_completion(&wait); | ||
| 99 | |||
| 100 | if (bio_flagged(bio, BIO_EOPNOTSUPP)) | ||
| 101 | ret = -EOPNOTSUPP; | ||
| 102 | else if (!bio_flagged(bio, BIO_UPTODATE)) | ||
| 103 | ret = -EIO; | ||
| 104 | bio_put(bio); | ||
| 105 | } | ||
| 106 | return ret; | ||
| 107 | out_free_page: | ||
| 108 | __free_page(page); | ||
| 109 | out_free_bio: | ||
| 110 | bio_put(bio); | ||
| 111 | out: | ||
| 112 | return -ENOMEM; | ||
| 113 | } | ||
| 114 | EXPORT_SYMBOL(blkdev_issue_discard); | ||
| 115 | |||
| 116 | struct bio_batch | ||
| 117 | { | ||
| 118 | atomic_t done; | ||
| 119 | unsigned long flags; | ||
| 120 | struct completion *wait; | ||
| 121 | bio_end_io_t *end_io; | ||
| 122 | }; | ||
| 123 | |||
| 124 | static void bio_batch_end_io(struct bio *bio, int err) | ||
| 125 | { | ||
| 126 | struct bio_batch *bb = bio->bi_private; | ||
| 127 | |||
| 128 | if (err) { | ||
| 129 | if (err == -EOPNOTSUPP) | ||
| 130 | set_bit(BIO_EOPNOTSUPP, &bb->flags); | ||
| 131 | else | ||
| 132 | clear_bit(BIO_UPTODATE, &bb->flags); | ||
| 133 | } | ||
| 134 | if (bb) { | ||
| 135 | if (bb->end_io) | ||
| 136 | bb->end_io(bio, err); | ||
| 137 | atomic_inc(&bb->done); | ||
| 138 | complete(bb->wait); | ||
| 139 | } | ||
| 140 | bio_put(bio); | ||
| 141 | } | ||
| 142 | |||
| 143 | /** | ||
| 144 | * blkdev_issue_zeroout - generate a number of zero-filled write bios | ||
| 145 | * @bdev: blockdev to issue | ||
| 146 | * @sector: start sector | ||
| 147 | * @nr_sects: number of sectors to write | ||
| 148 | * @gfp_mask: memory allocation flags (for bio_alloc) | ||
| 149 | * @flags: BLKDEV_IFL_* flags to control behaviour | ||
| 150 | * | ||
| 151 | * Description: | ||
| 152 | * Generate and issue a number of bios with zero-filled pages. | ||
| 153 | * Send a barrier at the beginning and at the end if requested. This guarantees | ||
| 154 | * correct request ordering. An empty barrier allows us to avoid a post-queue flush. | ||
| 155 | */ | ||
| 156 | |||
| 157 | int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, | ||
| 158 | sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) | ||
| 159 | { | ||
| 160 | int ret = 0; | ||
| 161 | struct bio *bio; | ||
| 162 | struct bio_batch bb; | ||
| 163 | unsigned int sz, issued = 0; | ||
| 164 | DECLARE_COMPLETION_ONSTACK(wait); | ||
| 165 | |||
| 166 | atomic_set(&bb.done, 0); | ||
| 167 | bb.flags = 1 << BIO_UPTODATE; | ||
| 168 | bb.wait = &wait; | ||
| 169 | bb.end_io = NULL; | ||
| 170 | |||
| 171 | if (flags & BLKDEV_IFL_BARRIER) { | ||
| 172 | /* issue async barrier before the data */ | ||
| 173 | ret = blkdev_issue_flush(bdev, gfp_mask, NULL, 0); | ||
| 174 | if (ret) | ||
| 175 | return ret; | ||
| 176 | } | ||
| 177 | submit: | ||
| 178 | while (nr_sects != 0) { | ||
| 179 | bio = bio_alloc(gfp_mask, | ||
| 180 | min(nr_sects, (sector_t)BIO_MAX_PAGES)); | ||
| 181 | if (!bio) | ||
| 182 | break; | ||
| 183 | |||
| 184 | bio->bi_sector = sector; | ||
| 185 | bio->bi_bdev = bdev; | ||
| 186 | bio->bi_end_io = bio_batch_end_io; | ||
| 187 | if (flags & BLKDEV_IFL_WAIT) | ||
| 188 | bio->bi_private = &bb; | ||
| 189 | |||
| 190 | while (nr_sects != 0) { | ||
| 191 | sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects); | ||
| 192 | if (sz == 0) | ||
| 193 | /* bio has maximum size possible */ | ||
| 194 | break; | ||
| 195 | ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0); | ||
| 196 | nr_sects -= ret >> 9; | ||
| 197 | sector += ret >> 9; | ||
| 198 | if (ret < (sz << 9)) | ||
| 199 | break; | ||
| 200 | } | ||
| 201 | issued++; | ||
| 202 | submit_bio(WRITE, bio); | ||
| 203 | } | ||
| 204 | /* | ||
| 205 | * When all data bios are in flight, send the final barrier if requested. | ||
| 206 | */ | ||
| 207 | if (nr_sects == 0 && flags & BLKDEV_IFL_BARRIER) | ||
| 208 | ret = blkdev_issue_flush(bdev, gfp_mask, NULL, | ||
| 209 | flags & BLKDEV_IFL_WAIT); | ||
| 210 | |||
| 211 | |||
| 212 | if (flags & BLKDEV_IFL_WAIT) | ||
| 213 | /* Wait for bios in-flight */ | ||
| 214 | while ( issued != atomic_read(&bb.done)) | ||
| 215 | wait_for_completion(&wait); | ||
| 216 | |||
| 217 | if (!test_bit(BIO_UPTODATE, &bb.flags)) | ||
| 218 | /* One of the bios in the batch was completed with error. */ | ||
| 219 | ret = -EIO; | ||
| 220 | |||
| 221 | if (ret) | ||
| 222 | goto out; | ||
| 223 | |||
| 224 | if (test_bit(BIO_EOPNOTSUPP, &bb.flags)) { | ||
| 225 | ret = -EOPNOTSUPP; | ||
| 226 | goto out; | ||
| 227 | } | ||
| 228 | if (nr_sects != 0) | ||
| 229 | goto submit; | ||
| 230 | out: | ||
| 231 | return ret; | ||
| 232 | } | ||
| 233 | EXPORT_SYMBOL(blkdev_issue_zeroout); | ||
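Both helpers honour the new BLKDEV_IFL_* flags; a caller that needs synchronous behaviour passes BLKDEV_IFL_WAIT, exactly as the converted blk_ioctl_discard() and blkdev_fsync() hunks below do. A minimal caller-side sketch follows; the offsets, lengths and error policy are made up for illustration, and bdev reference handling is omitted.

/* Sketch: discard one range, then explicitly zero the next, waiting for
 * both to finish before returning. */
static int example_trim_then_zero(struct block_device *bdev)
{
	int ret;

	/* ask the device to discard 2048 sectors starting at sector 0 */
	ret = blkdev_issue_discard(bdev, 0, 2048, GFP_KERNEL,
				   BLKDEV_IFL_WAIT);
	if (ret && ret != -EOPNOTSUPP)
		return ret;

	/* write explicit zeroes over the following 2048 sectors */
	return blkdev_issue_zeroout(bdev, 2048, 2048, GFP_KERNEL,
				    BLKDEV_IFL_WAIT);
}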
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 838834be115b..0f3eb70f9ce1 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c | |||
| @@ -55,6 +55,7 @@ static const int cfq_hist_divisor = 4; | |||
| 55 | #define RQ_CIC(rq) \ | 55 | #define RQ_CIC(rq) \ |
| 56 | ((struct cfq_io_context *) (rq)->elevator_private) | 56 | ((struct cfq_io_context *) (rq)->elevator_private) |
| 57 | #define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) | 57 | #define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) |
| 58 | #define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private3) | ||
| 58 | 59 | ||
| 59 | static struct kmem_cache *cfq_pool; | 60 | static struct kmem_cache *cfq_pool; |
| 60 | static struct kmem_cache *cfq_ioc_pool; | 61 | static struct kmem_cache *cfq_ioc_pool; |
| @@ -143,8 +144,6 @@ struct cfq_queue { | |||
| 143 | struct cfq_queue *new_cfqq; | 144 | struct cfq_queue *new_cfqq; |
| 144 | struct cfq_group *cfqg; | 145 | struct cfq_group *cfqg; |
| 145 | struct cfq_group *orig_cfqg; | 146 | struct cfq_group *orig_cfqg; |
| 146 | /* Sectors dispatched in current dispatch round */ | ||
| 147 | unsigned long nr_sectors; | ||
| 148 | }; | 147 | }; |
| 149 | 148 | ||
| 150 | /* | 149 | /* |
| @@ -346,7 +345,7 @@ CFQ_CFQQ_FNS(deep); | |||
| 346 | CFQ_CFQQ_FNS(wait_busy); | 345 | CFQ_CFQQ_FNS(wait_busy); |
| 347 | #undef CFQ_CFQQ_FNS | 346 | #undef CFQ_CFQQ_FNS |
| 348 | 347 | ||
| 349 | #ifdef CONFIG_DEBUG_CFQ_IOSCHED | 348 | #ifdef CONFIG_CFQ_GROUP_IOSCHED |
| 350 | #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ | 349 | #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ |
| 351 | blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ | 350 | blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ |
| 352 | cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ | 351 | cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ |
| @@ -858,7 +857,7 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) | |||
| 858 | if (!RB_EMPTY_NODE(&cfqg->rb_node)) | 857 | if (!RB_EMPTY_NODE(&cfqg->rb_node)) |
| 859 | cfq_rb_erase(&cfqg->rb_node, st); | 858 | cfq_rb_erase(&cfqg->rb_node, st); |
| 860 | cfqg->saved_workload_slice = 0; | 859 | cfqg->saved_workload_slice = 0; |
| 861 | blkiocg_update_blkio_group_dequeue_stats(&cfqg->blkg, 1); | 860 | blkiocg_update_dequeue_stats(&cfqg->blkg, 1); |
| 862 | } | 861 | } |
| 863 | 862 | ||
| 864 | static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) | 863 | static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) |
| @@ -884,8 +883,7 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) | |||
| 884 | slice_used = cfqq->allocated_slice; | 883 | slice_used = cfqq->allocated_slice; |
| 885 | } | 884 | } |
| 886 | 885 | ||
| 887 | cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u sect=%lu", slice_used, | 886 | cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u", slice_used); |
| 888 | cfqq->nr_sectors); | ||
| 889 | return slice_used; | 887 | return slice_used; |
| 890 | } | 888 | } |
| 891 | 889 | ||
| @@ -919,8 +917,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, | |||
| 919 | 917 | ||
| 920 | cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, | 918 | cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, |
| 921 | st->min_vdisktime); | 919 | st->min_vdisktime); |
| 922 | blkiocg_update_blkio_group_stats(&cfqg->blkg, used_sl, | 920 | blkiocg_update_timeslice_used(&cfqg->blkg, used_sl); |
| 923 | cfqq->nr_sectors); | 921 | blkiocg_set_start_empty_time(&cfqg->blkg); |
| 924 | } | 922 | } |
| 925 | 923 | ||
| 926 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | 924 | #ifdef CONFIG_CFQ_GROUP_IOSCHED |
| @@ -961,7 +959,6 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) | |||
| 961 | if (!cfqg) | 959 | if (!cfqg) |
| 962 | goto done; | 960 | goto done; |
| 963 | 961 | ||
| 964 | cfqg->weight = blkcg->weight; | ||
| 965 | for_each_cfqg_st(cfqg, i, j, st) | 962 | for_each_cfqg_st(cfqg, i, j, st) |
| 966 | *st = CFQ_RB_ROOT; | 963 | *st = CFQ_RB_ROOT; |
| 967 | RB_CLEAR_NODE(&cfqg->rb_node); | 964 | RB_CLEAR_NODE(&cfqg->rb_node); |
| @@ -978,6 +975,7 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) | |||
| 978 | sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); | 975 | sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); |
| 979 | blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, | 976 | blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, |
| 980 | MKDEV(major, minor)); | 977 | MKDEV(major, minor)); |
| 978 | cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); | ||
| 981 | 979 | ||
| 982 | /* Add group on cfqd list */ | 980 | /* Add group on cfqd list */ |
| 983 | hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); | 981 | hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); |
| @@ -1004,6 +1002,12 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) | |||
| 1004 | return cfqg; | 1002 | return cfqg; |
| 1005 | } | 1003 | } |
| 1006 | 1004 | ||
| 1005 | static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) | ||
| 1006 | { | ||
| 1007 | atomic_inc(&cfqg->ref); | ||
| 1008 | return cfqg; | ||
| 1009 | } | ||
| 1010 | |||
| 1007 | static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) | 1011 | static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) |
| 1008 | { | 1012 | { |
| 1009 | /* Currently, all async queues are mapped to root group */ | 1013 | /* Currently, all async queues are mapped to root group */ |
| @@ -1087,6 +1091,12 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) | |||
| 1087 | { | 1091 | { |
| 1088 | return &cfqd->root_group; | 1092 | return &cfqd->root_group; |
| 1089 | } | 1093 | } |
| 1094 | |||
| 1095 | static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) | ||
| 1096 | { | ||
| 1097 | return cfqg; | ||
| 1098 | } | ||
| 1099 | |||
| 1090 | static inline void | 1100 | static inline void |
| 1091 | cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { | 1101 | cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { |
| 1092 | cfqq->cfqg = cfqg; | 1102 | cfqq->cfqg = cfqg; |
| @@ -1389,7 +1399,12 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) | |||
| 1389 | { | 1399 | { |
| 1390 | elv_rb_del(&cfqq->sort_list, rq); | 1400 | elv_rb_del(&cfqq->sort_list, rq); |
| 1391 | cfqq->queued[rq_is_sync(rq)]--; | 1401 | cfqq->queued[rq_is_sync(rq)]--; |
| 1402 | blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(rq), | ||
| 1403 | rq_is_sync(rq)); | ||
| 1392 | cfq_add_rq_rb(rq); | 1404 | cfq_add_rq_rb(rq); |
| 1405 | blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg, | ||
| 1406 | &cfqq->cfqd->serving_group->blkg, rq_data_dir(rq), | ||
| 1407 | rq_is_sync(rq)); | ||
| 1393 | } | 1408 | } |
| 1394 | 1409 | ||
| 1395 | static struct request * | 1410 | static struct request * |
| @@ -1445,6 +1460,8 @@ static void cfq_remove_request(struct request *rq) | |||
| 1445 | cfq_del_rq_rb(rq); | 1460 | cfq_del_rq_rb(rq); |
| 1446 | 1461 | ||
| 1447 | cfqq->cfqd->rq_queued--; | 1462 | cfqq->cfqd->rq_queued--; |
| 1463 | blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(rq), | ||
| 1464 | rq_is_sync(rq)); | ||
| 1448 | if (rq_is_meta(rq)) { | 1465 | if (rq_is_meta(rq)) { |
| 1449 | WARN_ON(!cfqq->meta_pending); | 1466 | WARN_ON(!cfqq->meta_pending); |
| 1450 | cfqq->meta_pending--; | 1467 | cfqq->meta_pending--; |
| @@ -1476,6 +1493,13 @@ static void cfq_merged_request(struct request_queue *q, struct request *req, | |||
| 1476 | } | 1493 | } |
| 1477 | } | 1494 | } |
| 1478 | 1495 | ||
| 1496 | static void cfq_bio_merged(struct request_queue *q, struct request *req, | ||
| 1497 | struct bio *bio) | ||
| 1498 | { | ||
| 1499 | blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg, bio_data_dir(bio), | ||
| 1500 | cfq_bio_sync(bio)); | ||
| 1501 | } | ||
| 1502 | |||
| 1479 | static void | 1503 | static void |
| 1480 | cfq_merged_requests(struct request_queue *q, struct request *rq, | 1504 | cfq_merged_requests(struct request_queue *q, struct request *rq, |
| 1481 | struct request *next) | 1505 | struct request *next) |
| @@ -1493,6 +1517,8 @@ cfq_merged_requests(struct request_queue *q, struct request *rq, | |||
| 1493 | if (cfqq->next_rq == next) | 1517 | if (cfqq->next_rq == next) |
| 1494 | cfqq->next_rq = rq; | 1518 | cfqq->next_rq = rq; |
| 1495 | cfq_remove_request(next); | 1519 | cfq_remove_request(next); |
| 1520 | blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(next), | ||
| 1521 | rq_is_sync(next)); | ||
| 1496 | } | 1522 | } |
| 1497 | 1523 | ||
| 1498 | static int cfq_allow_merge(struct request_queue *q, struct request *rq, | 1524 | static int cfq_allow_merge(struct request_queue *q, struct request *rq, |
| @@ -1520,18 +1546,24 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq, | |||
| 1520 | return cfqq == RQ_CFQQ(rq); | 1546 | return cfqq == RQ_CFQQ(rq); |
| 1521 | } | 1547 | } |
| 1522 | 1548 | ||
| 1549 | static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) | ||
| 1550 | { | ||
| 1551 | del_timer(&cfqd->idle_slice_timer); | ||
| 1552 | blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg); | ||
| 1553 | } | ||
| 1554 | |||
| 1523 | static void __cfq_set_active_queue(struct cfq_data *cfqd, | 1555 | static void __cfq_set_active_queue(struct cfq_data *cfqd, |
| 1524 | struct cfq_queue *cfqq) | 1556 | struct cfq_queue *cfqq) |
| 1525 | { | 1557 | { |
| 1526 | if (cfqq) { | 1558 | if (cfqq) { |
| 1527 | cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d", | 1559 | cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d", |
| 1528 | cfqd->serving_prio, cfqd->serving_type); | 1560 | cfqd->serving_prio, cfqd->serving_type); |
| 1561 | blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg); | ||
| 1529 | cfqq->slice_start = 0; | 1562 | cfqq->slice_start = 0; |
| 1530 | cfqq->dispatch_start = jiffies; | 1563 | cfqq->dispatch_start = jiffies; |
| 1531 | cfqq->allocated_slice = 0; | 1564 | cfqq->allocated_slice = 0; |
| 1532 | cfqq->slice_end = 0; | 1565 | cfqq->slice_end = 0; |
| 1533 | cfqq->slice_dispatch = 0; | 1566 | cfqq->slice_dispatch = 0; |
| 1534 | cfqq->nr_sectors = 0; | ||
| 1535 | 1567 | ||
| 1536 | cfq_clear_cfqq_wait_request(cfqq); | 1568 | cfq_clear_cfqq_wait_request(cfqq); |
| 1537 | cfq_clear_cfqq_must_dispatch(cfqq); | 1569 | cfq_clear_cfqq_must_dispatch(cfqq); |
| @@ -1539,7 +1571,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd, | |||
| 1539 | cfq_clear_cfqq_fifo_expire(cfqq); | 1571 | cfq_clear_cfqq_fifo_expire(cfqq); |
| 1540 | cfq_mark_cfqq_slice_new(cfqq); | 1572 | cfq_mark_cfqq_slice_new(cfqq); |
| 1541 | 1573 | ||
| 1542 | del_timer(&cfqd->idle_slice_timer); | 1574 | cfq_del_timer(cfqd, cfqq); |
| 1543 | } | 1575 | } |
| 1544 | 1576 | ||
| 1545 | cfqd->active_queue = cfqq; | 1577 | cfqd->active_queue = cfqq; |
| @@ -1555,7 +1587,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
| 1555 | cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out); | 1587 | cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out); |
| 1556 | 1588 | ||
| 1557 | if (cfq_cfqq_wait_request(cfqq)) | 1589 | if (cfq_cfqq_wait_request(cfqq)) |
| 1558 | del_timer(&cfqd->idle_slice_timer); | 1590 | cfq_del_timer(cfqd, cfqq); |
| 1559 | 1591 | ||
| 1560 | cfq_clear_cfqq_wait_request(cfqq); | 1592 | cfq_clear_cfqq_wait_request(cfqq); |
| 1561 | cfq_clear_cfqq_wait_busy(cfqq); | 1593 | cfq_clear_cfqq_wait_busy(cfqq); |
| @@ -1857,6 +1889,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) | |||
| 1857 | sl = cfqd->cfq_slice_idle; | 1889 | sl = cfqd->cfq_slice_idle; |
| 1858 | 1890 | ||
| 1859 | mod_timer(&cfqd->idle_slice_timer, jiffies + sl); | 1891 | mod_timer(&cfqd->idle_slice_timer, jiffies + sl); |
| 1892 | blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg); | ||
| 1860 | cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); | 1893 | cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); |
| 1861 | } | 1894 | } |
| 1862 | 1895 | ||
| @@ -1876,7 +1909,8 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) | |||
| 1876 | elv_dispatch_sort(q, rq); | 1909 | elv_dispatch_sort(q, rq); |
| 1877 | 1910 | ||
| 1878 | cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; | 1911 | cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; |
| 1879 | cfqq->nr_sectors += blk_rq_sectors(rq); | 1912 | blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq), |
| 1913 | rq_data_dir(rq), rq_is_sync(rq)); | ||
| 1880 | } | 1914 | } |
| 1881 | 1915 | ||
| 1882 | /* | 1916 | /* |
| @@ -3185,11 +3219,14 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
| 3185 | if (cfq_cfqq_wait_request(cfqq)) { | 3219 | if (cfq_cfqq_wait_request(cfqq)) { |
| 3186 | if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || | 3220 | if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || |
| 3187 | cfqd->busy_queues > 1) { | 3221 | cfqd->busy_queues > 1) { |
| 3188 | del_timer(&cfqd->idle_slice_timer); | 3222 | cfq_del_timer(cfqd, cfqq); |
| 3189 | cfq_clear_cfqq_wait_request(cfqq); | 3223 | cfq_clear_cfqq_wait_request(cfqq); |
| 3190 | __blk_run_queue(cfqd->queue); | 3224 | __blk_run_queue(cfqd->queue); |
| 3191 | } else | 3225 | } else { |
| 3226 | blkiocg_update_idle_time_stats( | ||
| 3227 | &cfqq->cfqg->blkg); | ||
| 3192 | cfq_mark_cfqq_must_dispatch(cfqq); | 3228 | cfq_mark_cfqq_must_dispatch(cfqq); |
| 3229 | } | ||
| 3193 | } | 3230 | } |
| 3194 | } else if (cfq_should_preempt(cfqd, cfqq, rq)) { | 3231 | } else if (cfq_should_preempt(cfqd, cfqq, rq)) { |
| 3195 | /* | 3232 | /* |
| @@ -3214,7 +3251,9 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq) | |||
| 3214 | rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); | 3251 | rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); |
| 3215 | list_add_tail(&rq->queuelist, &cfqq->fifo); | 3252 | list_add_tail(&rq->queuelist, &cfqq->fifo); |
| 3216 | cfq_add_rq_rb(rq); | 3253 | cfq_add_rq_rb(rq); |
| 3217 | 3254 | blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg, | |
| 3255 | &cfqd->serving_group->blkg, rq_data_dir(rq), | ||
| 3256 | rq_is_sync(rq)); | ||
| 3218 | cfq_rq_enqueued(cfqd, cfqq, rq); | 3257 | cfq_rq_enqueued(cfqd, cfqq, rq); |
| 3219 | } | 3258 | } |
| 3220 | 3259 | ||
| @@ -3300,6 +3339,9 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) | |||
| 3300 | WARN_ON(!cfqq->dispatched); | 3339 | WARN_ON(!cfqq->dispatched); |
| 3301 | cfqd->rq_in_driver--; | 3340 | cfqd->rq_in_driver--; |
| 3302 | cfqq->dispatched--; | 3341 | cfqq->dispatched--; |
| 3342 | blkiocg_update_completion_stats(&cfqq->cfqg->blkg, rq_start_time_ns(rq), | ||
| 3343 | rq_io_start_time_ns(rq), rq_data_dir(rq), | ||
| 3344 | rq_is_sync(rq)); | ||
| 3303 | 3345 | ||
| 3304 | cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; | 3346 | cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; |
| 3305 | 3347 | ||
| @@ -3440,6 +3482,10 @@ static void cfq_put_request(struct request *rq) | |||
| 3440 | rq->elevator_private = NULL; | 3482 | rq->elevator_private = NULL; |
| 3441 | rq->elevator_private2 = NULL; | 3483 | rq->elevator_private2 = NULL; |
| 3442 | 3484 | ||
| 3485 | /* Put down rq reference on cfqg */ | ||
| 3486 | cfq_put_cfqg(RQ_CFQG(rq)); | ||
| 3487 | rq->elevator_private3 = NULL; | ||
| 3488 | |||
| 3443 | cfq_put_queue(cfqq); | 3489 | cfq_put_queue(cfqq); |
| 3444 | } | 3490 | } |
| 3445 | } | 3491 | } |
| @@ -3528,6 +3574,7 @@ new_queue: | |||
| 3528 | 3574 | ||
| 3529 | rq->elevator_private = cic; | 3575 | rq->elevator_private = cic; |
| 3530 | rq->elevator_private2 = cfqq; | 3576 | rq->elevator_private2 = cfqq; |
| 3577 | rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg); | ||
| 3531 | return 0; | 3578 | return 0; |
| 3532 | 3579 | ||
| 3533 | queue_fail: | 3580 | queue_fail: |
| @@ -3870,6 +3917,7 @@ static struct elevator_type iosched_cfq = { | |||
| 3870 | .elevator_merged_fn = cfq_merged_request, | 3917 | .elevator_merged_fn = cfq_merged_request, |
| 3871 | .elevator_merge_req_fn = cfq_merged_requests, | 3918 | .elevator_merge_req_fn = cfq_merged_requests, |
| 3872 | .elevator_allow_merge_fn = cfq_allow_merge, | 3919 | .elevator_allow_merge_fn = cfq_allow_merge, |
| 3920 | .elevator_bio_merged_fn = cfq_bio_merged, | ||
| 3873 | .elevator_dispatch_fn = cfq_dispatch_requests, | 3921 | .elevator_dispatch_fn = cfq_dispatch_requests, |
| 3874 | .elevator_add_req_fn = cfq_insert_request, | 3922 | .elevator_add_req_fn = cfq_insert_request, |
| 3875 | .elevator_activate_req_fn = cfq_activate_request, | 3923 | .elevator_activate_req_fn = cfq_activate_request, |
diff --git a/block/elevator.c b/block/elevator.c index 76e3702d5381..5e734592bb40 100644 --- a/block/elevator.c +++ b/block/elevator.c | |||
| @@ -539,6 +539,15 @@ void elv_merge_requests(struct request_queue *q, struct request *rq, | |||
| 539 | q->last_merge = rq; | 539 | q->last_merge = rq; |
| 540 | } | 540 | } |
| 541 | 541 | ||
| 542 | void elv_bio_merged(struct request_queue *q, struct request *rq, | ||
| 543 | struct bio *bio) | ||
| 544 | { | ||
| 545 | struct elevator_queue *e = q->elevator; | ||
| 546 | |||
| 547 | if (e->ops->elevator_bio_merged_fn) | ||
| 548 | e->ops->elevator_bio_merged_fn(q, rq, bio); | ||
| 549 | } | ||
| 550 | |||
| 542 | void elv_requeue_request(struct request_queue *q, struct request *rq) | 551 | void elv_requeue_request(struct request_queue *q, struct request *rq) |
| 543 | { | 552 | { |
| 544 | /* | 553 | /* |
diff --git a/block/genhd.c b/block/genhd.c index d13ba76a169c..154b5f80b3ab 100644 --- a/block/genhd.c +++ b/block/genhd.c | |||
| @@ -596,6 +596,7 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) | |||
| 596 | 596 | ||
| 597 | return disk; | 597 | return disk; |
| 598 | } | 598 | } |
| 599 | EXPORT_SYMBOL(get_gendisk); | ||
| 599 | 600 | ||
| 600 | /** | 601 | /** |
| 601 | * bdget_disk - do bdget() by gendisk and partition number | 602 | * bdget_disk - do bdget() by gendisk and partition number |
diff --git a/block/ioctl.c b/block/ioctl.c index 8905d2a2a717..e8eb679f2f9b 100644 --- a/block/ioctl.c +++ b/block/ioctl.c | |||
| @@ -126,7 +126,7 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, | |||
| 126 | if (start + len > (bdev->bd_inode->i_size >> 9)) | 126 | if (start + len > (bdev->bd_inode->i_size >> 9)) |
| 127 | return -EINVAL; | 127 | return -EINVAL; |
| 128 | return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, | 128 | return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, |
| 129 | DISCARD_FL_WAIT); | 129 | BLKDEV_IFL_WAIT); |
| 130 | } | 130 | } |
| 131 | 131 | ||
| 132 | static int put_ushort(unsigned long arg, unsigned short val) | 132 | static int put_ushort(unsigned long arg, unsigned short val) |
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index e5e86a781820..d6f1ae342b1d 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h | |||
| @@ -2251,7 +2251,8 @@ static inline void drbd_md_flush(struct drbd_conf *mdev) | |||
| 2251 | if (test_bit(MD_NO_BARRIER, &mdev->flags)) | 2251 | if (test_bit(MD_NO_BARRIER, &mdev->flags)) |
| 2252 | return; | 2252 | return; |
| 2253 | 2253 | ||
| 2254 | r = blkdev_issue_flush(mdev->ldev->md_bdev, NULL); | 2254 | r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL, |
| 2255 | BLKDEV_IFL_WAIT); | ||
| 2255 | if (r) { | 2256 | if (r) { |
| 2256 | set_bit(MD_NO_BARRIER, &mdev->flags); | 2257 | set_bit(MD_NO_BARRIER, &mdev->flags); |
| 2257 | dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r); | 2258 | dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r); |
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 3f096e7959b4..c786023001d2 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c | |||
| @@ -946,7 +946,8 @@ static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct d | |||
| 946 | int rv; | 946 | int rv; |
| 947 | 947 | ||
| 948 | if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) { | 948 | if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) { |
| 949 | rv = blkdev_issue_flush(mdev->ldev->backing_bdev, NULL); | 949 | rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL, |
| 950 | NULL, BLKDEV_IFL_WAIT); | ||
| 950 | if (rv) { | 951 | if (rv) { |
| 951 | dev_err(DEV, "local disk flush failed with status %d\n", rv); | 952 | dev_err(DEV, "local disk flush failed with status %d\n", rv); |
| 952 | /* would rather check on EOPNOTSUPP, but that is not reliable. | 953 | /* would rather check on EOPNOTSUPP, but that is not reliable. |
diff --git a/fs/block_dev.c b/fs/block_dev.c index 6dcee88c2e5d..55dcb7884f4d 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c | |||
| @@ -417,7 +417,7 @@ int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync) | |||
| 417 | */ | 417 | */ |
| 418 | mutex_unlock(&bd_inode->i_mutex); | 418 | mutex_unlock(&bd_inode->i_mutex); |
| 419 | 419 | ||
| 420 | error = blkdev_issue_flush(bdev, NULL); | 420 | error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT); |
| 421 | if (error == -EOPNOTSUPP) | 421 | if (error == -EOPNOTSUPP) |
| 422 | error = 0; | 422 | error = 0; |
| 423 | 423 | ||
| @@ -668,41 +668,209 @@ void bd_forget(struct inode *inode) | |||
| 668 | iput(bdev->bd_inode); | 668 | iput(bdev->bd_inode); |
| 669 | } | 669 | } |
| 670 | 670 | ||
| 671 | int bd_claim(struct block_device *bdev, void *holder) | 671 | /** |
| 672 | * bd_may_claim - test whether a block device can be claimed | ||
| 673 | * @bdev: block device of interest | ||
| 674 | * @whole: whole block device containing @bdev, may equal @bdev | ||
| 675 | * @holder: holder trying to claim @bdev | ||
| 676 | * | ||
| 677 | * Test whether @bdev can be claimed by @holder. | ||
| 678 | * | ||
| 679 | * CONTEXT: | ||
| 680 | * spin_lock(&bdev_lock). | ||
| 681 | * | ||
| 682 | * RETURNS: | ||
| 683 | * %true if @bdev can be claimed, %false otherwise. | ||
| 684 | */ | ||
| 685 | static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, | ||
| 686 | void *holder) | ||
| 672 | { | 687 | { |
| 673 | int res; | ||
| 674 | spin_lock(&bdev_lock); | ||
| 675 | |||
| 676 | /* first decide result */ | ||
| 677 | if (bdev->bd_holder == holder) | 688 | if (bdev->bd_holder == holder) |
| 678 | res = 0; /* already a holder */ | 689 | return true; /* already a holder */ |
| 679 | else if (bdev->bd_holder != NULL) | 690 | else if (bdev->bd_holder != NULL) |
| 680 | res = -EBUSY; /* held by someone else */ | 691 | return false; /* held by someone else */ |
| 681 | else if (bdev->bd_contains == bdev) | 692 | else if (bdev->bd_contains == bdev) |
| 682 | res = 0; /* is a whole device which isn't held */ | 693 | return true; /* is a whole device which isn't held */ |
| 683 | 694 | ||
| 684 | else if (bdev->bd_contains->bd_holder == bd_claim) | 695 | else if (whole->bd_holder == bd_claim) |
| 685 | res = 0; /* is a partition of a device that is being partitioned */ | 696 | return true; /* is a partition of a device that is being partitioned */ |
| 686 | else if (bdev->bd_contains->bd_holder != NULL) | 697 | else if (whole->bd_holder != NULL) |
| 687 | res = -EBUSY; /* is a partition of a held device */ | 698 | return false; /* is a partition of a held device */ |
| 688 | else | 699 | else |
| 689 | res = 0; /* is a partition of an un-held device */ | 700 | return true; /* is a partition of an un-held device */ |
| 701 | } | ||
| 702 | |||
| 703 | /** | ||
| 704 | * bd_prepare_to_claim - prepare to claim a block device | ||
| 705 | * @bdev: block device of interest | ||
| 706 | * @whole: the whole device containing @bdev, may equal @bdev | ||
| 707 | * @holder: holder trying to claim @bdev | ||
| 708 | * | ||
| 709 | * Prepare to claim @bdev. This function fails if @bdev is already | ||
| 710 | * claimed by another holder and waits if another claiming is in | ||
| 711 | * progress. This function doesn't actually claim. On successful | ||
| 712 | * return, the caller has ownership of bd_claiming and bd_holder[s]. | ||
| 713 | * | ||
| 714 | * CONTEXT: | ||
| 715 | * spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab | ||
| 716 | * it multiple times. | ||
| 717 | * | ||
| 718 | * RETURNS: | ||
| 719 | * 0 if @bdev can be claimed, -EBUSY otherwise. | ||
| 720 | */ | ||
| 721 | static int bd_prepare_to_claim(struct block_device *bdev, | ||
| 722 | struct block_device *whole, void *holder) | ||
| 723 | { | ||
| 724 | retry: | ||
| 725 | /* if someone else claimed, fail */ | ||
| 726 | if (!bd_may_claim(bdev, whole, holder)) | ||
| 727 | return -EBUSY; | ||
| 728 | |||
| 729 | /* if someone else is claiming, wait for it to finish */ | ||
| 730 | if (whole->bd_claiming && whole->bd_claiming != holder) { | ||
| 731 | wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0); | ||
| 732 | DEFINE_WAIT(wait); | ||
| 733 | |||
| 734 | prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); | ||
| 735 | spin_unlock(&bdev_lock); | ||
| 736 | schedule(); | ||
| 737 | finish_wait(wq, &wait); | ||
| 738 | spin_lock(&bdev_lock); | ||
| 739 | goto retry; | ||
| 740 | } | ||
| 741 | |||
| 742 | /* yay, all mine */ | ||
| 743 | return 0; | ||
| 744 | } | ||
| 745 | |||
| 746 | /** | ||
| 747 | * bd_start_claiming - start claiming a block device | ||
| 748 | * @bdev: block device of interest | ||
| 749 | * @holder: holder trying to claim @bdev | ||
| 750 | * | ||
| 751 | * @bdev is about to be opened exclusively. Check @bdev can be opened | ||
| 752 | * exclusively and mark that an exclusive open is in progress. Each | ||
| 753 | * successful call to this function must be matched with a call to | ||
| 754 | * either bd_claim() or bd_abort_claiming(). If this function | ||
| 755 | * succeeds, the matching bd_claim() is guaranteed to succeed. | ||
| 756 | * | ||
| 757 | * CONTEXT: | ||
| 758 | * Might sleep. | ||
| 759 | * | ||
| 760 | * RETURNS: | ||
| 761 | * Pointer to the block device containing @bdev on success, ERR_PTR() | ||
| 762 | * value on failure. | ||
| 763 | */ | ||
| 764 | static struct block_device *bd_start_claiming(struct block_device *bdev, | ||
| 765 | void *holder) | ||
| 766 | { | ||
| 767 | struct gendisk *disk; | ||
| 768 | struct block_device *whole; | ||
| 769 | int partno, err; | ||
| 770 | |||
| 771 | might_sleep(); | ||
| 772 | |||
| 773 | /* | ||
| 774 | * @bdev might not have been initialized properly yet, look up | ||
| 775 | * and grab the outer block device the hard way. | ||
| 776 | */ | ||
| 777 | disk = get_gendisk(bdev->bd_dev, &partno); | ||
| 778 | if (!disk) | ||
| 779 | return ERR_PTR(-ENXIO); | ||
| 780 | |||
| 781 | whole = bdget_disk(disk, 0); | ||
| 782 | put_disk(disk); | ||
| 783 | if (!whole) | ||
| 784 | return ERR_PTR(-ENOMEM); | ||
| 785 | |||
| 786 | /* prepare to claim, if successful, mark claiming in progress */ | ||
| 787 | spin_lock(&bdev_lock); | ||
| 788 | |||
| 789 | err = bd_prepare_to_claim(bdev, whole, holder); | ||
| 790 | if (err == 0) { | ||
| 791 | whole->bd_claiming = holder; | ||
| 792 | spin_unlock(&bdev_lock); | ||
| 793 | return whole; | ||
| 794 | } else { | ||
| 795 | spin_unlock(&bdev_lock); | ||
| 796 | bdput(whole); | ||
| 797 | return ERR_PTR(err); | ||
| 798 | } | ||
| 799 | } | ||
| 690 | 800 | ||
| 691 | /* now impose change */ | 801 | /* releases bdev_lock */ |
| 692 | if (res==0) { | 802 | static void __bd_abort_claiming(struct block_device *whole, void *holder) |
| 803 | { | ||
| 804 | BUG_ON(whole->bd_claiming != holder); | ||
| 805 | whole->bd_claiming = NULL; | ||
| 806 | wake_up_bit(&whole->bd_claiming, 0); | ||
| 807 | |||
| 808 | spin_unlock(&bdev_lock); | ||
| 809 | bdput(whole); | ||
| 810 | } | ||
| 811 | |||
| 812 | /** | ||
| 813 | * bd_abort_claiming - abort claiming a block device | ||
| 814 | * @whole: whole block device returned by bd_start_claiming() | ||
| 815 | * @holder: holder trying to claim @bdev | ||
| 816 | * | ||
| 817 | * Abort a claiming block started by bd_start_claiming(). Note that | ||
| 818 | * @whole is not the block device to be claimed but the whole device | ||
| 819 | * returned by bd_start_claiming(). | ||
| 820 | * | ||
| 821 | * CONTEXT: | ||
| 822 | * Grabs and releases bdev_lock. | ||
| 823 | */ | ||
| 824 | static void bd_abort_claiming(struct block_device *whole, void *holder) | ||
| 825 | { | ||
| 826 | spin_lock(&bdev_lock); | ||
| 827 | __bd_abort_claiming(whole, holder); /* releases bdev_lock */ | ||
| 828 | } | ||
| 829 | |||
| 830 | /** | ||
| 831 | * bd_claim - claim a block device | ||
| 832 | * @bdev: block device to claim | ||
| 833 | * @holder: holder trying to claim @bdev | ||
| 834 | * | ||
| 835 | * Try to claim @bdev which must have been opened successfully. This | ||
| 836 | * function may be called with or without preceding | ||
| 837 | * bd_start_claiming(). In the former case, this function is always | ||
| 838 | * successful and terminates the claiming block. | ||
| 839 | * | ||
| 840 | * CONTEXT: | ||
| 841 | * Might sleep. | ||
| 842 | * | ||
| 843 | * RETURNS: | ||
| 844 | * 0 if successful, -EBUSY if @bdev is already claimed. | ||
| 845 | */ | ||
| 846 | int bd_claim(struct block_device *bdev, void *holder) | ||
| 847 | { | ||
| 848 | struct block_device *whole = bdev->bd_contains; | ||
| 849 | int res; | ||
| 850 | |||
| 851 | might_sleep(); | ||
| 852 | |||
| 853 | spin_lock(&bdev_lock); | ||
| 854 | |||
| 855 | res = bd_prepare_to_claim(bdev, whole, holder); | ||
| 856 | if (res == 0) { | ||
| 693 | /* note that for a whole device bd_holders | 857 | /* note that for a whole device bd_holders |
| 694 | * will be incremented twice, and bd_holder will | 858 | * will be incremented twice, and bd_holder will |
| 695 | * be set to bd_claim before being set to holder | 859 | * be set to bd_claim before being set to holder |
| 696 | */ | 860 | */ |
| 697 | bdev->bd_contains->bd_holders ++; | 861 | whole->bd_holders++; |
| 698 | bdev->bd_contains->bd_holder = bd_claim; | 862 | whole->bd_holder = bd_claim; |
| 699 | bdev->bd_holders++; | 863 | bdev->bd_holders++; |
| 700 | bdev->bd_holder = holder; | 864 | bdev->bd_holder = holder; |
| 701 | } | 865 | } |
| 702 | spin_unlock(&bdev_lock); | 866 | |
| 867 | if (whole->bd_claiming) | ||
| 868 | __bd_abort_claiming(whole, holder); /* releases bdev_lock */ | ||
| 869 | else | ||
| 870 | spin_unlock(&bdev_lock); | ||
| 871 | |||
| 703 | return res; | 872 | return res; |
| 704 | } | 873 | } |
| 705 | |||
| 706 | EXPORT_SYMBOL(bd_claim); | 874 | EXPORT_SYMBOL(bd_claim); |
| 707 | 875 | ||
| 708 | void bd_release(struct block_device *bdev) | 876 | void bd_release(struct block_device *bdev) |
| @@ -1316,6 +1484,7 @@ EXPORT_SYMBOL(blkdev_get); | |||
| 1316 | 1484 | ||
| 1317 | static int blkdev_open(struct inode * inode, struct file * filp) | 1485 | static int blkdev_open(struct inode * inode, struct file * filp) |
| 1318 | { | 1486 | { |
| 1487 | struct block_device *whole = NULL; | ||
| 1319 | struct block_device *bdev; | 1488 | struct block_device *bdev; |
| 1320 | int res; | 1489 | int res; |
| 1321 | 1490 | ||
| @@ -1338,22 +1507,25 @@ static int blkdev_open(struct inode * inode, struct file * filp) | |||
| 1338 | if (bdev == NULL) | 1507 | if (bdev == NULL) |
| 1339 | return -ENOMEM; | 1508 | return -ENOMEM; |
| 1340 | 1509 | ||
| 1510 | if (filp->f_mode & FMODE_EXCL) { | ||
| 1511 | whole = bd_start_claiming(bdev, filp); | ||
| 1512 | if (IS_ERR(whole)) { | ||
| 1513 | bdput(bdev); | ||
| 1514 | return PTR_ERR(whole); | ||
| 1515 | } | ||
| 1516 | } | ||
| 1517 | |||
| 1341 | filp->f_mapping = bdev->bd_inode->i_mapping; | 1518 | filp->f_mapping = bdev->bd_inode->i_mapping; |
| 1342 | 1519 | ||
| 1343 | res = blkdev_get(bdev, filp->f_mode); | 1520 | res = blkdev_get(bdev, filp->f_mode); |
| 1344 | if (res) | ||
| 1345 | return res; | ||
| 1346 | 1521 | ||
| 1347 | if (filp->f_mode & FMODE_EXCL) { | 1522 | if (whole) { |
| 1348 | res = bd_claim(bdev, filp); | 1523 | if (res == 0) |
| 1349 | if (res) | 1524 | BUG_ON(bd_claim(bdev, filp) != 0); |
| 1350 | goto out_blkdev_put; | 1525 | else |
| 1526 | bd_abort_claiming(whole, filp); | ||
| 1351 | } | 1527 | } |
| 1352 | 1528 | ||
| 1353 | return 0; | ||
| 1354 | |||
| 1355 | out_blkdev_put: | ||
| 1356 | blkdev_put(bdev, filp->f_mode); | ||
| 1357 | return res; | 1529 | return res; |
| 1358 | } | 1530 | } |
| 1359 | 1531 | ||
| @@ -1564,27 +1736,34 @@ EXPORT_SYMBOL(lookup_bdev); | |||
| 1564 | */ | 1736 | */ |
| 1565 | struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder) | 1737 | struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder) |
| 1566 | { | 1738 | { |
| 1567 | struct block_device *bdev; | 1739 | struct block_device *bdev, *whole; |
| 1568 | int error = 0; | 1740 | int error; |
| 1569 | 1741 | ||
| 1570 | bdev = lookup_bdev(path); | 1742 | bdev = lookup_bdev(path); |
| 1571 | if (IS_ERR(bdev)) | 1743 | if (IS_ERR(bdev)) |
| 1572 | return bdev; | 1744 | return bdev; |
| 1573 | 1745 | ||
| 1746 | whole = bd_start_claiming(bdev, holder); | ||
| 1747 | if (IS_ERR(whole)) { | ||
| 1748 | bdput(bdev); | ||
| 1749 | return whole; | ||
| 1750 | } | ||
| 1751 | |||
| 1574 | error = blkdev_get(bdev, mode); | 1752 | error = blkdev_get(bdev, mode); |
| 1575 | if (error) | 1753 | if (error) |
| 1576 | return ERR_PTR(error); | 1754 | goto out_abort_claiming; |
| 1755 | |||
| 1577 | error = -EACCES; | 1756 | error = -EACCES; |
| 1578 | if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) | 1757 | if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) |
| 1579 | goto blkdev_put; | 1758 | goto out_blkdev_put; |
| 1580 | error = bd_claim(bdev, holder); | ||
| 1581 | if (error) | ||
| 1582 | goto blkdev_put; | ||
| 1583 | 1759 | ||
| 1760 | BUG_ON(bd_claim(bdev, holder) != 0); | ||
| 1584 | return bdev; | 1761 | return bdev; |
| 1585 | 1762 | ||
| 1586 | blkdev_put: | 1763 | out_blkdev_put: |
| 1587 | blkdev_put(bdev, mode); | 1764 | blkdev_put(bdev, mode); |
| 1765 | out_abort_claiming: | ||
| 1766 | bd_abort_claiming(whole, holder); | ||
| 1588 | return ERR_PTR(error); | 1767 | return ERR_PTR(error); |
| 1589 | } | 1768 | } |
| 1590 | 1769 | ||
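The block_dev.c changes above split exclusive claiming into two phases: bd_start_claiming() marks the whole device as "claiming in progress" before the (possibly sleeping) open, and the caller then finishes with bd_claim() on success or bd_abort_claiming() on failure. These helpers are local to fs/block_dev.c in this patch, so outside callers reach the same behaviour through open_bdev_exclusive(); the sketch below only illustrates the protocol, and my_open_exclusive() is a made-up name.

#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/err.h>

/* Illustrative only: mirrors the open_bdev_exclusive() flow in this patch. */
static struct block_device *my_open_exclusive(const char *path, fmode_t mode,
                                              void *holder)
{
        struct block_device *bdev, *whole;
        int err;

        bdev = lookup_bdev(path);
        if (IS_ERR(bdev))
                return bdev;

        /* Phase 1: mark the whole device as "claiming in progress". */
        whole = bd_start_claiming(bdev, holder);
        if (IS_ERR(whole)) {
                bdput(bdev);
                return ERR_CAST(whole);
        }

        /* The open may sleep; the pending claim keeps other claimers out. */
        err = blkdev_get(bdev, mode);
        if (err) {
                bd_abort_claiming(whole, holder);
                return ERR_PTR(err);
        }

        /* Phase 2: cannot fail once bd_start_claiming() has succeeded. */
        BUG_ON(bd_claim(bdev, holder) != 0);
        return bdev;
}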
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b34d32fdaaec..c6a4f459ad76 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c | |||
| @@ -1589,7 +1589,7 @@ static void btrfs_issue_discard(struct block_device *bdev, | |||
| 1589 | u64 start, u64 len) | 1589 | u64 start, u64 len) |
| 1590 | { | 1590 | { |
| 1591 | blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, | 1591 | blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, |
| 1592 | DISCARD_FL_BARRIER); | 1592 | BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); |
| 1593 | } | 1593 | } |
| 1594 | 1594 | ||
| 1595 | static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, | 1595 | static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, |
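From here on, the discard and flush call sites are converted from the old DISCARD_FL_* flags to the new BLKDEV_IFL_* bits; waiting for completion is now a separate, explicit bit rather than being implied. A minimal sketch of the new calling convention, with my_discard_range() a hypothetical wrapper:

#include <linux/blkdev.h>

/*
 * Illustrative wrapper only. A synchronous barrier discard now asks for both
 * the wait and the barrier bits; an asynchronous caller would drop
 * BLKDEV_IFL_WAIT, as the nilfs2 conversion further down does.
 */
static int my_discard_range(struct block_device *bdev, sector_t start,
                            sector_t nr_sects)
{
        return blkdev_issue_discard(bdev, start, nr_sects, GFP_KERNEL,
                                    BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
}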
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c index 8209f266e9ad..9492f6003ef9 100644 --- a/fs/ext3/fsync.c +++ b/fs/ext3/fsync.c | |||
| @@ -91,7 +91,8 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) | |||
| 91 | * storage | 91 | * storage |
| 92 | */ | 92 | */ |
| 93 | if (test_opt(inode->i_sb, BARRIER)) | 93 | if (test_opt(inode->i_sb, BARRIER)) |
| 94 | blkdev_issue_flush(inode->i_sb->s_bdev, NULL); | 94 | blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, |
| 95 | BLKDEV_IFL_WAIT); | ||
| 95 | out: | 96 | out: |
| 96 | return ret; | 97 | return ret; |
| 97 | } | 98 | } |
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 0d0c3239c1cd..ef3d980e67cb 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c | |||
| @@ -100,9 +100,11 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) | |||
| 100 | if (ext4_should_writeback_data(inode) && | 100 | if (ext4_should_writeback_data(inode) && |
| 101 | (journal->j_fs_dev != journal->j_dev) && | 101 | (journal->j_fs_dev != journal->j_dev) && |
| 102 | (journal->j_flags & JBD2_BARRIER)) | 102 | (journal->j_flags & JBD2_BARRIER)) |
| 103 | blkdev_issue_flush(inode->i_sb->s_bdev, NULL); | 103 | blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, |
| 104 | NULL, BLKDEV_IFL_WAIT); | ||
| 104 | jbd2_log_wait_commit(journal, commit_tid); | 105 | jbd2_log_wait_commit(journal, commit_tid); |
| 105 | } else if (journal->j_flags & JBD2_BARRIER) | 106 | } else if (journal->j_flags & JBD2_BARRIER) |
| 106 | blkdev_issue_flush(inode->i_sb->s_bdev, NULL); | 107 | blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, |
| 108 | BLKDEV_IFL_WAIT); | ||
| 107 | return ret; | 109 | return ret; |
| 108 | } | 110 | } |
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 503b842f3ba2..bf011dc63471 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c | |||
| @@ -854,7 +854,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, | |||
| 854 | if ((start + nr_sects) != blk) { | 854 | if ((start + nr_sects) != blk) { |
| 855 | rv = blkdev_issue_discard(bdev, start, | 855 | rv = blkdev_issue_discard(bdev, start, |
| 856 | nr_sects, GFP_NOFS, | 856 | nr_sects, GFP_NOFS, |
| 857 | DISCARD_FL_BARRIER); | 857 | BLKDEV_IFL_WAIT | |
| 858 | BLKDEV_IFL_BARRIER); | ||
| 858 | if (rv) | 859 | if (rv) |
| 859 | goto fail; | 860 | goto fail; |
| 860 | nr_sects = 0; | 861 | nr_sects = 0; |
| @@ -869,7 +870,7 @@ start_new_extent: | |||
| 869 | } | 870 | } |
| 870 | if (nr_sects) { | 871 | if (nr_sects) { |
| 871 | rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, | 872 | rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, |
| 872 | DISCARD_FL_BARRIER); | 873 | BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); |
| 873 | if (rv) | 874 | if (rv) |
| 874 | goto fail; | 875 | goto fail; |
| 875 | } | 876 | } |
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 30beb11ef928..076d1cc44f95 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c | |||
| @@ -530,7 +530,8 @@ int jbd2_cleanup_journal_tail(journal_t *journal) | |||
| 530 | */ | 530 | */ |
| 531 | if ((journal->j_fs_dev != journal->j_dev) && | 531 | if ((journal->j_fs_dev != journal->j_dev) && |
| 532 | (journal->j_flags & JBD2_BARRIER)) | 532 | (journal->j_flags & JBD2_BARRIER)) |
| 533 | blkdev_issue_flush(journal->j_fs_dev, NULL); | 533 | blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL, |
| 534 | BLKDEV_IFL_WAIT); | ||
| 534 | if (!(journal->j_flags & JBD2_ABORT)) | 535 | if (!(journal->j_flags & JBD2_ABORT)) |
| 535 | jbd2_journal_update_superblock(journal, 1); | 536 | jbd2_journal_update_superblock(journal, 1); |
| 536 | return 0; | 537 | return 0; |
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 671da7fb7ffd..75716d3d2be0 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c | |||
| @@ -717,7 +717,8 @@ start_journal_io: | |||
| 717 | if (commit_transaction->t_flushed_data_blocks && | 717 | if (commit_transaction->t_flushed_data_blocks && |
| 718 | (journal->j_fs_dev != journal->j_dev) && | 718 | (journal->j_fs_dev != journal->j_dev) && |
| 719 | (journal->j_flags & JBD2_BARRIER)) | 719 | (journal->j_flags & JBD2_BARRIER)) |
| 720 | blkdev_issue_flush(journal->j_fs_dev, NULL); | 720 | blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL, |
| 721 | BLKDEV_IFL_WAIT); | ||
| 721 | 722 | ||
| 722 | /* Done it all: now write the commit record asynchronously. */ | 723 | /* Done it all: now write the commit record asynchronously. */ |
| 723 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, | 724 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, |
| @@ -727,7 +728,8 @@ start_journal_io: | |||
| 727 | if (err) | 728 | if (err) |
| 728 | __jbd2_journal_abort_hard(journal); | 729 | __jbd2_journal_abort_hard(journal); |
| 729 | if (journal->j_flags & JBD2_BARRIER) | 730 | if (journal->j_flags & JBD2_BARRIER) |
| 730 | blkdev_issue_flush(journal->j_dev, NULL); | 731 | blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL, |
| 732 | BLKDEV_IFL_WAIT); | ||
| 731 | } | 733 | } |
| 732 | 734 | ||
| 733 | err = journal_finish_inode_data_buffers(journal, commit_transaction); | 735 | err = journal_finish_inode_data_buffers(journal, commit_transaction); |
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 33871f7e4f01..7ffcf2b8b1f4 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c | |||
| @@ -670,7 +670,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump, | |||
| 670 | start * sects_per_block, | 670 | start * sects_per_block, |
| 671 | nblocks * sects_per_block, | 671 | nblocks * sects_per_block, |
| 672 | GFP_NOFS, | 672 | GFP_NOFS, |
| 673 | DISCARD_FL_BARRIER); | 673 | BLKDEV_IFL_BARRIER); |
| 674 | if (ret < 0) | 674 | if (ret < 0) |
| 675 | return ret; | 675 | return ret; |
| 676 | nblocks = 0; | 676 | nblocks = 0; |
| @@ -680,7 +680,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump, | |||
| 680 | ret = blkdev_issue_discard(nilfs->ns_bdev, | 680 | ret = blkdev_issue_discard(nilfs->ns_bdev, |
| 681 | start * sects_per_block, | 681 | start * sects_per_block, |
| 682 | nblocks * sects_per_block, | 682 | nblocks * sects_per_block, |
| 683 | GFP_NOFS, DISCARD_FL_BARRIER); | 683 | GFP_NOFS, BLKDEV_IFL_BARRIER); |
| 684 | return ret; | 684 | return ret; |
| 685 | } | 685 | } |
| 686 | 686 | ||
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 1d9c12714c5c..9977df9f3a54 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c | |||
| @@ -147,7 +147,8 @@ static int reiserfs_sync_file(struct file *filp, | |||
| 147 | barrier_done = reiserfs_commit_for_inode(inode); | 147 | barrier_done = reiserfs_commit_for_inode(inode); |
| 148 | reiserfs_write_unlock(inode->i_sb); | 148 | reiserfs_write_unlock(inode->i_sb); |
| 149 | if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) | 149 | if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) |
| 150 | blkdev_issue_flush(inode->i_sb->s_bdev, NULL); | 150 | blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, |
| 151 | BLKDEV_IFL_WAIT); | ||
| 151 | if (barrier_done < 0) | 152 | if (barrier_done < 0) |
| 152 | return barrier_done; | 153 | return barrier_done; |
| 153 | return (err < 0) ? -EIO : 0; | 154 | return (err < 0) ? -EIO : 0; |
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 52e06b487ced..2b177c778ba7 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c | |||
| @@ -725,7 +725,8 @@ void | |||
| 725 | xfs_blkdev_issue_flush( | 725 | xfs_blkdev_issue_flush( |
| 726 | xfs_buftarg_t *buftarg) | 726 | xfs_buftarg_t *buftarg) |
| 727 | { | 727 | { |
| 728 | blkdev_issue_flush(buftarg->bt_bdev, NULL); | 728 | blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL, |
| 729 | BLKDEV_IFL_WAIT); | ||
| 729 | } | 730 | } |
| 730 | 731 | ||
| 731 | STATIC void | 732 | STATIC void |
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index bd0e3c6f323f..7534979d83bd 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h | |||
| @@ -14,6 +14,7 @@ | |||
| 14 | #include <linux/kernel.h> | 14 | #include <linux/kernel.h> |
| 15 | #include <linux/fs.h> | 15 | #include <linux/fs.h> |
| 16 | #include <linux/sched.h> | 16 | #include <linux/sched.h> |
| 17 | #include <linux/timer.h> | ||
| 17 | #include <linux/writeback.h> | 18 | #include <linux/writeback.h> |
| 18 | #include <asm/atomic.h> | 19 | #include <asm/atomic.h> |
| 19 | 20 | ||
| @@ -88,6 +89,8 @@ struct backing_dev_info { | |||
| 88 | 89 | ||
| 89 | struct device *dev; | 90 | struct device *dev; |
| 90 | 91 | ||
| 92 | struct timer_list laptop_mode_wb_timer; | ||
| 93 | |||
| 91 | #ifdef CONFIG_DEBUG_FS | 94 | #ifdef CONFIG_DEBUG_FS |
| 92 | struct dentry *debug_dir; | 95 | struct dentry *debug_dir; |
| 93 | struct dentry *debug_stats; | 96 | struct dentry *debug_stats; |
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6690e8bae7bb..3ac2bd2fc485 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h | |||
| @@ -186,15 +186,19 @@ struct request { | |||
| 186 | }; | 186 | }; |
| 187 | 187 | ||
| 188 | /* | 188 | /* |
| 189 | * two pointers are available for the IO schedulers, if they need | 189 | * Three pointers are available for the IO schedulers, if they need |
| 190 | * more they have to dynamically allocate it. | 190 | * more they have to dynamically allocate it. |
| 191 | */ | 191 | */ |
| 192 | void *elevator_private; | 192 | void *elevator_private; |
| 193 | void *elevator_private2; | 193 | void *elevator_private2; |
| 194 | void *elevator_private3; | ||
| 194 | 195 | ||
| 195 | struct gendisk *rq_disk; | 196 | struct gendisk *rq_disk; |
| 196 | unsigned long start_time; | 197 | unsigned long start_time; |
| 197 | 198 | #ifdef CONFIG_BLK_CGROUP | |
| 199 | unsigned long long start_time_ns; | ||
| 200 | unsigned long long io_start_time_ns; /* when passed to hardware */ | ||
| 201 | #endif | ||
| 198 | /* Number of scatter-gather DMA addr+len pairs after | 202 | /* Number of scatter-gather DMA addr+len pairs after |
| 199 | * physical address coalescing is performed. | 203 | * physical address coalescing is performed. |
| 200 | */ | 204 | */ |
| @@ -994,20 +998,25 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt, | |||
| 994 | return NULL; | 998 | return NULL; |
| 995 | return bqt->tag_index[tag]; | 999 | return bqt->tag_index[tag]; |
| 996 | } | 1000 | } |
| 997 | 1001 | enum{ | |
| 998 | extern int blkdev_issue_flush(struct block_device *, sector_t *); | 1002 | BLKDEV_WAIT, /* wait for completion */ |
| 999 | #define DISCARD_FL_WAIT 0x01 /* wait for completion */ | 1003 | BLKDEV_BARRIER, /*issue request with barrier */ |
| 1000 | #define DISCARD_FL_BARRIER 0x02 /* issue DISCARD_BARRIER request */ | 1004 | }; |
| 1001 | extern int blkdev_issue_discard(struct block_device *, sector_t sector, | 1005 | #define BLKDEV_IFL_WAIT (1 << BLKDEV_WAIT) |
| 1002 | sector_t nr_sects, gfp_t, int flags); | 1006 | #define BLKDEV_IFL_BARRIER (1 << BLKDEV_BARRIER) |
| 1003 | 1007 | extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *, | |
| 1008 | unsigned long); | ||
| 1009 | extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector, | ||
| 1010 | sector_t nr_sects, gfp_t gfp_mask, unsigned long flags); | ||
| 1011 | extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, | ||
| 1012 | sector_t nr_sects, gfp_t gfp_mask, unsigned long flags); | ||
| 1004 | static inline int sb_issue_discard(struct super_block *sb, | 1013 | static inline int sb_issue_discard(struct super_block *sb, |
| 1005 | sector_t block, sector_t nr_blocks) | 1014 | sector_t block, sector_t nr_blocks) |
| 1006 | { | 1015 | { |
| 1007 | block <<= (sb->s_blocksize_bits - 9); | 1016 | block <<= (sb->s_blocksize_bits - 9); |
| 1008 | nr_blocks <<= (sb->s_blocksize_bits - 9); | 1017 | nr_blocks <<= (sb->s_blocksize_bits - 9); |
| 1009 | return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_KERNEL, | 1018 | return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_KERNEL, |
| 1010 | DISCARD_FL_BARRIER); | 1019 | BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); |
| 1011 | } | 1020 | } |
| 1012 | 1021 | ||
| 1013 | extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm); | 1022 | extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm); |
| @@ -1196,6 +1205,39 @@ static inline void put_dev_sector(Sector p) | |||
| 1196 | struct work_struct; | 1205 | struct work_struct; |
| 1197 | int kblockd_schedule_work(struct request_queue *q, struct work_struct *work); | 1206 | int kblockd_schedule_work(struct request_queue *q, struct work_struct *work); |
| 1198 | 1207 | ||
| 1208 | #ifdef CONFIG_BLK_CGROUP | ||
| 1209 | static inline void set_start_time_ns(struct request *req) | ||
| 1210 | { | ||
| 1211 | req->start_time_ns = sched_clock(); | ||
| 1212 | } | ||
| 1213 | |||
| 1214 | static inline void set_io_start_time_ns(struct request *req) | ||
| 1215 | { | ||
| 1216 | req->io_start_time_ns = sched_clock(); | ||
| 1217 | } | ||
| 1218 | |||
| 1219 | static inline uint64_t rq_start_time_ns(struct request *req) | ||
| 1220 | { | ||
| 1221 | return req->start_time_ns; | ||
| 1222 | } | ||
| 1223 | |||
| 1224 | static inline uint64_t rq_io_start_time_ns(struct request *req) | ||
| 1225 | { | ||
| 1226 | return req->io_start_time_ns; | ||
| 1227 | } | ||
| 1228 | #else | ||
| 1229 | static inline void set_start_time_ns(struct request *req) {} | ||
| 1230 | static inline void set_io_start_time_ns(struct request *req) {} | ||
| 1231 | static inline uint64_t rq_start_time_ns(struct request *req) | ||
| 1232 | { | ||
| 1233 | return 0; | ||
| 1234 | } | ||
| 1235 | static inline uint64_t rq_io_start_time_ns(struct request *req) | ||
| 1236 | { | ||
| 1237 | return 0; | ||
| 1238 | } | ||
| 1239 | #endif | ||
| 1240 | |||
| 1199 | #define MODULE_ALIAS_BLOCKDEV(major,minor) \ | 1241 | #define MODULE_ALIAS_BLOCKDEV(major,minor) \ |
| 1200 | MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor)) | 1242 | MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor)) |
| 1201 | #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \ | 1243 | #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \ |
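The blkdev.h hunk above is the centre of this change: blkdev_issue_flush() now takes a gfp mask and a BLKDEV_IFL_* flag word, blkdev_issue_zeroout() is added with the same convention, and requests gain optional queue/dispatch timestamps for the block cgroup controller. A hedged sketch of a caller on the new flush prototype; my_flush_device() is a made-up helper:

#include <linux/fs.h>
#include <linux/blkdev.h>

/* Illustrative only: synchronously flush a filesystem's underlying device. */
static int my_flush_device(struct super_block *sb)
{
        sector_t error_sector;

        return blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, &error_sector,
                                  BLKDEV_IFL_WAIT);
}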
diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 1cb3372e65d8..2c958f4fce1e 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h | |||
| @@ -14,6 +14,9 @@ typedef void (elevator_merged_fn) (struct request_queue *, struct request *, int | |||
| 14 | 14 | ||
| 15 | typedef int (elevator_allow_merge_fn) (struct request_queue *, struct request *, struct bio *); | 15 | typedef int (elevator_allow_merge_fn) (struct request_queue *, struct request *, struct bio *); |
| 16 | 16 | ||
| 17 | typedef void (elevator_bio_merged_fn) (struct request_queue *, | ||
| 18 | struct request *, struct bio *); | ||
| 19 | |||
| 17 | typedef int (elevator_dispatch_fn) (struct request_queue *, int); | 20 | typedef int (elevator_dispatch_fn) (struct request_queue *, int); |
| 18 | 21 | ||
| 19 | typedef void (elevator_add_req_fn) (struct request_queue *, struct request *); | 22 | typedef void (elevator_add_req_fn) (struct request_queue *, struct request *); |
| @@ -36,6 +39,7 @@ struct elevator_ops | |||
| 36 | elevator_merged_fn *elevator_merged_fn; | 39 | elevator_merged_fn *elevator_merged_fn; |
| 37 | elevator_merge_req_fn *elevator_merge_req_fn; | 40 | elevator_merge_req_fn *elevator_merge_req_fn; |
| 38 | elevator_allow_merge_fn *elevator_allow_merge_fn; | 41 | elevator_allow_merge_fn *elevator_allow_merge_fn; |
| 42 | elevator_bio_merged_fn *elevator_bio_merged_fn; | ||
| 39 | 43 | ||
| 40 | elevator_dispatch_fn *elevator_dispatch_fn; | 44 | elevator_dispatch_fn *elevator_dispatch_fn; |
| 41 | elevator_add_req_fn *elevator_add_req_fn; | 45 | elevator_add_req_fn *elevator_add_req_fn; |
| @@ -103,6 +107,8 @@ extern int elv_merge(struct request_queue *, struct request **, struct bio *); | |||
| 103 | extern void elv_merge_requests(struct request_queue *, struct request *, | 107 | extern void elv_merge_requests(struct request_queue *, struct request *, |
| 104 | struct request *); | 108 | struct request *); |
| 105 | extern void elv_merged_request(struct request_queue *, struct request *, int); | 109 | extern void elv_merged_request(struct request_queue *, struct request *, int); |
| 110 | extern void elv_bio_merged(struct request_queue *q, struct request *, | ||
| 111 | struct bio *); | ||
| 106 | extern void elv_requeue_request(struct request_queue *, struct request *); | 112 | extern void elv_requeue_request(struct request_queue *, struct request *); |
| 107 | extern int elv_queue_empty(struct request_queue *); | 113 | extern int elv_queue_empty(struct request_queue *); |
| 108 | extern struct request *elv_former_request(struct request_queue *, struct request *); | 114 | extern struct request *elv_former_request(struct request_queue *, struct request *); |
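elevator_bio_merged_fn gives an I/O scheduler a callback whenever a bio is merged into an existing request, which CFQ uses for the per-group merge statistics exported under CONFIG_DEBUG_BLK_CGROUP. A rough sketch of how a scheduler would wire the hook; the my_* names are illustrative, and the remaining mandatory hooks and the elv_register() call are omitted:

#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/bio.h>

/* Illustrative only: account the merge of @bio into @rq for this queue. */
static void my_bio_merged(struct request_queue *q, struct request *rq,
                          struct bio *bio)
{
        /* e.g. bump a per-group "merged" statistic keyed off rq and bio */
}

static struct elevator_type my_iosched = {
        .ops = {
                .elevator_bio_merged_fn = my_bio_merged,
                /* dispatch/add/completion hooks omitted in this sketch */
        },
        .elevator_name  = "my-iosched",
        .elevator_owner = THIS_MODULE,
};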
diff --git a/include/linux/fs.h b/include/linux/fs.h index 44f35aea2f1f..f30970c97acf 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h | |||
| @@ -651,6 +651,7 @@ struct block_device { | |||
| 651 | int bd_openers; | 651 | int bd_openers; |
| 652 | struct mutex bd_mutex; /* open/close mutex */ | 652 | struct mutex bd_mutex; /* open/close mutex */ |
| 653 | struct list_head bd_inodes; | 653 | struct list_head bd_inodes; |
| 654 | void * bd_claiming; | ||
| 654 | void * bd_holder; | 655 | void * bd_holder; |
| 655 | int bd_holders; | 656 | int bd_holders; |
| 656 | #ifdef CONFIG_SYSFS | 657 | #ifdef CONFIG_SYSFS |
diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 36520ded3e06..eb38a2c645f6 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h | |||
| @@ -96,8 +96,10 @@ static inline void inode_sync_wait(struct inode *inode) | |||
| 96 | /* | 96 | /* |
| 97 | * mm/page-writeback.c | 97 | * mm/page-writeback.c |
| 98 | */ | 98 | */ |
| 99 | void laptop_io_completion(void); | 99 | void laptop_io_completion(struct backing_dev_info *info); |
| 100 | void laptop_sync_completion(void); | 100 | void laptop_sync_completion(void); |
| 101 | void laptop_mode_sync(struct work_struct *work); | ||
| 102 | void laptop_mode_timer_fn(unsigned long data); | ||
| 101 | void throttle_vm_writeout(gfp_t gfp_mask); | 103 | void throttle_vm_writeout(gfp_t gfp_mask); |
| 102 | 104 | ||
| 103 | /* These are exported to sysctl. */ | 105 | /* These are exported to sysctl. */ |
diff --git a/init/Kconfig b/init/Kconfig index eb77e8ccde1c..087c14f3c595 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
| @@ -612,6 +612,33 @@ config RT_GROUP_SCHED | |||
| 612 | 612 | ||
| 613 | endif #CGROUP_SCHED | 613 | endif #CGROUP_SCHED |
| 614 | 614 | ||
| 615 | config BLK_CGROUP | ||
| 616 | tristate "Block IO controller" | ||
| 617 | depends on CGROUPS && BLOCK | ||
| 618 | default n | ||
| 619 | ---help--- | ||
| 620 | Generic block IO controller cgroup interface. This is the common | ||
| 621 | cgroup interface which should be used by various IO controlling | ||
| 622 | policies. | ||
| 623 | |||
| 624 | Currently, CFQ IO scheduler uses it to recognize task groups and | ||
| 625 | control disk bandwidth allocation (proportional time slice allocation) | ||
| 626 | to such task groups. | ||
| 627 | |||
| 628 | This option only enables generic Block IO controller infrastructure. | ||
| 629 | One needs to also enable actual IO controlling logic in CFQ for it | ||
| 630 | to take effect. (CONFIG_CFQ_GROUP_IOSCHED=y). | ||
| 631 | |||
| 632 | See Documentation/cgroups/blkio-controller.txt for more information. | ||
| 633 | |||
| 634 | config DEBUG_BLK_CGROUP | ||
| 635 | bool "Enable Block IO controller debugging" | ||
| 636 | depends on BLK_CGROUP | ||
| 637 | default n | ||
| 638 | ---help--- | ||
| 639 | Enable some debugging help. Currently it exports additional stat | ||
| 640 | files in a cgroup which can be useful for debugging. | ||
| 641 | |||
| 615 | endif # CGROUPS | 642 | endif # CGROUPS |
| 616 | 643 | ||
| 617 | config MM_OWNER | 644 | config MM_OWNER |
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 5b496132c28a..906a0f718cb3 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c | |||
| @@ -41,6 +41,7 @@ unsigned long long __attribute__((weak)) sched_clock(void) | |||
| 41 | return (unsigned long long)(jiffies - INITIAL_JIFFIES) | 41 | return (unsigned long long)(jiffies - INITIAL_JIFFIES) |
| 42 | * (NSEC_PER_SEC / HZ); | 42 | * (NSEC_PER_SEC / HZ); |
| 43 | } | 43 | } |
| 44 | EXPORT_SYMBOL_GPL(sched_clock); | ||
| 44 | 45 | ||
| 45 | static __read_mostly int sched_clock_running; | 46 | static __read_mostly int sched_clock_running; |
| 46 | 47 | ||
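sched_clock() is exported so that the cgroup timestamping added to struct request can also work when CFQ and the block controller are built as modules. A hedged sketch of where a scheduler would take the two stamps and what they yield; the my_* hook names are illustrative:

#include <linux/blkdev.h>

/*
 * Illustrative only: stamp a request when it is queued and when it is handed
 * to the hardware, using the helpers added to blkdev.h above.
 */
static void my_request_queued(struct request *rq)
{
        set_start_time_ns(rq);
}

static void my_request_dispatched(struct request *rq)
{
        set_io_start_time_ns(rq);
}

/*
 * The difference between the two stamps is the time the request spent
 * waiting in the scheduler (both helpers return 0 with CONFIG_BLK_CGROUP off).
 */
static u64 my_queue_delay_ns(struct request *rq)
{
        return rq_io_start_time_ns(rq) - rq_start_time_ns(rq);
}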
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0b19943ecf8b..d0f2b3765f8d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
| @@ -683,10 +683,6 @@ void throttle_vm_writeout(gfp_t gfp_mask) | |||
| 683 | } | 683 | } |
| 684 | } | 684 | } |
| 685 | 685 | ||
| 686 | static void laptop_timer_fn(unsigned long unused); | ||
| 687 | |||
| 688 | static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); | ||
| 689 | |||
| 690 | /* | 686 | /* |
| 691 | * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs | 687 | * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs |
| 692 | */ | 688 | */ |
| @@ -697,21 +693,19 @@ int dirty_writeback_centisecs_handler(ctl_table *table, int write, | |||
| 697 | return 0; | 693 | return 0; |
| 698 | } | 694 | } |
| 699 | 695 | ||
| 700 | static void do_laptop_sync(struct work_struct *work) | 696 | void laptop_mode_timer_fn(unsigned long data) |
| 701 | { | 697 | { |
| 702 | wakeup_flusher_threads(0); | 698 | struct request_queue *q = (struct request_queue *)data; |
| 703 | kfree(work); | 699 | int nr_pages = global_page_state(NR_FILE_DIRTY) + |
| 704 | } | 700 | global_page_state(NR_UNSTABLE_NFS); |
| 705 | 701 | ||
| 706 | static void laptop_timer_fn(unsigned long unused) | 702 | /* |
| 707 | { | 703 | * We want to write everything out, not just down to the dirty |
| 708 | struct work_struct *work; | 704 | * threshold |
| 705 | */ | ||
| 709 | 706 | ||
| 710 | work = kmalloc(sizeof(*work), GFP_ATOMIC); | 707 | if (bdi_has_dirty_io(&q->backing_dev_info)) |
| 711 | if (work) { | 708 | bdi_start_writeback(&q->backing_dev_info, NULL, nr_pages); |
| 712 | INIT_WORK(work, do_laptop_sync); | ||
| 713 | schedule_work(work); | ||
| 714 | } | ||
| 715 | } | 709 | } |
| 716 | 710 | ||
| 717 | /* | 711 | /* |
| @@ -719,9 +713,9 @@ static void laptop_timer_fn(unsigned long unused) | |||
| 719 | * of all dirty data a few seconds from now. If the flush is already scheduled | 713 | * of all dirty data a few seconds from now. If the flush is already scheduled |
| 720 | * then push it back - the user is still using the disk. | 714 | * then push it back - the user is still using the disk. |
| 721 | */ | 715 | */ |
| 722 | void laptop_io_completion(void) | 716 | void laptop_io_completion(struct backing_dev_info *info) |
| 723 | { | 717 | { |
| 724 | mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode); | 718 | mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode); |
| 725 | } | 719 | } |
| 726 | 720 | ||
| 727 | /* | 721 | /* |
| @@ -731,7 +725,14 @@ void laptop_io_completion(void) | |||
| 731 | */ | 725 | */ |
| 732 | void laptop_sync_completion(void) | 726 | void laptop_sync_completion(void) |
| 733 | { | 727 | { |
| 734 | del_timer(&laptop_mode_wb_timer); | 728 | struct backing_dev_info *bdi; |
| 729 | |||
| 730 | rcu_read_lock(); | ||
| 731 | |||
| 732 | list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) | ||
| 733 | del_timer(&bdi->laptop_mode_wb_timer); | ||
| 734 | |||
| 735 | rcu_read_unlock(); | ||
| 735 | } | 736 | } |
| 736 | 737 | ||
| 737 | /* | 738 | /* |
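With the timer moved from a single global into struct backing_dev_info, every request queue owns its own laptop-mode writeback timer. The block core (outside this hunk) is expected to initialize it with laptop_mode_timer_fn and to re-arm it from I/O completion via laptop_io_completion(); the sketch below stands in for that wiring and its function names are illustrative:

#include <linux/blkdev.h>
#include <linux/timer.h>
#include <linux/writeback.h>

/* Illustrative only: done once when the request queue is set up. */
static void my_queue_init_laptop_timer(struct request_queue *q)
{
        setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
                    laptop_mode_timer_fn, (unsigned long)q);
}

/* Illustrative only: called from request completion. */
static void my_request_completed(struct request *rq)
{
        if (laptop_mode)
                laptop_io_completion(&rq->q->backing_dev_info);
}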
diff --git a/mm/swapfile.c b/mm/swapfile.c index 6cd0a8f90dc7..eb086e0f4dcc 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
| @@ -139,7 +139,8 @@ static int discard_swap(struct swap_info_struct *si) | |||
| 139 | nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); | 139 | nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); |
| 140 | if (nr_blocks) { | 140 | if (nr_blocks) { |
| 141 | err = blkdev_issue_discard(si->bdev, start_block, | 141 | err = blkdev_issue_discard(si->bdev, start_block, |
| 142 | nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER); | 142 | nr_blocks, GFP_KERNEL, |
| 143 | BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); | ||
| 143 | if (err) | 144 | if (err) |
| 144 | return err; | 145 | return err; |
| 145 | cond_resched(); | 146 | cond_resched(); |
| @@ -150,7 +151,8 @@ static int discard_swap(struct swap_info_struct *si) | |||
| 150 | nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); | 151 | nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); |
| 151 | 152 | ||
| 152 | err = blkdev_issue_discard(si->bdev, start_block, | 153 | err = blkdev_issue_discard(si->bdev, start_block, |
| 153 | nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER); | 154 | nr_blocks, GFP_KERNEL, |
| 155 | BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); | ||
| 154 | if (err) | 156 | if (err) |
| 155 | break; | 157 | break; |
| 156 | 158 | ||
| @@ -189,7 +191,8 @@ static void discard_swap_cluster(struct swap_info_struct *si, | |||
| 189 | start_block <<= PAGE_SHIFT - 9; | 191 | start_block <<= PAGE_SHIFT - 9; |
| 190 | nr_blocks <<= PAGE_SHIFT - 9; | 192 | nr_blocks <<= PAGE_SHIFT - 9; |
| 191 | if (blkdev_issue_discard(si->bdev, start_block, | 193 | if (blkdev_issue_discard(si->bdev, start_block, |
| 192 | nr_blocks, GFP_NOIO, DISCARD_FL_BARRIER)) | 194 | nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT | |
| 195 | BLKDEV_IFL_BARRIER)) | ||
| 193 | break; | 196 | break; |
| 194 | } | 197 | } |
| 195 | 198 | ||
