Diffstat (limited to 'block')
-rw-r--r--block/Kconfig23
-rw-r--r--block/Kconfig.iosched16
-rw-r--r--block/Makefile2
-rw-r--r--block/blk-barrier.c147
-rw-r--r--block/blk-cgroup.c791
-rw-r--r--block/blk-cgroup.h178
-rw-r--r--block/blk-core.c31
-rw-r--r--block/blk-lib.c233
-rw-r--r--block/cfq-iosched.c81
-rw-r--r--block/elevator.c11
-rw-r--r--block/genhd.c2
-rw-r--r--block/ioctl.c2
12 files changed, 1252 insertions, 265 deletions
diff --git a/block/Kconfig b/block/Kconfig
index f9e89f4d94bb..9be0b56eaee1 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -77,29 +77,6 @@ config BLK_DEV_INTEGRITY
77 T10/SCSI Data Integrity Field or the T13/ATA External Path 77 T10/SCSI Data Integrity Field or the T13/ATA External Path
78 Protection. If in doubt, say N. 78 Protection. If in doubt, say N.
79 79
80config BLK_CGROUP
81 tristate "Block cgroup support"
82 depends on CGROUPS
83 depends on CFQ_GROUP_IOSCHED
84 default n
85 ---help---
86 Generic block IO controller cgroup interface. This is the common
87 cgroup interface which should be used by various IO controlling
88 policies.
89
90 Currently, CFQ IO scheduler uses it to recognize task groups and
91 control disk bandwidth allocation (proportional time slice allocation)
92 to such task groups.
93
94config DEBUG_BLK_CGROUP
95 bool
96 depends on BLK_CGROUP
97 default n
98 ---help---
99 Enable some debugging help. Currently it stores the cgroup path
100 in the blk group which can be used by cfq for tracing various
101 group related activity.
102
103endif # BLOCK 80endif # BLOCK
104 81
105config BLOCK_COMPAT 82config BLOCK_COMPAT
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index fc71cf071fb2..3199b76f795d 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -23,7 +23,8 @@ config IOSCHED_DEADLINE
23 23
24config IOSCHED_CFQ 24config IOSCHED_CFQ
25 tristate "CFQ I/O scheduler" 25 tristate "CFQ I/O scheduler"
26 select BLK_CGROUP if CFQ_GROUP_IOSCHED 26 # If BLK_CGROUP is a module, CFQ has to be built as module.
27 depends on (BLK_CGROUP=m && m) || !BLK_CGROUP || BLK_CGROUP=y
27 default y 28 default y
28 ---help--- 29 ---help---
29 The CFQ I/O scheduler tries to distribute bandwidth equally 30 The CFQ I/O scheduler tries to distribute bandwidth equally
@@ -33,22 +34,15 @@ config IOSCHED_CFQ
33 34
34 This is the default I/O scheduler. 35 This is the default I/O scheduler.
35 36
37 Note: If BLK_CGROUP=m, then CFQ can be built only as module.
38
36config CFQ_GROUP_IOSCHED 39config CFQ_GROUP_IOSCHED
37 bool "CFQ Group Scheduling support" 40 bool "CFQ Group Scheduling support"
38 depends on IOSCHED_CFQ && CGROUPS 41 depends on IOSCHED_CFQ && BLK_CGROUP
39 default n 42 default n
40 ---help--- 43 ---help---
41 Enable group IO scheduling in CFQ. 44 Enable group IO scheduling in CFQ.
42 45
43config DEBUG_CFQ_IOSCHED
44 bool "Debug CFQ Scheduling"
45 depends on CFQ_GROUP_IOSCHED
46 select DEBUG_BLK_CGROUP
47 default n
48 ---help---
49 Enable CFQ IO scheduling debugging in CFQ. Currently it makes
50 blktrace output more verbose.
51
52choice 46choice
53 prompt "Default I/O scheduler" 47 prompt "Default I/O scheduler"
54 default DEFAULT_CFQ 48 default DEFAULT_CFQ
diff --git a/block/Makefile b/block/Makefile
index cb2d515ebd6e..0bb499a739cd 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,7 +5,7 @@
5obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ 5obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
6 blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \ 6 blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \
7 blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ 7 blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
8 blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o 8 blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o
9 9
10obj-$(CONFIG_BLK_DEV_BSG) += bsg.o 10obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
11obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o 11obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 6d88544b677f..0d710c9d403b 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -286,26 +286,31 @@ static void bio_end_empty_barrier(struct bio *bio, int err)
286 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); 286 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
287 clear_bit(BIO_UPTODATE, &bio->bi_flags); 287 clear_bit(BIO_UPTODATE, &bio->bi_flags);
288 } 288 }
289 289 if (bio->bi_private)
290 complete(bio->bi_private); 290 complete(bio->bi_private);
291 bio_put(bio);
291} 292}
292 293
293/** 294/**
294 * blkdev_issue_flush - queue a flush 295 * blkdev_issue_flush - queue a flush
295 * @bdev: blockdev to issue flush for 296 * @bdev: blockdev to issue flush for
297 * @gfp_mask: memory allocation flags (for bio_alloc)
296 * @error_sector: error sector 298 * @error_sector: error sector
299 * @flags: BLKDEV_IFL_* flags to control behaviour
297 * 300 *
298 * Description: 301 * Description:
299 * Issue a flush for the block device in question. Caller can supply 302 * Issue a flush for the block device in question. Caller can supply
300 * room for storing the error offset in case of a flush error, if they 303 * room for storing the error offset in case of a flush error, if they
 301 * wish to. 304 * wish to. If the WAIT flag is not passed, the caller may only
 305 * assume that the request was queued internally for later handling.
302 */ 306 */
303int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) 307int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
308 sector_t *error_sector, unsigned long flags)
304{ 309{
305 DECLARE_COMPLETION_ONSTACK(wait); 310 DECLARE_COMPLETION_ONSTACK(wait);
306 struct request_queue *q; 311 struct request_queue *q;
307 struct bio *bio; 312 struct bio *bio;
308 int ret; 313 int ret = 0;
309 314
310 if (bdev->bd_disk == NULL) 315 if (bdev->bd_disk == NULL)
311 return -ENXIO; 316 return -ENXIO;
@@ -314,23 +319,25 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
314 if (!q) 319 if (!q)
315 return -ENXIO; 320 return -ENXIO;
316 321
317 bio = bio_alloc(GFP_KERNEL, 0); 322 bio = bio_alloc(gfp_mask, 0);
318 bio->bi_end_io = bio_end_empty_barrier; 323 bio->bi_end_io = bio_end_empty_barrier;
319 bio->bi_private = &wait;
320 bio->bi_bdev = bdev; 324 bio->bi_bdev = bdev;
321 submit_bio(WRITE_BARRIER, bio); 325 if (test_bit(BLKDEV_WAIT, &flags))
322 326 bio->bi_private = &wait;
323 wait_for_completion(&wait);
324 327
325 /* 328 bio_get(bio);
326 * The driver must store the error location in ->bi_sector, if 329 submit_bio(WRITE_BARRIER, bio);
327 * it supports it. For non-stacked drivers, this should be copied 330 if (test_bit(BLKDEV_WAIT, &flags)) {
328 * from blk_rq_pos(rq). 331 wait_for_completion(&wait);
329 */ 332 /*
330 if (error_sector) 333 * The driver must store the error location in ->bi_sector, if
331 *error_sector = bio->bi_sector; 334 * it supports it. For non-stacked drivers, this should be
335 * copied from blk_rq_pos(rq).
336 */
337 if (error_sector)
338 *error_sector = bio->bi_sector;
339 }
332 340
333 ret = 0;
334 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 341 if (bio_flagged(bio, BIO_EOPNOTSUPP))
335 ret = -EOPNOTSUPP; 342 ret = -EOPNOTSUPP;
336 else if (!bio_flagged(bio, BIO_UPTODATE)) 343 else if (!bio_flagged(bio, BIO_UPTODATE))
@@ -340,107 +347,3 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
340 return ret; 347 return ret;
341} 348}
342EXPORT_SYMBOL(blkdev_issue_flush); 349EXPORT_SYMBOL(blkdev_issue_flush);
343
344static void blkdev_discard_end_io(struct bio *bio, int err)
345{
346 if (err) {
347 if (err == -EOPNOTSUPP)
348 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
349 clear_bit(BIO_UPTODATE, &bio->bi_flags);
350 }
351
352 if (bio->bi_private)
353 complete(bio->bi_private);
354 __free_page(bio_page(bio));
355
356 bio_put(bio);
357}
358
359/**
360 * blkdev_issue_discard - queue a discard
361 * @bdev: blockdev to issue discard for
362 * @sector: start sector
363 * @nr_sects: number of sectors to discard
364 * @gfp_mask: memory allocation flags (for bio_alloc)
365 * @flags: DISCARD_FL_* flags to control behaviour
366 *
367 * Description:
368 * Issue a discard request for the sectors in question.
369 */
370int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
371 sector_t nr_sects, gfp_t gfp_mask, int flags)
372{
373 DECLARE_COMPLETION_ONSTACK(wait);
374 struct request_queue *q = bdev_get_queue(bdev);
375 int type = flags & DISCARD_FL_BARRIER ?
376 DISCARD_BARRIER : DISCARD_NOBARRIER;
377 struct bio *bio;
378 struct page *page;
379 int ret = 0;
380
381 if (!q)
382 return -ENXIO;
383
384 if (!blk_queue_discard(q))
385 return -EOPNOTSUPP;
386
387 while (nr_sects && !ret) {
388 unsigned int sector_size = q->limits.logical_block_size;
389 unsigned int max_discard_sectors =
390 min(q->limits.max_discard_sectors, UINT_MAX >> 9);
391
392 bio = bio_alloc(gfp_mask, 1);
393 if (!bio)
394 goto out;
395 bio->bi_sector = sector;
396 bio->bi_end_io = blkdev_discard_end_io;
397 bio->bi_bdev = bdev;
398 if (flags & DISCARD_FL_WAIT)
399 bio->bi_private = &wait;
400
401 /*
402 * Add a zeroed one-sector payload as that's what
403 * our current implementations need. If we'll ever need
404 * more the interface will need revisiting.
405 */
406 page = alloc_page(gfp_mask | __GFP_ZERO);
407 if (!page)
408 goto out_free_bio;
409 if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size)
410 goto out_free_page;
411
412 /*
413 * And override the bio size - the way discard works we
414 * touch many more blocks on disk than the actual payload
415 * length.
416 */
417 if (nr_sects > max_discard_sectors) {
418 bio->bi_size = max_discard_sectors << 9;
419 nr_sects -= max_discard_sectors;
420 sector += max_discard_sectors;
421 } else {
422 bio->bi_size = nr_sects << 9;
423 nr_sects = 0;
424 }
425
426 bio_get(bio);
427 submit_bio(type, bio);
428
429 if (flags & DISCARD_FL_WAIT)
430 wait_for_completion(&wait);
431
432 if (bio_flagged(bio, BIO_EOPNOTSUPP))
433 ret = -EOPNOTSUPP;
434 else if (!bio_flagged(bio, BIO_UPTODATE))
435 ret = -EIO;
436 bio_put(bio);
437 }
438 return ret;
439out_free_page:
440 __free_page(page);
441out_free_bio:
442 bio_put(bio);
443out:
444 return -ENOMEM;
445}
446EXPORT_SYMBOL(blkdev_issue_discard);
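(The discard helper removed here reappears unchanged in the new block/blk-lib.c further down.) For reference, a minimal sketch of how a caller adapts to the widened blkdev_issue_flush() prototype above; it assumes the BLKDEV_IFL_WAIT flag from the new BLKDEV_IFL_* set and an already opened struct block_device, and is illustrative rather than part of the patch:

#include <linux/blkdev.h>
#include <linux/gfp.h>

/* Illustrative only: flush the device's write cache and wait for it. */
static int example_flush_and_wait(struct block_device *bdev)
{
	sector_t error_sector;
	int ret;

	/*
	 * With BLKDEV_IFL_WAIT the call blocks until the empty barrier
	 * completes; without it the bio is merely submitted and no
	 * completion guarantee is made to the caller.
	 */
	ret = blkdev_issue_flush(bdev, GFP_KERNEL, &error_sector,
				 BLKDEV_IFL_WAIT);
	if (ret == -EOPNOTSUPP)
		ret = 0;	/* no volatile cache to flush */
	return ret;
}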
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 2cc682b860ea..a6809645d212 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -15,8 +15,12 @@
15#include <linux/kdev_t.h> 15#include <linux/kdev_t.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/err.h> 17#include <linux/err.h>
18#include <linux/blkdev.h>
18#include <linux/slab.h> 19#include <linux/slab.h>
19#include "blk-cgroup.h" 20#include "blk-cgroup.h"
21#include <linux/genhd.h>
22
23#define MAX_KEY_LEN 100
20 24
21static DEFINE_SPINLOCK(blkio_list_lock); 25static DEFINE_SPINLOCK(blkio_list_lock);
22static LIST_HEAD(blkio_list); 26static LIST_HEAD(blkio_list);
@@ -49,6 +53,32 @@ struct cgroup_subsys blkio_subsys = {
49}; 53};
50EXPORT_SYMBOL_GPL(blkio_subsys); 54EXPORT_SYMBOL_GPL(blkio_subsys);
51 55
56static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
57 struct blkio_policy_node *pn)
58{
59 list_add(&pn->node, &blkcg->policy_list);
60}
61
62/* Must be called with blkcg->lock held */
63static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
64{
65 list_del(&pn->node);
66}
67
68/* Must be called with blkcg->lock held */
69static struct blkio_policy_node *
70blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev)
71{
72 struct blkio_policy_node *pn;
73
74 list_for_each_entry(pn, &blkcg->policy_list, node) {
75 if (pn->dev == dev)
76 return pn;
77 }
78
79 return NULL;
80}
81
52struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) 82struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
53{ 83{
54 return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), 84 return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
@@ -56,13 +86,259 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
56} 86}
57EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); 87EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
58 88
59void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, 89/*
60 unsigned long time, unsigned long sectors) 90 * Add to the appropriate stat variable depending on the request type.
91 * This should be called with the blkg->stats_lock held.
92 */
93static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
94 bool sync)
95{
96 if (direction)
97 stat[BLKIO_STAT_WRITE] += add;
98 else
99 stat[BLKIO_STAT_READ] += add;
100 if (sync)
101 stat[BLKIO_STAT_SYNC] += add;
102 else
103 stat[BLKIO_STAT_ASYNC] += add;
104}
105
106/*
107 * Decrements the appropriate stat variable if non-zero depending on the
108 * request type. Panics on value being zero.
109 * This should be called with the blkg->stats_lock held.
110 */
111static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
112{
113 if (direction) {
114 BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
115 stat[BLKIO_STAT_WRITE]--;
116 } else {
117 BUG_ON(stat[BLKIO_STAT_READ] == 0);
118 stat[BLKIO_STAT_READ]--;
119 }
120 if (sync) {
121 BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
122 stat[BLKIO_STAT_SYNC]--;
123 } else {
124 BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
125 stat[BLKIO_STAT_ASYNC]--;
126 }
127}
128
129#ifdef CONFIG_DEBUG_BLK_CGROUP
130/* This should be called with the blkg->stats_lock held. */
131static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
132 struct blkio_group *curr_blkg)
133{
134 if (blkio_blkg_waiting(&blkg->stats))
135 return;
136 if (blkg == curr_blkg)
137 return;
138 blkg->stats.start_group_wait_time = sched_clock();
139 blkio_mark_blkg_waiting(&blkg->stats);
140}
141
142/* This should be called with the blkg->stats_lock held. */
143static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
144{
145 unsigned long long now;
146
147 if (!blkio_blkg_waiting(stats))
148 return;
149
150 now = sched_clock();
151 if (time_after64(now, stats->start_group_wait_time))
152 stats->group_wait_time += now - stats->start_group_wait_time;
153 blkio_clear_blkg_waiting(stats);
154}
155
156/* This should be called with the blkg->stats_lock held. */
157static void blkio_end_empty_time(struct blkio_group_stats *stats)
158{
159 unsigned long long now;
160
161 if (!blkio_blkg_empty(stats))
162 return;
163
164 now = sched_clock();
165 if (time_after64(now, stats->start_empty_time))
166 stats->empty_time += now - stats->start_empty_time;
167 blkio_clear_blkg_empty(stats);
168}
169
170void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
171{
172 unsigned long flags;
173
174 spin_lock_irqsave(&blkg->stats_lock, flags);
175 BUG_ON(blkio_blkg_idling(&blkg->stats));
176 blkg->stats.start_idle_time = sched_clock();
177 blkio_mark_blkg_idling(&blkg->stats);
178 spin_unlock_irqrestore(&blkg->stats_lock, flags);
179}
180EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);
181
182void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
183{
184 unsigned long flags;
185 unsigned long long now;
186 struct blkio_group_stats *stats;
187
188 spin_lock_irqsave(&blkg->stats_lock, flags);
189 stats = &blkg->stats;
190 if (blkio_blkg_idling(stats)) {
191 now = sched_clock();
192 if (time_after64(now, stats->start_idle_time))
193 stats->idle_time += now - stats->start_idle_time;
194 blkio_clear_blkg_idling(stats);
195 }
196 spin_unlock_irqrestore(&blkg->stats_lock, flags);
197}
198EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);
199
200void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
201{
202 unsigned long flags;
203 struct blkio_group_stats *stats;
204
205 spin_lock_irqsave(&blkg->stats_lock, flags);
206 stats = &blkg->stats;
207 stats->avg_queue_size_sum +=
208 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
209 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
210 stats->avg_queue_size_samples++;
211 blkio_update_group_wait_time(stats);
212 spin_unlock_irqrestore(&blkg->stats_lock, flags);
213}
214EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);
215
216void blkiocg_set_start_empty_time(struct blkio_group *blkg)
217{
218 unsigned long flags;
219 struct blkio_group_stats *stats;
220
221 spin_lock_irqsave(&blkg->stats_lock, flags);
222 stats = &blkg->stats;
223
224 if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
225 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
226 spin_unlock_irqrestore(&blkg->stats_lock, flags);
227 return;
228 }
229
230 /*
231 * group is already marked empty. This can happen if cfqq got new
232 * request in parent group and moved to this group while being added
233 * to service tree. Just ignore the event and move on.
234 */
 235 if (blkio_blkg_empty(stats)) {
236 spin_unlock_irqrestore(&blkg->stats_lock, flags);
237 return;
238 }
239
240 stats->start_empty_time = sched_clock();
241 blkio_mark_blkg_empty(stats);
242 spin_unlock_irqrestore(&blkg->stats_lock, flags);
243}
244EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);
245
246void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
247 unsigned long dequeue)
248{
249 blkg->stats.dequeue += dequeue;
250}
251EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
252#else
253static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
254 struct blkio_group *curr_blkg) {}
255static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {}
256#endif
257
258void blkiocg_update_io_add_stats(struct blkio_group *blkg,
259 struct blkio_group *curr_blkg, bool direction,
260 bool sync)
261{
262 unsigned long flags;
263
264 spin_lock_irqsave(&blkg->stats_lock, flags);
265 blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
266 sync);
267 blkio_end_empty_time(&blkg->stats);
268 blkio_set_start_group_wait_time(blkg, curr_blkg);
269 spin_unlock_irqrestore(&blkg->stats_lock, flags);
270}
271EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);
272
273void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
274 bool direction, bool sync)
275{
276 unsigned long flags;
277
278 spin_lock_irqsave(&blkg->stats_lock, flags);
279 blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED],
280 direction, sync);
281 spin_unlock_irqrestore(&blkg->stats_lock, flags);
282}
283EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
284
285void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time)
286{
287 unsigned long flags;
288
289 spin_lock_irqsave(&blkg->stats_lock, flags);
290 blkg->stats.time += time;
291 spin_unlock_irqrestore(&blkg->stats_lock, flags);
292}
293EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
294
295void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
296 uint64_t bytes, bool direction, bool sync)
61{ 297{
62 blkg->time += time; 298 struct blkio_group_stats *stats;
63 blkg->sectors += sectors; 299 unsigned long flags;
300
301 spin_lock_irqsave(&blkg->stats_lock, flags);
302 stats = &blkg->stats;
303 stats->sectors += bytes >> 9;
304 blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction,
305 sync);
306 blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes,
307 direction, sync);
308 spin_unlock_irqrestore(&blkg->stats_lock, flags);
64} 309}
65EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_stats); 310EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
311
312void blkiocg_update_completion_stats(struct blkio_group *blkg,
313 uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
314{
315 struct blkio_group_stats *stats;
316 unsigned long flags;
317 unsigned long long now = sched_clock();
318
319 spin_lock_irqsave(&blkg->stats_lock, flags);
320 stats = &blkg->stats;
321 if (time_after64(now, io_start_time))
322 blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
323 now - io_start_time, direction, sync);
324 if (time_after64(io_start_time, start_time))
325 blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
326 io_start_time - start_time, direction, sync);
327 spin_unlock_irqrestore(&blkg->stats_lock, flags);
328}
329EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
330
331void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
332 bool sync)
333{
334 unsigned long flags;
335
336 spin_lock_irqsave(&blkg->stats_lock, flags);
337 blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction,
338 sync);
339 spin_unlock_irqrestore(&blkg->stats_lock, flags);
340}
341EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
66 342
67void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 343void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
68 struct blkio_group *blkg, void *key, dev_t dev) 344 struct blkio_group *blkg, void *key, dev_t dev)
@@ -70,14 +346,13 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
70 unsigned long flags; 346 unsigned long flags;
71 347
72 spin_lock_irqsave(&blkcg->lock, flags); 348 spin_lock_irqsave(&blkcg->lock, flags);
349 spin_lock_init(&blkg->stats_lock);
73 rcu_assign_pointer(blkg->key, key); 350 rcu_assign_pointer(blkg->key, key);
74 blkg->blkcg_id = css_id(&blkcg->css); 351 blkg->blkcg_id = css_id(&blkcg->css);
75 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); 352 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
76 spin_unlock_irqrestore(&blkcg->lock, flags); 353 spin_unlock_irqrestore(&blkcg->lock, flags);
77#ifdef CONFIG_DEBUG_BLK_CGROUP
78 /* Need to take css reference ? */ 354 /* Need to take css reference ? */
79 cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); 355 cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
80#endif
81 blkg->dev = dev; 356 blkg->dev = dev;
82} 357}
83EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group); 358EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);
@@ -101,17 +376,16 @@ int blkiocg_del_blkio_group(struct blkio_group *blkg)
101 376
102 rcu_read_lock(); 377 rcu_read_lock();
103 css = css_lookup(&blkio_subsys, blkg->blkcg_id); 378 css = css_lookup(&blkio_subsys, blkg->blkcg_id);
104 if (!css) 379 if (css) {
105 goto out; 380 blkcg = container_of(css, struct blkio_cgroup, css);
106 381 spin_lock_irqsave(&blkcg->lock, flags);
107 blkcg = container_of(css, struct blkio_cgroup, css); 382 if (!hlist_unhashed(&blkg->blkcg_node)) {
108 spin_lock_irqsave(&blkcg->lock, flags); 383 __blkiocg_del_blkio_group(blkg);
109 if (!hlist_unhashed(&blkg->blkcg_node)) { 384 ret = 0;
110 __blkiocg_del_blkio_group(blkg); 385 }
111 ret = 0; 386 spin_unlock_irqrestore(&blkcg->lock, flags);
112 } 387 }
113 spin_unlock_irqrestore(&blkcg->lock, flags); 388
114out:
115 rcu_read_unlock(); 389 rcu_read_unlock();
116 return ret; 390 return ret;
117} 391}
@@ -154,6 +428,7 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
154 struct blkio_group *blkg; 428 struct blkio_group *blkg;
155 struct hlist_node *n; 429 struct hlist_node *n;
156 struct blkio_policy_type *blkiop; 430 struct blkio_policy_type *blkiop;
431 struct blkio_policy_node *pn;
157 432
158 if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) 433 if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
159 return -EINVAL; 434 return -EINVAL;
@@ -162,7 +437,13 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
162 spin_lock(&blkio_list_lock); 437 spin_lock(&blkio_list_lock);
163 spin_lock_irq(&blkcg->lock); 438 spin_lock_irq(&blkcg->lock);
164 blkcg->weight = (unsigned int)val; 439 blkcg->weight = (unsigned int)val;
440
165 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { 441 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
442 pn = blkio_policy_search_node(blkcg, blkg->dev);
443
444 if (pn)
445 continue;
446
166 list_for_each_entry(blkiop, &blkio_list, list) 447 list_for_each_entry(blkiop, &blkio_list, list)
167 blkiop->ops.blkio_update_group_weight_fn(blkg, 448 blkiop->ops.blkio_update_group_weight_fn(blkg,
168 blkcg->weight); 449 blkcg->weight);
@@ -172,13 +453,154 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
172 return 0; 453 return 0;
173} 454}
174 455
175#define SHOW_FUNCTION_PER_GROUP(__VAR) \ 456static int
457blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
458{
459 struct blkio_cgroup *blkcg;
460 struct blkio_group *blkg;
461 struct blkio_group_stats *stats;
462 struct hlist_node *n;
463 uint64_t queued[BLKIO_STAT_TOTAL];
464 int i;
465#ifdef CONFIG_DEBUG_BLK_CGROUP
466 bool idling, waiting, empty;
467 unsigned long long now = sched_clock();
468#endif
469
470 blkcg = cgroup_to_blkio_cgroup(cgroup);
471 spin_lock_irq(&blkcg->lock);
472 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
473 spin_lock(&blkg->stats_lock);
474 stats = &blkg->stats;
475#ifdef CONFIG_DEBUG_BLK_CGROUP
476 idling = blkio_blkg_idling(stats);
477 waiting = blkio_blkg_waiting(stats);
478 empty = blkio_blkg_empty(stats);
479#endif
480 for (i = 0; i < BLKIO_STAT_TOTAL; i++)
481 queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
482 memset(stats, 0, sizeof(struct blkio_group_stats));
483 for (i = 0; i < BLKIO_STAT_TOTAL; i++)
484 stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
485#ifdef CONFIG_DEBUG_BLK_CGROUP
486 if (idling) {
487 blkio_mark_blkg_idling(stats);
488 stats->start_idle_time = now;
489 }
490 if (waiting) {
491 blkio_mark_blkg_waiting(stats);
492 stats->start_group_wait_time = now;
493 }
494 if (empty) {
495 blkio_mark_blkg_empty(stats);
496 stats->start_empty_time = now;
497 }
498#endif
499 spin_unlock(&blkg->stats_lock);
500 }
501 spin_unlock_irq(&blkcg->lock);
502 return 0;
503}
504
505static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str,
506 int chars_left, bool diskname_only)
507{
508 snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev));
509 chars_left -= strlen(str);
510 if (chars_left <= 0) {
511 printk(KERN_WARNING
512 "Possibly incorrect cgroup stat display format");
513 return;
514 }
515 if (diskname_only)
516 return;
517 switch (type) {
518 case BLKIO_STAT_READ:
519 strlcat(str, " Read", chars_left);
520 break;
521 case BLKIO_STAT_WRITE:
522 strlcat(str, " Write", chars_left);
523 break;
524 case BLKIO_STAT_SYNC:
525 strlcat(str, " Sync", chars_left);
526 break;
527 case BLKIO_STAT_ASYNC:
528 strlcat(str, " Async", chars_left);
529 break;
530 case BLKIO_STAT_TOTAL:
531 strlcat(str, " Total", chars_left);
532 break;
533 default:
534 strlcat(str, " Invalid", chars_left);
535 }
536}
537
538static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
539 struct cgroup_map_cb *cb, dev_t dev)
540{
541 blkio_get_key_name(0, dev, str, chars_left, true);
542 cb->fill(cb, str, val);
543 return val;
544}
545
546/* This should be called with blkg->stats_lock held */
547static uint64_t blkio_get_stat(struct blkio_group *blkg,
548 struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
549{
550 uint64_t disk_total;
551 char key_str[MAX_KEY_LEN];
552 enum stat_sub_type sub_type;
553
554 if (type == BLKIO_STAT_TIME)
555 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
556 blkg->stats.time, cb, dev);
557 if (type == BLKIO_STAT_SECTORS)
558 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
559 blkg->stats.sectors, cb, dev);
560#ifdef CONFIG_DEBUG_BLK_CGROUP
561 if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
562 uint64_t sum = blkg->stats.avg_queue_size_sum;
563 uint64_t samples = blkg->stats.avg_queue_size_samples;
564 if (samples)
565 do_div(sum, samples);
566 else
567 sum = 0;
568 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev);
569 }
570 if (type == BLKIO_STAT_GROUP_WAIT_TIME)
571 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
572 blkg->stats.group_wait_time, cb, dev);
573 if (type == BLKIO_STAT_IDLE_TIME)
574 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
575 blkg->stats.idle_time, cb, dev);
576 if (type == BLKIO_STAT_EMPTY_TIME)
577 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
578 blkg->stats.empty_time, cb, dev);
579 if (type == BLKIO_STAT_DEQUEUE)
580 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
581 blkg->stats.dequeue, cb, dev);
582#endif
583
584 for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
585 sub_type++) {
586 blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
587 cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
588 }
589 disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
590 blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
591 blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
592 cb->fill(cb, key_str, disk_total);
593 return disk_total;
594}
595
596#define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total) \
176static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \ 597static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \
177 struct cftype *cftype, struct seq_file *m) \ 598 struct cftype *cftype, struct cgroup_map_cb *cb) \
178{ \ 599{ \
179 struct blkio_cgroup *blkcg; \ 600 struct blkio_cgroup *blkcg; \
180 struct blkio_group *blkg; \ 601 struct blkio_group *blkg; \
181 struct hlist_node *n; \ 602 struct hlist_node *n; \
603 uint64_t cgroup_total = 0; \
182 \ 604 \
183 if (!cgroup_lock_live_group(cgroup)) \ 605 if (!cgroup_lock_live_group(cgroup)) \
184 return -ENODEV; \ 606 return -ENODEV; \
@@ -186,50 +608,293 @@ static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \
186 blkcg = cgroup_to_blkio_cgroup(cgroup); \ 608 blkcg = cgroup_to_blkio_cgroup(cgroup); \
187 rcu_read_lock(); \ 609 rcu_read_lock(); \
188 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\ 610 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\
189 if (blkg->dev) \ 611 if (blkg->dev) { \
190 seq_printf(m, "%u:%u %lu\n", MAJOR(blkg->dev), \ 612 spin_lock_irq(&blkg->stats_lock); \
191 MINOR(blkg->dev), blkg->__VAR); \ 613 cgroup_total += blkio_get_stat(blkg, cb, \
614 blkg->dev, type); \
615 spin_unlock_irq(&blkg->stats_lock); \
616 } \
192 } \ 617 } \
618 if (show_total) \
619 cb->fill(cb, "Total", cgroup_total); \
193 rcu_read_unlock(); \ 620 rcu_read_unlock(); \
194 cgroup_unlock(); \ 621 cgroup_unlock(); \
195 return 0; \ 622 return 0; \
196} 623}
197 624
198SHOW_FUNCTION_PER_GROUP(time); 625SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0);
199SHOW_FUNCTION_PER_GROUP(sectors); 626SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0);
627SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1);
628SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1);
629SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1);
630SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1);
631SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1);
632SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1);
200#ifdef CONFIG_DEBUG_BLK_CGROUP 633#ifdef CONFIG_DEBUG_BLK_CGROUP
201SHOW_FUNCTION_PER_GROUP(dequeue); 634SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0);
635SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0);
636SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0);
637SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0);
638SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0);
202#endif 639#endif
203#undef SHOW_FUNCTION_PER_GROUP 640#undef SHOW_FUNCTION_PER_GROUP
204 641
205#ifdef CONFIG_DEBUG_BLK_CGROUP 642static int blkio_check_dev_num(dev_t dev)
206void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
207 unsigned long dequeue)
208{ 643{
209 blkg->dequeue += dequeue; 644 int part = 0;
645 struct gendisk *disk;
646
647 disk = get_gendisk(dev, &part);
648 if (!disk || part)
649 return -ENODEV;
650
651 return 0;
652}
653
654static int blkio_policy_parse_and_set(char *buf,
655 struct blkio_policy_node *newpn)
656{
657 char *s[4], *p, *major_s = NULL, *minor_s = NULL;
658 int ret;
659 unsigned long major, minor, temp;
660 int i = 0;
661 dev_t dev;
662
663 memset(s, 0, sizeof(s));
664
665 while ((p = strsep(&buf, " ")) != NULL) {
666 if (!*p)
667 continue;
668
669 s[i++] = p;
670
 671 /* Prevent too many fields from being input */
672 if (i == 3)
673 break;
674 }
675
676 if (i != 2)
677 return -EINVAL;
678
679 p = strsep(&s[0], ":");
680 if (p != NULL)
681 major_s = p;
682 else
683 return -EINVAL;
684
685 minor_s = s[0];
686 if (!minor_s)
687 return -EINVAL;
688
689 ret = strict_strtoul(major_s, 10, &major);
690 if (ret)
691 return -EINVAL;
692
693 ret = strict_strtoul(minor_s, 10, &minor);
694 if (ret)
695 return -EINVAL;
696
697 dev = MKDEV(major, minor);
698
699 ret = blkio_check_dev_num(dev);
700 if (ret)
701 return ret;
702
703 newpn->dev = dev;
704
705 if (s[1] == NULL)
706 return -EINVAL;
707
708 ret = strict_strtoul(s[1], 10, &temp);
709 if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) ||
710 temp > BLKIO_WEIGHT_MAX)
711 return -EINVAL;
712
713 newpn->weight = temp;
714
715 return 0;
716}
717
718unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
719 dev_t dev)
720{
721 struct blkio_policy_node *pn;
722
723 pn = blkio_policy_search_node(blkcg, dev);
724 if (pn)
725 return pn->weight;
726 else
727 return blkcg->weight;
728}
729EXPORT_SYMBOL_GPL(blkcg_get_weight);
730
731
732static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
733 const char *buffer)
734{
735 int ret = 0;
736 char *buf;
737 struct blkio_policy_node *newpn, *pn;
738 struct blkio_cgroup *blkcg;
739 struct blkio_group *blkg;
740 int keep_newpn = 0;
741 struct hlist_node *n;
742 struct blkio_policy_type *blkiop;
743
744 buf = kstrdup(buffer, GFP_KERNEL);
745 if (!buf)
746 return -ENOMEM;
747
748 newpn = kzalloc(sizeof(*newpn), GFP_KERNEL);
749 if (!newpn) {
750 ret = -ENOMEM;
751 goto free_buf;
752 }
753
754 ret = blkio_policy_parse_and_set(buf, newpn);
755 if (ret)
756 goto free_newpn;
757
758 blkcg = cgroup_to_blkio_cgroup(cgrp);
759
760 spin_lock_irq(&blkcg->lock);
761
762 pn = blkio_policy_search_node(blkcg, newpn->dev);
763 if (!pn) {
764 if (newpn->weight != 0) {
765 blkio_policy_insert_node(blkcg, newpn);
766 keep_newpn = 1;
767 }
768 spin_unlock_irq(&blkcg->lock);
769 goto update_io_group;
770 }
771
772 if (newpn->weight == 0) {
 773 /* weight == 0 means deleting a specific weight */
774 blkio_policy_delete_node(pn);
775 spin_unlock_irq(&blkcg->lock);
776 goto update_io_group;
777 }
778 spin_unlock_irq(&blkcg->lock);
779
780 pn->weight = newpn->weight;
781
782update_io_group:
783 /* update weight for each cfqg */
784 spin_lock(&blkio_list_lock);
785 spin_lock_irq(&blkcg->lock);
786
787 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
788 if (newpn->dev == blkg->dev) {
789 list_for_each_entry(blkiop, &blkio_list, list)
790 blkiop->ops.blkio_update_group_weight_fn(blkg,
791 newpn->weight ?
792 newpn->weight :
793 blkcg->weight);
794 }
795 }
796
797 spin_unlock_irq(&blkcg->lock);
798 spin_unlock(&blkio_list_lock);
799
800free_newpn:
801 if (!keep_newpn)
802 kfree(newpn);
803free_buf:
804 kfree(buf);
805 return ret;
806}
807
808static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft,
809 struct seq_file *m)
810{
811 struct blkio_cgroup *blkcg;
812 struct blkio_policy_node *pn;
813
814 seq_printf(m, "dev\tweight\n");
815
816 blkcg = cgroup_to_blkio_cgroup(cgrp);
817 if (!list_empty(&blkcg->policy_list)) {
818 spin_lock_irq(&blkcg->lock);
819 list_for_each_entry(pn, &blkcg->policy_list, node) {
820 seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
821 MINOR(pn->dev), pn->weight);
822 }
823 spin_unlock_irq(&blkcg->lock);
824 }
825
826 return 0;
210} 827}
211EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_dequeue_stats);
212#endif
213 828
214struct cftype blkio_files[] = { 829struct cftype blkio_files[] = {
215 { 830 {
831 .name = "weight_device",
832 .read_seq_string = blkiocg_weight_device_read,
833 .write_string = blkiocg_weight_device_write,
834 .max_write_len = 256,
835 },
836 {
216 .name = "weight", 837 .name = "weight",
217 .read_u64 = blkiocg_weight_read, 838 .read_u64 = blkiocg_weight_read,
218 .write_u64 = blkiocg_weight_write, 839 .write_u64 = blkiocg_weight_write,
219 }, 840 },
220 { 841 {
221 .name = "time", 842 .name = "time",
222 .read_seq_string = blkiocg_time_read, 843 .read_map = blkiocg_time_read,
223 }, 844 },
224 { 845 {
225 .name = "sectors", 846 .name = "sectors",
226 .read_seq_string = blkiocg_sectors_read, 847 .read_map = blkiocg_sectors_read,
848 },
849 {
850 .name = "io_service_bytes",
851 .read_map = blkiocg_io_service_bytes_read,
852 },
853 {
854 .name = "io_serviced",
855 .read_map = blkiocg_io_serviced_read,
856 },
857 {
858 .name = "io_service_time",
859 .read_map = blkiocg_io_service_time_read,
860 },
861 {
862 .name = "io_wait_time",
863 .read_map = blkiocg_io_wait_time_read,
864 },
865 {
866 .name = "io_merged",
867 .read_map = blkiocg_io_merged_read,
868 },
869 {
870 .name = "io_queued",
871 .read_map = blkiocg_io_queued_read,
872 },
873 {
874 .name = "reset_stats",
875 .write_u64 = blkiocg_reset_stats,
227 }, 876 },
228#ifdef CONFIG_DEBUG_BLK_CGROUP 877#ifdef CONFIG_DEBUG_BLK_CGROUP
229 { 878 {
879 .name = "avg_queue_size",
880 .read_map = blkiocg_avg_queue_size_read,
881 },
882 {
883 .name = "group_wait_time",
884 .read_map = blkiocg_group_wait_time_read,
885 },
886 {
887 .name = "idle_time",
888 .read_map = blkiocg_idle_time_read,
889 },
890 {
891 .name = "empty_time",
892 .read_map = blkiocg_empty_time_read,
893 },
894 {
230 .name = "dequeue", 895 .name = "dequeue",
231 .read_seq_string = blkiocg_dequeue_read, 896 .read_map = blkiocg_dequeue_read,
232 }, 897 },
233#endif 898#endif
234}; 899};
235 900
@@ -246,37 +911,42 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
246 struct blkio_group *blkg; 911 struct blkio_group *blkg;
247 void *key; 912 void *key;
248 struct blkio_policy_type *blkiop; 913 struct blkio_policy_type *blkiop;
914 struct blkio_policy_node *pn, *pntmp;
249 915
250 rcu_read_lock(); 916 rcu_read_lock();
251remove_entry: 917 do {
252 spin_lock_irqsave(&blkcg->lock, flags); 918 spin_lock_irqsave(&blkcg->lock, flags);
919
920 if (hlist_empty(&blkcg->blkg_list)) {
921 spin_unlock_irqrestore(&blkcg->lock, flags);
922 break;
923 }
924
925 blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
926 blkcg_node);
927 key = rcu_dereference(blkg->key);
928 __blkiocg_del_blkio_group(blkg);
253 929
254 if (hlist_empty(&blkcg->blkg_list)) {
255 spin_unlock_irqrestore(&blkcg->lock, flags); 930 spin_unlock_irqrestore(&blkcg->lock, flags);
256 goto done;
257 }
258 931
259 blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group, 932 /*
260 blkcg_node); 933 * This blkio_group is being unlinked as associated cgroup is
261 key = rcu_dereference(blkg->key); 934 * going away. Let all the IO controlling policies know about
262 __blkiocg_del_blkio_group(blkg); 935 * this event. Currently this is static call to one io
936 * controlling policy. Once we have more policies in place, we
937 * need some dynamic registration of callback function.
938 */
939 spin_lock(&blkio_list_lock);
940 list_for_each_entry(blkiop, &blkio_list, list)
941 blkiop->ops.blkio_unlink_group_fn(key, blkg);
942 spin_unlock(&blkio_list_lock);
943 } while (1);
263 944
264 spin_unlock_irqrestore(&blkcg->lock, flags); 945 list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) {
946 blkio_policy_delete_node(pn);
947 kfree(pn);
948 }
265 949
266 /*
267 * This blkio_group is being unlinked as associated cgroup is going
268 * away. Let all the IO controlling policies know about this event.
269 *
270 * Currently this is static call to one io controlling policy. Once
271 * we have more policies in place, we need some dynamic registration
272 * of callback function.
273 */
274 spin_lock(&blkio_list_lock);
275 list_for_each_entry(blkiop, &blkio_list, list)
276 blkiop->ops.blkio_unlink_group_fn(key, blkg);
277 spin_unlock(&blkio_list_lock);
278 goto remove_entry;
279done:
280 free_css_id(&blkio_subsys, &blkcg->css); 950 free_css_id(&blkio_subsys, &blkcg->css);
281 rcu_read_unlock(); 951 rcu_read_unlock();
282 if (blkcg != &blkio_root_cgroup) 952 if (blkcg != &blkio_root_cgroup)
@@ -307,6 +977,7 @@ done:
307 spin_lock_init(&blkcg->lock); 977 spin_lock_init(&blkcg->lock);
308 INIT_HLIST_HEAD(&blkcg->blkg_list); 978 INIT_HLIST_HEAD(&blkcg->blkg_list);
309 979
980 INIT_LIST_HEAD(&blkcg->policy_list);
310 return &blkcg->css; 981 return &blkcg->css;
311} 982}
312 983
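The new weight_device file above is parsed by blkio_policy_parse_and_set(), which expects a "major:minor weight" string, rejects non-zero weights outside BLKIO_WEIGHT_MIN..BLKIO_WEIGHT_MAX, and treats a weight of 0 as deletion of the per-device rule. A small userspace sketch of driving it; the cgroup mount point, group name and device numbers are placeholders:

#include <stdio.h>

/* Sketch: give device 8:16 a per-cgroup weight of 300 in group "grp1". */
int main(void)
{
	const char *path = "/cgroup/blkio/grp1/blkio.weight_device";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* format is "major:minor weight"; writing "8:16 0" drops the rule */
	fprintf(f, "8:16 300\n");
	return fclose(f) ? 1 : 0;
}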
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 8ccc20464dae..2b866ec1dcea 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -23,11 +23,84 @@ extern struct cgroup_subsys blkio_subsys;
23#define blkio_subsys_id blkio_subsys.subsys_id 23#define blkio_subsys_id blkio_subsys.subsys_id
24#endif 24#endif
25 25
26enum stat_type {
27 /* Total time spent (in ns) between request dispatch to the driver and
 28 * request completion for IOs done by this cgroup. This may not be
29 * accurate when NCQ is turned on. */
30 BLKIO_STAT_SERVICE_TIME = 0,
31 /* Total bytes transferred */
32 BLKIO_STAT_SERVICE_BYTES,
33 /* Total IOs serviced, post merge */
34 BLKIO_STAT_SERVICED,
35 /* Total time spent waiting in scheduler queue in ns */
36 BLKIO_STAT_WAIT_TIME,
37 /* Number of IOs merged */
38 BLKIO_STAT_MERGED,
39 /* Number of IOs queued up */
40 BLKIO_STAT_QUEUED,
41 /* All the single valued stats go below this */
42 BLKIO_STAT_TIME,
43 BLKIO_STAT_SECTORS,
44#ifdef CONFIG_DEBUG_BLK_CGROUP
45 BLKIO_STAT_AVG_QUEUE_SIZE,
46 BLKIO_STAT_IDLE_TIME,
47 BLKIO_STAT_EMPTY_TIME,
48 BLKIO_STAT_GROUP_WAIT_TIME,
49 BLKIO_STAT_DEQUEUE
50#endif
51};
52
53enum stat_sub_type {
54 BLKIO_STAT_READ = 0,
55 BLKIO_STAT_WRITE,
56 BLKIO_STAT_SYNC,
57 BLKIO_STAT_ASYNC,
58 BLKIO_STAT_TOTAL
59};
60
61/* blkg state flags */
62enum blkg_state_flags {
63 BLKG_waiting = 0,
64 BLKG_idling,
65 BLKG_empty,
66};
67
26struct blkio_cgroup { 68struct blkio_cgroup {
27 struct cgroup_subsys_state css; 69 struct cgroup_subsys_state css;
28 unsigned int weight; 70 unsigned int weight;
29 spinlock_t lock; 71 spinlock_t lock;
30 struct hlist_head blkg_list; 72 struct hlist_head blkg_list;
73 struct list_head policy_list; /* list of blkio_policy_node */
74};
75
76struct blkio_group_stats {
77 /* total disk time and nr sectors dispatched by this group */
78 uint64_t time;
79 uint64_t sectors;
80 uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL];
81#ifdef CONFIG_DEBUG_BLK_CGROUP
82 /* Sum of number of IOs queued across all samples */
83 uint64_t avg_queue_size_sum;
84 /* Count of samples taken for average */
85 uint64_t avg_queue_size_samples;
86 /* How many times this group has been removed from service tree */
87 unsigned long dequeue;
88
89 /* Total time spent waiting for it to be assigned a timeslice. */
90 uint64_t group_wait_time;
91 uint64_t start_group_wait_time;
92
93 /* Time spent idling for this blkio_group */
94 uint64_t idle_time;
95 uint64_t start_idle_time;
96 /*
97 * Total time when we have requests queued and do not contain the
98 * current active queue.
99 */
100 uint64_t empty_time;
101 uint64_t start_empty_time;
102 uint16_t flags;
103#endif
31}; 104};
32 105
33struct blkio_group { 106struct blkio_group {
@@ -35,20 +108,25 @@ struct blkio_group {
35 void *key; 108 void *key;
36 struct hlist_node blkcg_node; 109 struct hlist_node blkcg_node;
37 unsigned short blkcg_id; 110 unsigned short blkcg_id;
38#ifdef CONFIG_DEBUG_BLK_CGROUP
39 /* Store cgroup path */ 111 /* Store cgroup path */
40 char path[128]; 112 char path[128];
41 /* How many times this group has been removed from service tree */
42 unsigned long dequeue;
43#endif
44 /* The device MKDEV(major, minor), this group has been created for */ 113 /* The device MKDEV(major, minor), this group has been created for */
45 dev_t dev; 114 dev_t dev;
46 115
47 /* total disk time and nr sectors dispatched by this group */ 116 /* Need to serialize the stats in the case of reset/update */
48 unsigned long time; 117 spinlock_t stats_lock;
49 unsigned long sectors; 118 struct blkio_group_stats stats;
50}; 119};
51 120
121struct blkio_policy_node {
122 struct list_head node;
123 dev_t dev;
124 unsigned int weight;
125};
126
127extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
128 dev_t dev);
129
52typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); 130typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg);
53typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg, 131typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg,
54 unsigned int weight); 132 unsigned int weight);
@@ -67,6 +145,11 @@ struct blkio_policy_type {
67extern void blkio_policy_register(struct blkio_policy_type *); 145extern void blkio_policy_register(struct blkio_policy_type *);
68extern void blkio_policy_unregister(struct blkio_policy_type *); 146extern void blkio_policy_unregister(struct blkio_policy_type *);
69 147
148static inline char *blkg_path(struct blkio_group *blkg)
149{
150 return blkg->path;
151}
152
70#else 153#else
71 154
72struct blkio_group { 155struct blkio_group {
@@ -78,6 +161,8 @@ struct blkio_policy_type {
78static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { } 161static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { }
79static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { } 162static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { }
80 163
164static inline char *blkg_path(struct blkio_group *blkg) { return NULL; }
165
81#endif 166#endif
82 167
83#define BLKIO_WEIGHT_MIN 100 168#define BLKIO_WEIGHT_MIN 100
@@ -85,16 +170,42 @@ static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { }
85#define BLKIO_WEIGHT_DEFAULT 500 170#define BLKIO_WEIGHT_DEFAULT 500
86 171
87#ifdef CONFIG_DEBUG_BLK_CGROUP 172#ifdef CONFIG_DEBUG_BLK_CGROUP
88static inline char *blkg_path(struct blkio_group *blkg) 173void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg);
89{ 174void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
90 return blkg->path;
91}
92void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
93 unsigned long dequeue); 175 unsigned long dequeue);
176void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg);
177void blkiocg_update_idle_time_stats(struct blkio_group *blkg);
178void blkiocg_set_start_empty_time(struct blkio_group *blkg);
179
180#define BLKG_FLAG_FNS(name) \
181static inline void blkio_mark_blkg_##name( \
182 struct blkio_group_stats *stats) \
183{ \
184 stats->flags |= (1 << BLKG_##name); \
185} \
186static inline void blkio_clear_blkg_##name( \
187 struct blkio_group_stats *stats) \
188{ \
189 stats->flags &= ~(1 << BLKG_##name); \
190} \
191static inline int blkio_blkg_##name(struct blkio_group_stats *stats) \
192{ \
193 return (stats->flags & (1 << BLKG_##name)) != 0; \
194} \
195
196BLKG_FLAG_FNS(waiting)
197BLKG_FLAG_FNS(idling)
198BLKG_FLAG_FNS(empty)
199#undef BLKG_FLAG_FNS
94#else 200#else
95static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } 201static inline void blkiocg_update_avg_queue_size_stats(
96static inline void blkiocg_update_blkio_group_dequeue_stats( 202 struct blkio_group *blkg) {}
97 struct blkio_group *blkg, unsigned long dequeue) {} 203static inline void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
204 unsigned long dequeue) {}
205static inline void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
206{}
207static inline void blkiocg_update_idle_time_stats(struct blkio_group *blkg) {}
208static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {}
98#endif 209#endif
99 210
100#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) 211#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
@@ -105,26 +216,43 @@ extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
105extern int blkiocg_del_blkio_group(struct blkio_group *blkg); 216extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
106extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, 217extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
107 void *key); 218 void *key);
108void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, 219void blkiocg_update_timeslice_used(struct blkio_group *blkg,
109 unsigned long time, unsigned long sectors); 220 unsigned long time);
221void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes,
222 bool direction, bool sync);
223void blkiocg_update_completion_stats(struct blkio_group *blkg,
224 uint64_t start_time, uint64_t io_start_time, bool direction, bool sync);
225void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
226 bool sync);
227void blkiocg_update_io_add_stats(struct blkio_group *blkg,
228 struct blkio_group *curr_blkg, bool direction, bool sync);
229void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
230 bool direction, bool sync);
110#else 231#else
111struct cgroup; 232struct cgroup;
112static inline struct blkio_cgroup * 233static inline struct blkio_cgroup *
113cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } 234cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
114 235
115static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 236static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
116 struct blkio_group *blkg, void *key, dev_t dev) 237 struct blkio_group *blkg, void *key, dev_t dev) {}
117{
118}
119 238
120static inline int 239static inline int
121blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } 240blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
122 241
123static inline struct blkio_group * 242static inline struct blkio_group *
124blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } 243blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; }
125static inline void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, 244static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg,
126 unsigned long time, unsigned long sectors) 245 unsigned long time) {}
127{ 246static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
128} 247 uint64_t bytes, bool direction, bool sync) {}
248static inline void blkiocg_update_completion_stats(struct blkio_group *blkg,
249 uint64_t start_time, uint64_t io_start_time, bool direction,
250 bool sync) {}
251static inline void blkiocg_update_io_merged_stats(struct blkio_group *blkg,
252 bool direction, bool sync) {}
253static inline void blkiocg_update_io_add_stats(struct blkio_group *blkg,
254 struct blkio_group *curr_blkg, bool direction, bool sync) {}
255static inline void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
256 bool direction, bool sync) {}
129#endif 257#endif
130#endif /* _BLK_CGROUP_H */ 258#endif /* _BLK_CGROUP_H */
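For clarity, BLKG_FLAG_FNS() above stamps out three helpers per flag; expanded by hand, BLKG_FLAG_FNS(waiting) is equivalent to:

static inline void blkio_mark_blkg_waiting(struct blkio_group_stats *stats)
{
	stats->flags |= (1 << BLKG_waiting);
}

static inline void blkio_clear_blkg_waiting(struct blkio_group_stats *stats)
{
	stats->flags &= ~(1 << BLKG_waiting);
}

static inline int blkio_blkg_waiting(struct blkio_group_stats *stats)
{
	return (stats->flags & (1 << BLKG_waiting)) != 0;
}

The same trio is generated for the idling and empty flags, and the !CONFIG_DEBUG_BLK_CGROUP side keeps callers compiling via the empty inline stubs shown above.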
diff --git a/block/blk-core.c b/block/blk-core.c
index 9fe174dc74d1..3bc5579d6f54 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -127,6 +127,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
127 rq->tag = -1; 127 rq->tag = -1;
128 rq->ref_count = 1; 128 rq->ref_count = 1;
129 rq->start_time = jiffies; 129 rq->start_time = jiffies;
130 set_start_time_ns(rq);
130} 131}
131EXPORT_SYMBOL(blk_rq_init); 132EXPORT_SYMBOL(blk_rq_init);
132 133
@@ -450,6 +451,7 @@ void blk_cleanup_queue(struct request_queue *q)
450 */ 451 */
451 blk_sync_queue(q); 452 blk_sync_queue(q);
452 453
454 del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
453 mutex_lock(&q->sysfs_lock); 455 mutex_lock(&q->sysfs_lock);
454 queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q); 456 queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
455 mutex_unlock(&q->sysfs_lock); 457 mutex_unlock(&q->sysfs_lock);
@@ -510,6 +512,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
510 return NULL; 512 return NULL;
511 } 513 }
512 514
515 setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
516 laptop_mode_timer_fn, (unsigned long) q);
513 init_timer(&q->unplug_timer); 517 init_timer(&q->unplug_timer);
514 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); 518 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
515 INIT_LIST_HEAD(&q->timeout_list); 519 INIT_LIST_HEAD(&q->timeout_list);
@@ -568,6 +572,22 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
568{ 572{
569 struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id); 573 struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id);
570 574
575 return blk_init_allocated_queue_node(q, rfn, lock, node_id);
576}
577EXPORT_SYMBOL(blk_init_queue_node);
578
579struct request_queue *
580blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
581 spinlock_t *lock)
582{
583 return blk_init_allocated_queue_node(q, rfn, lock, -1);
584}
585EXPORT_SYMBOL(blk_init_allocated_queue);
586
587struct request_queue *
588blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
589 spinlock_t *lock, int node_id)
590{
571 if (!q) 591 if (!q)
572 return NULL; 592 return NULL;
573 593
@@ -601,7 +621,7 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
601 blk_put_queue(q); 621 blk_put_queue(q);
602 return NULL; 622 return NULL;
603} 623}
604EXPORT_SYMBOL(blk_init_queue_node); 624EXPORT_SYMBOL(blk_init_allocated_queue_node);
605 625
606int blk_get_queue(struct request_queue *q) 626int blk_get_queue(struct request_queue *q)
607{ 627{
@@ -1198,6 +1218,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
1198 if (!blk_rq_cpu_valid(req)) 1218 if (!blk_rq_cpu_valid(req))
1199 req->cpu = bio->bi_comp_cpu; 1219 req->cpu = bio->bi_comp_cpu;
1200 drive_stat_acct(req, 0); 1220 drive_stat_acct(req, 0);
1221 elv_bio_merged(q, req, bio);
1201 if (!attempt_back_merge(q, req)) 1222 if (!attempt_back_merge(q, req))
1202 elv_merged_request(q, req, el_ret); 1223 elv_merged_request(q, req, el_ret);
1203 goto out; 1224 goto out;
@@ -1231,6 +1252,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
1231 if (!blk_rq_cpu_valid(req)) 1252 if (!blk_rq_cpu_valid(req))
1232 req->cpu = bio->bi_comp_cpu; 1253 req->cpu = bio->bi_comp_cpu;
1233 drive_stat_acct(req, 0); 1254 drive_stat_acct(req, 0);
1255 elv_bio_merged(q, req, bio);
1234 if (!attempt_front_merge(q, req)) 1256 if (!attempt_front_merge(q, req))
1235 elv_merged_request(q, req, el_ret); 1257 elv_merged_request(q, req, el_ret);
1236 goto out; 1258 goto out;
@@ -1855,8 +1877,10 @@ void blk_dequeue_request(struct request *rq)
1855 * and to it is freed is accounted as io that is in progress at 1877 * and to it is freed is accounted as io that is in progress at
1856 * the driver side. 1878 * the driver side.
1857 */ 1879 */
1858 if (blk_account_rq(rq)) 1880 if (blk_account_rq(rq)) {
1859 q->in_flight[rq_is_sync(rq)]++; 1881 q->in_flight[rq_is_sync(rq)]++;
1882 set_io_start_time_ns(rq);
1883 }
1860} 1884}
1861 1885
1862/** 1886/**
@@ -2098,7 +2122,7 @@ static void blk_finish_request(struct request *req, int error)
2098 BUG_ON(blk_queued_rq(req)); 2122 BUG_ON(blk_queued_rq(req));
2099 2123
2100 if (unlikely(laptop_mode) && blk_fs_request(req)) 2124 if (unlikely(laptop_mode) && blk_fs_request(req))
2101 laptop_io_completion(); 2125 laptop_io_completion(&req->q->backing_dev_info);
2102 2126
2103 blk_delete_timer(req); 2127 blk_delete_timer(req);
2104 2128
@@ -2517,4 +2541,3 @@ int __init blk_dev_init(void)
2517 2541
2518 return 0; 2542 return 0;
2519} 2543}
2520
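The blk-core.c hunk above splits queue setup so a driver can allocate a request queue first and attach its request function later. A minimal sketch of a driver using the new entry point; example_request_fn and the lock are placeholders, and the error handling follows the patch's convention that the init path drops the queue reference on failure:

#include <linux/blkdev.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(example_lock);

static void example_request_fn(struct request_queue *q)
{
	/* placeholder: pull requests off q and service them */
}

static struct request_queue *example_setup_queue(void)
{
	struct request_queue *q;

	q = blk_alloc_queue(GFP_KERNEL);	/* allocation only */
	if (!q)
		return NULL;

	/* driver-specific tweaks may happen here, before initialization */

	if (!blk_init_allocated_queue(q, example_request_fn, &example_lock))
		return NULL;	/* init path dropped the queue reference */

	return q;
}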
diff --git a/block/blk-lib.c b/block/blk-lib.c
new file mode 100644
index 000000000000..d0216b9f22d4
--- /dev/null
+++ b/block/blk-lib.c
@@ -0,0 +1,233 @@
1/*
 2 * Functions related to generic helper functions
3 */
4#include <linux/kernel.h>
5#include <linux/module.h>
6#include <linux/bio.h>
7#include <linux/blkdev.h>
8#include <linux/scatterlist.h>
9
10#include "blk.h"
11
12static void blkdev_discard_end_io(struct bio *bio, int err)
13{
14 if (err) {
15 if (err == -EOPNOTSUPP)
16 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
17 clear_bit(BIO_UPTODATE, &bio->bi_flags);
18 }
19
20 if (bio->bi_private)
21 complete(bio->bi_private);
22 __free_page(bio_page(bio));
23
24 bio_put(bio);
25}
26
27/**
28 * blkdev_issue_discard - queue a discard
29 * @bdev: blockdev to issue discard for
30 * @sector: start sector
31 * @nr_sects: number of sectors to discard
32 * @gfp_mask: memory allocation flags (for bio_alloc)
33 * @flags: BLKDEV_IFL_* flags to control behaviour
34 *
35 * Description:
36 * Issue a discard request for the sectors in question.
37 */
38int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
39 sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
40{
41 DECLARE_COMPLETION_ONSTACK(wait);
42 struct request_queue *q = bdev_get_queue(bdev);
43 int type = flags & BLKDEV_IFL_BARRIER ?
44 DISCARD_BARRIER : DISCARD_NOBARRIER;
45 struct bio *bio;
46 struct page *page;
47 int ret = 0;
48
49 if (!q)
50 return -ENXIO;
51
52 if (!blk_queue_discard(q))
53 return -EOPNOTSUPP;
54
55 while (nr_sects && !ret) {
56 unsigned int sector_size = q->limits.logical_block_size;
57 unsigned int max_discard_sectors =
58 min(q->limits.max_discard_sectors, UINT_MAX >> 9);
59
60 bio = bio_alloc(gfp_mask, 1);
61 if (!bio)
62 goto out;
63 bio->bi_sector = sector;
64 bio->bi_end_io = blkdev_discard_end_io;
65 bio->bi_bdev = bdev;
66 if (flags & BLKDEV_IFL_WAIT)
67 bio->bi_private = &wait;
68
69 /*
70 * Add a zeroed one-sector payload as that's what
71 * our current implementations need. If we ever need
72 * more, the interface will need revisiting.
73 */
74 page = alloc_page(gfp_mask | __GFP_ZERO);
75 if (!page)
76 goto out_free_bio;
77 if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size)
78 goto out_free_page;
79
80 /*
81 * And override the bio size - the way discard works, we
82 * touch many more blocks on disk than the actual payload
83 * length.
84 */
85 if (nr_sects > max_discard_sectors) {
86 bio->bi_size = max_discard_sectors << 9;
87 nr_sects -= max_discard_sectors;
88 sector += max_discard_sectors;
89 } else {
90 bio->bi_size = nr_sects << 9;
91 nr_sects = 0;
92 }
93
94 bio_get(bio);
95 submit_bio(type, bio);
96
97 if (flags & BLKDEV_IFL_WAIT)
98 wait_for_completion(&wait);
99
100 if (bio_flagged(bio, BIO_EOPNOTSUPP))
101 ret = -EOPNOTSUPP;
102 else if (!bio_flagged(bio, BIO_UPTODATE))
103 ret = -EIO;
104 bio_put(bio);
105 }
106 return ret;
107out_free_page:
108 __free_page(page);
109out_free_bio:
110 bio_put(bio);
111out:
112 return -ENOMEM;
113}
114EXPORT_SYMBOL(blkdev_issue_discard);
115
116struct bio_batch
117{
118 atomic_t done;
119 unsigned long flags;
120 struct completion *wait;
121 bio_end_io_t *end_io;
122};
123
124static void bio_batch_end_io(struct bio *bio, int err)
125{
126 struct bio_batch *bb = bio->bi_private;
127
128 if (err) {
129 if (err == -EOPNOTSUPP)
130 set_bit(BIO_EOPNOTSUPP, &bb->flags);
131 else
132 clear_bit(BIO_UPTODATE, &bb->flags);
133 }
134 if (bb) {
135 if (bb->end_io)
136 bb->end_io(bio, err);
137 atomic_inc(&bb->done);
138 complete(bb->wait);
139 }
140 bio_put(bio);
141}
142
143/**
144 * blkdev_issue_zeroout - generate a number of zero-filled write bios
145 * @bdev: blockdev to issue the zeroed writes for
146 * @sector: start sector
147 * @nr_sects: number of sectors to write
148 * @gfp_mask: memory allocation flags (for bio_alloc)
149 * @flags: BLKDEV_IFL_* flags to control behaviour
150 *
151 * Description:
152 * Generate and issue a number of bios with zero-filled pages.
153 * Send a barrier at the beginning and at the end if requested; this guarantees
154 * correct request ordering. An empty barrier allows us to avoid a post-queue flush.
155 */
156
157int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
158 sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
159{
160 int ret = 0;
161 struct bio *bio;
162 struct bio_batch bb;
163 unsigned int sz, issued = 0;
164 DECLARE_COMPLETION_ONSTACK(wait);
165
166 atomic_set(&bb.done, 0);
167 bb.flags = 1 << BIO_UPTODATE;
168 bb.wait = &wait;
169 bb.end_io = NULL;
170
171 if (flags & BLKDEV_IFL_BARRIER) {
172 /* issue async barrier before the data */
173 ret = blkdev_issue_flush(bdev, gfp_mask, NULL, 0);
174 if (ret)
175 return ret;
176 }
177submit:
178 while (nr_sects != 0) {
179 bio = bio_alloc(gfp_mask,
180 min(nr_sects, (sector_t)BIO_MAX_PAGES));
181 if (!bio)
182 break;
183
184 bio->bi_sector = sector;
185 bio->bi_bdev = bdev;
186 bio->bi_end_io = bio_batch_end_io;
187 if (flags & BLKDEV_IFL_WAIT)
188 bio->bi_private = &bb;
189
190 while (nr_sects != 0) {
191 sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects);
192 if (sz == 0)
193 /* bio has maximum size possible */
194 break;
195 ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0);
196 nr_sects -= ret >> 9;
197 sector += ret >> 9;
198 if (ret < (sz << 9))
199 break;
200 }
201 issued++;
202 submit_bio(WRITE, bio);
203 }
204 /*
205 * When all data bios are in flight, send the final barrier if requested.
206 */
207 if (nr_sects == 0 && flags & BLKDEV_IFL_BARRIER)
208 ret = blkdev_issue_flush(bdev, gfp_mask, NULL,
209 flags & BLKDEV_IFL_WAIT);
210
211
212 if (flags & BLKDEV_IFL_WAIT)
213 /* Wait for bios in-flight */
214 while ( issued != atomic_read(&bb.done))
215 wait_for_completion(&wait);
216
217 if (!test_bit(BIO_UPTODATE, &bb.flags))
218 /* One of the bios in the batch completed with an error. */
219 ret = -EIO;
220
221 if (ret)
222 goto out;
223
224 if (test_bit(BIO_EOPNOTSUPP, &bb.flags)) {
225 ret = -EOPNOTSUPP;
226 goto out;
227 }
228 if (nr_sects != 0)
229 goto submit;
230out:
231 return ret;
232}
233EXPORT_SYMBOL(blkdev_issue_zeroout);
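For orientation, a minimal usage sketch of the two helpers added in this new file. The wrapper function below is hypothetical; blkdev_issue_discard(), blkdev_issue_zeroout() and the BLKDEV_IFL_* flags are the interfaces defined above. The fallback only illustrates the calling convention: discard by itself does not guarantee that the range later reads back as zeroes.

#include <linux/blkdev.h>
#include <linux/gfp.h>

/* Hypothetical caller: try to discard a range, falling back to explicit
 * zero-filled writes if the device has no discard support.  Both calls
 * wait for completion and bracket the I/O with barriers. */
static int example_zap_range(struct block_device *bdev, sector_t sector,
			     sector_t nr_sects)
{
	unsigned long flags = BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER;
	int ret;

	ret = blkdev_issue_discard(bdev, sector, nr_sects, GFP_KERNEL, flags);
	if (ret == -EOPNOTSUPP)
		ret = blkdev_issue_zeroout(bdev, sector, nr_sects,
					   GFP_KERNEL, flags);
	return ret;
}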
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 5f127cfb2e92..ed897b5ef315 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -55,6 +55,7 @@ static const int cfq_hist_divisor = 4;
55#define RQ_CIC(rq) \ 55#define RQ_CIC(rq) \
56 ((struct cfq_io_context *) (rq)->elevator_private) 56 ((struct cfq_io_context *) (rq)->elevator_private)
57#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) 57#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2)
58#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private3)
58 59
59static struct kmem_cache *cfq_pool; 60static struct kmem_cache *cfq_pool;
60static struct kmem_cache *cfq_ioc_pool; 61static struct kmem_cache *cfq_ioc_pool;
@@ -143,8 +144,6 @@ struct cfq_queue {
143 struct cfq_queue *new_cfqq; 144 struct cfq_queue *new_cfqq;
144 struct cfq_group *cfqg; 145 struct cfq_group *cfqg;
145 struct cfq_group *orig_cfqg; 146 struct cfq_group *orig_cfqg;
146 /* Sectors dispatched in current dispatch round */
147 unsigned long nr_sectors;
148}; 147};
149 148
150/* 149/*
@@ -346,7 +345,7 @@ CFQ_CFQQ_FNS(deep);
346CFQ_CFQQ_FNS(wait_busy); 345CFQ_CFQQ_FNS(wait_busy);
347#undef CFQ_CFQQ_FNS 346#undef CFQ_CFQQ_FNS
348 347
349#ifdef CONFIG_DEBUG_CFQ_IOSCHED 348#ifdef CONFIG_CFQ_GROUP_IOSCHED
350#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 349#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
351 blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ 350 blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
352 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ 351 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
@@ -858,7 +857,7 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
858 if (!RB_EMPTY_NODE(&cfqg->rb_node)) 857 if (!RB_EMPTY_NODE(&cfqg->rb_node))
859 cfq_rb_erase(&cfqg->rb_node, st); 858 cfq_rb_erase(&cfqg->rb_node, st);
860 cfqg->saved_workload_slice = 0; 859 cfqg->saved_workload_slice = 0;
861 blkiocg_update_blkio_group_dequeue_stats(&cfqg->blkg, 1); 860 blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
862} 861}
863 862
864static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) 863static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
@@ -884,8 +883,7 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
884 slice_used = cfqq->allocated_slice; 883 slice_used = cfqq->allocated_slice;
885 } 884 }
886 885
887 cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u sect=%lu", slice_used, 886 cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u", slice_used);
888 cfqq->nr_sectors);
889 return slice_used; 887 return slice_used;
890} 888}
891 889
@@ -919,8 +917,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
919 917
920 cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, 918 cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
921 st->min_vdisktime); 919 st->min_vdisktime);
922 blkiocg_update_blkio_group_stats(&cfqg->blkg, used_sl, 920 blkiocg_update_timeslice_used(&cfqg->blkg, used_sl);
923 cfqq->nr_sectors); 921 blkiocg_set_start_empty_time(&cfqg->blkg);
924} 922}
925 923
926#ifdef CONFIG_CFQ_GROUP_IOSCHED 924#ifdef CONFIG_CFQ_GROUP_IOSCHED
@@ -961,7 +959,6 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
961 if (!cfqg) 959 if (!cfqg)
962 goto done; 960 goto done;
963 961
964 cfqg->weight = blkcg->weight;
965 for_each_cfqg_st(cfqg, i, j, st) 962 for_each_cfqg_st(cfqg, i, j, st)
966 *st = CFQ_RB_ROOT; 963 *st = CFQ_RB_ROOT;
967 RB_CLEAR_NODE(&cfqg->rb_node); 964 RB_CLEAR_NODE(&cfqg->rb_node);
@@ -978,6 +975,7 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
978 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); 975 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
979 blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, 976 blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
980 MKDEV(major, minor)); 977 MKDEV(major, minor));
978 cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
981 979
982 /* Add group on cfqd list */ 980 /* Add group on cfqd list */
983 hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); 981 hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
@@ -1004,6 +1002,12 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
1004 return cfqg; 1002 return cfqg;
1005} 1003}
1006 1004
1005static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
1006{
1007 atomic_inc(&cfqg->ref);
1008 return cfqg;
1009}
1010
1007static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) 1011static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
1008{ 1012{
1009 /* Currently, all async queues are mapped to root group */ 1013 /* Currently, all async queues are mapped to root group */
@@ -1087,6 +1091,12 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
1087{ 1091{
1088 return &cfqd->root_group; 1092 return &cfqd->root_group;
1089} 1093}
1094
1095static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
1096{
1097 return cfqg;
1098}
1099
1090static inline void 1100static inline void
1091cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { 1101cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
1092 cfqq->cfqg = cfqg; 1102 cfqq->cfqg = cfqg;
@@ -1389,7 +1399,12 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
1389{ 1399{
1390 elv_rb_del(&cfqq->sort_list, rq); 1400 elv_rb_del(&cfqq->sort_list, rq);
1391 cfqq->queued[rq_is_sync(rq)]--; 1401 cfqq->queued[rq_is_sync(rq)]--;
1402 blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(rq),
1403 rq_is_sync(rq));
1392 cfq_add_rq_rb(rq); 1404 cfq_add_rq_rb(rq);
1405 blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
1406 &cfqq->cfqd->serving_group->blkg, rq_data_dir(rq),
1407 rq_is_sync(rq));
1393} 1408}
1394 1409
1395static struct request * 1410static struct request *
@@ -1445,6 +1460,8 @@ static void cfq_remove_request(struct request *rq)
1445 cfq_del_rq_rb(rq); 1460 cfq_del_rq_rb(rq);
1446 1461
1447 cfqq->cfqd->rq_queued--; 1462 cfqq->cfqd->rq_queued--;
1463 blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(rq),
1464 rq_is_sync(rq));
1448 if (rq_is_meta(rq)) { 1465 if (rq_is_meta(rq)) {
1449 WARN_ON(!cfqq->meta_pending); 1466 WARN_ON(!cfqq->meta_pending);
1450 cfqq->meta_pending--; 1467 cfqq->meta_pending--;
@@ -1476,6 +1493,13 @@ static void cfq_merged_request(struct request_queue *q, struct request *req,
1476 } 1493 }
1477} 1494}
1478 1495
1496static void cfq_bio_merged(struct request_queue *q, struct request *req,
1497 struct bio *bio)
1498{
1499 blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg, bio_data_dir(bio),
1500 cfq_bio_sync(bio));
1501}
1502
1479static void 1503static void
1480cfq_merged_requests(struct request_queue *q, struct request *rq, 1504cfq_merged_requests(struct request_queue *q, struct request *rq,
1481 struct request *next) 1505 struct request *next)
@@ -1493,6 +1517,8 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
1493 if (cfqq->next_rq == next) 1517 if (cfqq->next_rq == next)
1494 cfqq->next_rq = rq; 1518 cfqq->next_rq = rq;
1495 cfq_remove_request(next); 1519 cfq_remove_request(next);
1520 blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(next),
1521 rq_is_sync(next));
1496} 1522}
1497 1523
1498static int cfq_allow_merge(struct request_queue *q, struct request *rq, 1524static int cfq_allow_merge(struct request_queue *q, struct request *rq,
@@ -1520,18 +1546,24 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq,
1520 return cfqq == RQ_CFQQ(rq); 1546 return cfqq == RQ_CFQQ(rq);
1521} 1547}
1522 1548
1549static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1550{
1551 del_timer(&cfqd->idle_slice_timer);
1552 blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg);
1553}
1554
1523static void __cfq_set_active_queue(struct cfq_data *cfqd, 1555static void __cfq_set_active_queue(struct cfq_data *cfqd,
1524 struct cfq_queue *cfqq) 1556 struct cfq_queue *cfqq)
1525{ 1557{
1526 if (cfqq) { 1558 if (cfqq) {
1527 cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d", 1559 cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
1528 cfqd->serving_prio, cfqd->serving_type); 1560 cfqd->serving_prio, cfqd->serving_type);
1561 blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg);
1529 cfqq->slice_start = 0; 1562 cfqq->slice_start = 0;
1530 cfqq->dispatch_start = jiffies; 1563 cfqq->dispatch_start = jiffies;
1531 cfqq->allocated_slice = 0; 1564 cfqq->allocated_slice = 0;
1532 cfqq->slice_end = 0; 1565 cfqq->slice_end = 0;
1533 cfqq->slice_dispatch = 0; 1566 cfqq->slice_dispatch = 0;
1534 cfqq->nr_sectors = 0;
1535 1567
1536 cfq_clear_cfqq_wait_request(cfqq); 1568 cfq_clear_cfqq_wait_request(cfqq);
1537 cfq_clear_cfqq_must_dispatch(cfqq); 1569 cfq_clear_cfqq_must_dispatch(cfqq);
@@ -1539,7 +1571,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
1539 cfq_clear_cfqq_fifo_expire(cfqq); 1571 cfq_clear_cfqq_fifo_expire(cfqq);
1540 cfq_mark_cfqq_slice_new(cfqq); 1572 cfq_mark_cfqq_slice_new(cfqq);
1541 1573
1542 del_timer(&cfqd->idle_slice_timer); 1574 cfq_del_timer(cfqd, cfqq);
1543 } 1575 }
1544 1576
1545 cfqd->active_queue = cfqq; 1577 cfqd->active_queue = cfqq;
@@ -1555,7 +1587,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1555 cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out); 1587 cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
1556 1588
1557 if (cfq_cfqq_wait_request(cfqq)) 1589 if (cfq_cfqq_wait_request(cfqq))
1558 del_timer(&cfqd->idle_slice_timer); 1590 cfq_del_timer(cfqd, cfqq);
1559 1591
1560 cfq_clear_cfqq_wait_request(cfqq); 1592 cfq_clear_cfqq_wait_request(cfqq);
1561 cfq_clear_cfqq_wait_busy(cfqq); 1593 cfq_clear_cfqq_wait_busy(cfqq);
@@ -1857,6 +1889,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
1857 sl = cfqd->cfq_slice_idle; 1889 sl = cfqd->cfq_slice_idle;
1858 1890
1859 mod_timer(&cfqd->idle_slice_timer, jiffies + sl); 1891 mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
1892 blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg);
1860 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); 1893 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl);
1861} 1894}
1862 1895
@@ -1876,7 +1909,8 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
1876 elv_dispatch_sort(q, rq); 1909 elv_dispatch_sort(q, rq);
1877 1910
1878 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; 1911 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
1879 cfqq->nr_sectors += blk_rq_sectors(rq); 1912 blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq),
1913 rq_data_dir(rq), rq_is_sync(rq));
1880} 1914}
1881 1915
1882/* 1916/*
@@ -3185,11 +3219,14 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3185 if (cfq_cfqq_wait_request(cfqq)) { 3219 if (cfq_cfqq_wait_request(cfqq)) {
3186 if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || 3220 if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
3187 cfqd->busy_queues > 1) { 3221 cfqd->busy_queues > 1) {
3188 del_timer(&cfqd->idle_slice_timer); 3222 cfq_del_timer(cfqd, cfqq);
3189 cfq_clear_cfqq_wait_request(cfqq); 3223 cfq_clear_cfqq_wait_request(cfqq);
3190 __blk_run_queue(cfqd->queue); 3224 __blk_run_queue(cfqd->queue);
3191 } else 3225 } else {
3226 blkiocg_update_idle_time_stats(
3227 &cfqq->cfqg->blkg);
3192 cfq_mark_cfqq_must_dispatch(cfqq); 3228 cfq_mark_cfqq_must_dispatch(cfqq);
3229 }
3193 } 3230 }
3194 } else if (cfq_should_preempt(cfqd, cfqq, rq)) { 3231 } else if (cfq_should_preempt(cfqd, cfqq, rq)) {
3195 /* 3232 /*
@@ -3214,7 +3251,9 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
3214 rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); 3251 rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
3215 list_add_tail(&rq->queuelist, &cfqq->fifo); 3252 list_add_tail(&rq->queuelist, &cfqq->fifo);
3216 cfq_add_rq_rb(rq); 3253 cfq_add_rq_rb(rq);
3217 3254 blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
3255 &cfqd->serving_group->blkg, rq_data_dir(rq),
3256 rq_is_sync(rq));
3218 cfq_rq_enqueued(cfqd, cfqq, rq); 3257 cfq_rq_enqueued(cfqd, cfqq, rq);
3219} 3258}
3220 3259
@@ -3300,6 +3339,9 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
3300 WARN_ON(!cfqq->dispatched); 3339 WARN_ON(!cfqq->dispatched);
3301 cfqd->rq_in_driver--; 3340 cfqd->rq_in_driver--;
3302 cfqq->dispatched--; 3341 cfqq->dispatched--;
3342 blkiocg_update_completion_stats(&cfqq->cfqg->blkg, rq_start_time_ns(rq),
3343 rq_io_start_time_ns(rq), rq_data_dir(rq),
3344 rq_is_sync(rq));
3303 3345
3304 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; 3346 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
3305 3347
@@ -3440,6 +3482,10 @@ static void cfq_put_request(struct request *rq)
3440 rq->elevator_private = NULL; 3482 rq->elevator_private = NULL;
3441 rq->elevator_private2 = NULL; 3483 rq->elevator_private2 = NULL;
3442 3484
3485 /* Put down rq reference on cfqg */
3486 cfq_put_cfqg(RQ_CFQG(rq));
3487 rq->elevator_private3 = NULL;
3488
3443 cfq_put_queue(cfqq); 3489 cfq_put_queue(cfqq);
3444 } 3490 }
3445} 3491}
@@ -3528,6 +3574,7 @@ new_queue:
3528 3574
3529 rq->elevator_private = cic; 3575 rq->elevator_private = cic;
3530 rq->elevator_private2 = cfqq; 3576 rq->elevator_private2 = cfqq;
3577 rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg);
3531 return 0; 3578 return 0;
3532 3579
3533queue_fail: 3580queue_fail:
@@ -3743,7 +3790,6 @@ static void *cfq_init_queue(struct request_queue *q)
3743 * second, in order to have larger depth for async operations. 3790 * second, in order to have larger depth for async operations.
3744 */ 3791 */
3745 cfqd->last_delayed_sync = jiffies - HZ; 3792 cfqd->last_delayed_sync = jiffies - HZ;
3746 INIT_RCU_HEAD(&cfqd->rcu);
3747 return cfqd; 3793 return cfqd;
3748} 3794}
3749 3795
@@ -3872,6 +3918,7 @@ static struct elevator_type iosched_cfq = {
3872 .elevator_merged_fn = cfq_merged_request, 3918 .elevator_merged_fn = cfq_merged_request,
3873 .elevator_merge_req_fn = cfq_merged_requests, 3919 .elevator_merge_req_fn = cfq_merged_requests,
3874 .elevator_allow_merge_fn = cfq_allow_merge, 3920 .elevator_allow_merge_fn = cfq_allow_merge,
3921 .elevator_bio_merged_fn = cfq_bio_merged,
3875 .elevator_dispatch_fn = cfq_dispatch_requests, 3922 .elevator_dispatch_fn = cfq_dispatch_requests,
3876 .elevator_add_req_fn = cfq_insert_request, 3923 .elevator_add_req_fn = cfq_insert_request,
3877 .elevator_activate_req_fn = cfq_activate_request, 3924 .elevator_activate_req_fn = cfq_activate_request,
diff --git a/block/elevator.c b/block/elevator.c
index 76e3702d5381..6df2b5056b51 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -539,6 +539,15 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
539 q->last_merge = rq; 539 q->last_merge = rq;
540} 540}
541 541
542void elv_bio_merged(struct request_queue *q, struct request *rq,
543 struct bio *bio)
544{
545 struct elevator_queue *e = q->elevator;
546
547 if (e->ops->elevator_bio_merged_fn)
548 e->ops->elevator_bio_merged_fn(q, rq, bio);
549}
550
542void elv_requeue_request(struct request_queue *q, struct request *rq) 551void elv_requeue_request(struct request_queue *q, struct request *rq)
543{ 552{
544 /* 553 /*
@@ -921,6 +930,7 @@ int elv_register_queue(struct request_queue *q)
921 } 930 }
922 return error; 931 return error;
923} 932}
933EXPORT_SYMBOL(elv_register_queue);
924 934
925static void __elv_unregister_queue(struct elevator_queue *e) 935static void __elv_unregister_queue(struct elevator_queue *e)
926{ 936{
@@ -933,6 +943,7 @@ void elv_unregister_queue(struct request_queue *q)
933 if (q) 943 if (q)
934 __elv_unregister_queue(q->elevator); 944 __elv_unregister_queue(q->elevator);
935} 945}
946EXPORT_SYMBOL(elv_unregister_queue);
936 947
937void elv_register(struct elevator_type *e) 948void elv_register(struct elevator_type *e)
938{ 949{
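The new elv_bio_merged() hook is optional: elevators that do not set elevator_bio_merged_fn are simply skipped. Below is a hedged sketch of a scheduler fragment wiring up the hook. The names are hypothetical; only elv_bio_merged(), the elevator_bio_merged_fn field and its signature come from this patch, and a real scheduler would still need the dispatch, add-request and alloc/free hooks before it could actually be selected.

#include <linux/module.h>
#include <linux/elevator.h>
#include <linux/blkdev.h>
#include <linux/bio.h>

/* Called via elv_bio_merged() whenever a bio is merged into an existing
 * request owned by this scheduler. */
static void example_bio_merged(struct request_queue *q, struct request *rq,
			       struct bio *bio)
{
	pr_debug("bio merged into rq %p (%s)\n", rq,
		 bio_data_dir(bio) == WRITE ? "write" : "read");
}

static struct elevator_type example_iosched = {
	.ops = {
		.elevator_bio_merged_fn	= example_bio_merged,
		/* a real scheduler must also provide the mandatory
		 * dispatch/add_req/alloc/free hooks, omitted here */
	},
	.elevator_name	= "example",
	.elevator_owner	= THIS_MODULE,
};

static int __init example_init(void)
{
	elv_register(&example_iosched);
	return 0;
}

static void __exit example_exit(void)
{
	elv_unregister(&example_iosched);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");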
diff --git a/block/genhd.c b/block/genhd.c
index d13ba76a169c..59a2db6fecef 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -596,6 +596,7 @@ struct gendisk *get_gendisk(dev_t devt, int *partno)
596 596
597 return disk; 597 return disk;
598} 598}
599EXPORT_SYMBOL(get_gendisk);
599 600
600/** 601/**
601 * bdget_disk - do bdget() by gendisk and partition number 602 * bdget_disk - do bdget() by gendisk and partition number
@@ -987,7 +988,6 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno)
987 if (!new_ptbl) 988 if (!new_ptbl)
988 return -ENOMEM; 989 return -ENOMEM;
989 990
990 INIT_RCU_HEAD(&new_ptbl->rcu_head);
991 new_ptbl->len = target; 991 new_ptbl->len = target;
992 992
993 for (i = 0; i < len; i++) 993 for (i = 0; i < len; i++)
diff --git a/block/ioctl.c b/block/ioctl.c
index 8905d2a2a717..e8eb679f2f9b 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -126,7 +126,7 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
126 if (start + len > (bdev->bd_inode->i_size >> 9)) 126 if (start + len > (bdev->bd_inode->i_size >> 9))
127 return -EINVAL; 127 return -EINVAL;
128 return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, 128 return blkdev_issue_discard(bdev, start, len, GFP_KERNEL,
129 DISCARD_FL_WAIT); 129 BLKDEV_IFL_WAIT);
130} 130}
131 131
132static int put_ushort(unsigned long arg, unsigned short val) 132static int put_ushort(unsigned long arg, unsigned short val)
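The BLKDISCARD ioctl is the easiest way to exercise this path from userspace; it takes a {start, length} range in bytes and, per the hunk above, ends up in blkdev_issue_discard() with BLKDEV_IFL_WAIT. A hypothetical test program (not part of this patch):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
	uint64_t range[2] = { 0, 1024 * 1024 };	/* discard the first 1 MiB */
	int fd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <block device>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_WRONLY);
	if (fd < 0 || ioctl(fd, BLKDISCARD, range) < 0) {
		perror("BLKDISCARD");
		return 1;
	}
	return 0;
}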