aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVivek Goyal <vgoyal@redhat.com>2010-09-15 17:06:35 -0400
committerJens Axboe <jaxboe@fusionio.com>2010-09-16 02:42:52 -0400
commite43473b7f223ec866f7db273697e76c337c390f9 (patch)
treee90b52dbe4ec4ae37263a00e2bd9eaf5367cf72f
parent4c9eefa16c6f124ffcc736cb719b24ea27f85017 (diff)
blkio: Core implementation of throttle policy
o Actual implementation of throttling policy in block layer. Currently it implements READ and WRITE bytes per second throttling logic. IOPS throttling comes in later patches. Signed-off-by: Vivek Goyal <vgoyal@redhat.com> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
-rw-r--r--block/Kconfig12
-rw-r--r--block/Makefile1
-rw-r--r--block/blk-core.c24
-rw-r--r--block/blk-throttle.c909
-rw-r--r--include/linux/blk_types.h3
-rw-r--r--include/linux/blkdev.h24
-rw-r--r--init/Kconfig9
7 files changed, 979 insertions, 3 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 9be0b56eaee1..6c9213ef15a1 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -77,6 +77,18 @@ config BLK_DEV_INTEGRITY
77 T10/SCSI Data Integrity Field or the T13/ATA External Path 77 T10/SCSI Data Integrity Field or the T13/ATA External Path
78 Protection. If in doubt, say N. 78 Protection. If in doubt, say N.
79 79
80config BLK_DEV_THROTTLING
81 bool "Block layer bio throttling support"
82 depends on BLK_CGROUP=y && EXPERIMENTAL
83 default n
84 ---help---
85 Block layer bio throttling support. It can be used to limit
86 the IO rate to a device. IO rate policies are per cgroup and
87 one needs to mount and use blkio cgroup controller for creating
88 cgroups and specifying per device IO rate policies.
89
90 See Documentation/cgroups/blkio-controller.txt for more information.
91
80endif # BLOCK 92endif # BLOCK
81 93
82config BLOCK_COMPAT 94config BLOCK_COMPAT
diff --git a/block/Makefile b/block/Makefile
index 0bb499a739cd..c850d5ef80a2 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -9,6 +9,7 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
9 9
10obj-$(CONFIG_BLK_DEV_BSG) += bsg.o 10obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
11obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o 11obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
12obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
12obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o 13obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
13obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o 14obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
14obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o 15obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
diff --git a/block/blk-core.c b/block/blk-core.c
index 8d07c1b7e701..797d5095eb83 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -382,6 +382,7 @@ void blk_sync_queue(struct request_queue *q)
382 del_timer_sync(&q->unplug_timer); 382 del_timer_sync(&q->unplug_timer);
383 del_timer_sync(&q->timeout); 383 del_timer_sync(&q->timeout);
384 cancel_work_sync(&q->unplug_work); 384 cancel_work_sync(&q->unplug_work);
385 throtl_shutdown_timer_wq(q);
385} 386}
386EXPORT_SYMBOL(blk_sync_queue); 387EXPORT_SYMBOL(blk_sync_queue);
387 388
@@ -459,6 +460,8 @@ void blk_cleanup_queue(struct request_queue *q)
459 if (q->elevator) 460 if (q->elevator)
460 elevator_exit(q->elevator); 461 elevator_exit(q->elevator);
461 462
463 blk_throtl_exit(q);
464
462 blk_put_queue(q); 465 blk_put_queue(q);
463} 466}
464EXPORT_SYMBOL(blk_cleanup_queue); 467EXPORT_SYMBOL(blk_cleanup_queue);
@@ -515,6 +518,11 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
515 return NULL; 518 return NULL;
516 } 519 }
517 520
521 if (blk_throtl_init(q)) {
522 kmem_cache_free(blk_requestq_cachep, q);
523 return NULL;
524 }
525
518 setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, 526 setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
519 laptop_mode_timer_fn, (unsigned long) q); 527 laptop_mode_timer_fn, (unsigned long) q);
520 init_timer(&q->unplug_timer); 528 init_timer(&q->unplug_timer);
@@ -1522,6 +1530,15 @@ static inline void __generic_make_request(struct bio *bio)
1522 goto end_io; 1530 goto end_io;
1523 } 1531 }
1524 1532
1533 blk_throtl_bio(q, &bio);
1534
1535 /*
1536 * If bio = NULL, bio has been throttled and will be submitted
1537 * later.
1538 */
1539 if (!bio)
1540 break;
1541
1525 trace_block_bio_queue(q, bio); 1542 trace_block_bio_queue(q, bio);
1526 1543
1527 ret = q->make_request_fn(q, bio); 1544 ret = q->make_request_fn(q, bio);
@@ -2580,6 +2597,13 @@ int kblockd_schedule_work(struct request_queue *q, struct work_struct *work)
2580} 2597}
2581EXPORT_SYMBOL(kblockd_schedule_work); 2598EXPORT_SYMBOL(kblockd_schedule_work);
2582 2599
2600int kblockd_schedule_delayed_work(struct request_queue *q,
2601 struct delayed_work *dwork, unsigned long delay)
2602{
2603 return queue_delayed_work(kblockd_workqueue, dwork, delay);
2604}
2605EXPORT_SYMBOL(kblockd_schedule_delayed_work);
2606
2583int __init blk_dev_init(void) 2607int __init blk_dev_init(void)
2584{ 2608{
2585 BUILD_BUG_ON(__REQ_NR_BITS > 8 * 2609 BUILD_BUG_ON(__REQ_NR_BITS > 8 *
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
new file mode 100644
index 000000000000..4b492011e0de
--- /dev/null
+++ b/block/blk-throttle.c
@@ -0,0 +1,909 @@
1/*
2 * Interface for controlling IO bandwidth on a request queue
3 *
4 * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
5 */
6
7#include <linux/module.h>
8#include <linux/slab.h>
9#include <linux/blkdev.h>
10#include <linux/bio.h>
11#include <linux/blktrace_api.h>
12#include "blk-cgroup.h"
13
14/* Max dispatch from a group in 1 round */
15static int throtl_grp_quantum = 8;
16
17/* Total max dispatch from all groups in one round */
18static int throtl_quantum = 32;
19
20/* Throttling is performed over 100ms slice and after that slice is renewed */
21static unsigned long throtl_slice = HZ/10; /* 100 ms */
22
23struct throtl_rb_root {
24 struct rb_root rb;
25 struct rb_node *left;
26 unsigned int count;
27 unsigned long min_disptime;
28};
29
30#define THROTL_RB_ROOT (struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \
31 .count = 0, .min_disptime = 0}
32
33#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
34
35struct throtl_grp {
36 /* List of throtl groups on the request queue*/
37 struct hlist_node tg_node;
38
39 /* active throtl group service_tree member */
40 struct rb_node rb_node;
41
42 /*
43 * Dispatch time in jiffies. This is the estimated time when group
44 * will unthrottle and is ready to dispatch more bio. It is used as
45 * key to sort active groups in service tree.
46 */
47 unsigned long disptime;
48
49 struct blkio_group blkg;
50 atomic_t ref;
51 unsigned int flags;
52
53 /* Two lists for READ and WRITE */
54 struct bio_list bio_lists[2];
55
56 /* Number of queued bios on READ and WRITE lists */
57 unsigned int nr_queued[2];
58
59 /* bytes per second rate limits */
60 uint64_t bps[2];
61
62 /* Number of bytes disptached in current slice */
63 uint64_t bytes_disp[2];
64
65 /* When did we start a new slice */
66 unsigned long slice_start[2];
67 unsigned long slice_end[2];
68};
69
70struct throtl_data
71{
72 /* List of throtl groups */
73 struct hlist_head tg_list;
74
75 /* service tree for active throtl groups */
76 struct throtl_rb_root tg_service_tree;
77
78 struct throtl_grp root_tg;
79 struct request_queue *queue;
80
81 /* Total Number of queued bios on READ and WRITE lists */
82 unsigned int nr_queued[2];
83
84 /*
85 * number of total undestroyed groups (excluding root group)
86 */
87 unsigned int nr_undestroyed_grps;
88
89 /* Work for dispatching throttled bios */
90 struct delayed_work throtl_work;
91};
92
93enum tg_state_flags {
94 THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */
95};
96
97#define THROTL_TG_FNS(name) \
98static inline void throtl_mark_tg_##name(struct throtl_grp *tg) \
99{ \
100 (tg)->flags |= (1 << THROTL_TG_FLAG_##name); \
101} \
102static inline void throtl_clear_tg_##name(struct throtl_grp *tg) \
103{ \
104 (tg)->flags &= ~(1 << THROTL_TG_FLAG_##name); \
105} \
106static inline int throtl_tg_##name(const struct throtl_grp *tg) \
107{ \
108 return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0; \
109}
110
111THROTL_TG_FNS(on_rr);
112
113#define throtl_log_tg(td, tg, fmt, args...) \
114 blk_add_trace_msg((td)->queue, "throtl %s " fmt, \
115 blkg_path(&(tg)->blkg), ##args); \
116
117#define throtl_log(td, fmt, args...) \
118 blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
119
120static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg)
121{
122 if (blkg)
123 return container_of(blkg, struct throtl_grp, blkg);
124
125 return NULL;
126}
127
128static inline int total_nr_queued(struct throtl_data *td)
129{
130 return (td->nr_queued[0] + td->nr_queued[1]);
131}
132
133static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg)
134{
135 atomic_inc(&tg->ref);
136 return tg;
137}
138
139static void throtl_put_tg(struct throtl_grp *tg)
140{
141 BUG_ON(atomic_read(&tg->ref) <= 0);
142 if (!atomic_dec_and_test(&tg->ref))
143 return;
144 kfree(tg);
145}
146
147static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td,
148 struct cgroup *cgroup)
149{
150 struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
151 struct throtl_grp *tg = NULL;
152 void *key = td;
153 struct backing_dev_info *bdi = &td->queue->backing_dev_info;
154 unsigned int major, minor;
155
156 /*
157 * TODO: Speed up blkiocg_lookup_group() by maintaining a radix
158 * tree of blkg (instead of traversing through hash list all
159 * the time.
160 */
161 tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
162
163 /* Fill in device details for root group */
164 if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
165 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
166 tg->blkg.dev = MKDEV(major, minor);
167 goto done;
168 }
169
170 if (tg)
171 goto done;
172
173 tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
174 if (!tg)
175 goto done;
176
177 INIT_HLIST_NODE(&tg->tg_node);
178 RB_CLEAR_NODE(&tg->rb_node);
179 bio_list_init(&tg->bio_lists[0]);
180 bio_list_init(&tg->bio_lists[1]);
181
182 /*
183 * Take the initial reference that will be released on destroy
184 * This can be thought of a joint reference by cgroup and
185 * request queue which will be dropped by either request queue
186 * exit or cgroup deletion path depending on who is exiting first.
187 */
188 atomic_set(&tg->ref, 1);
189
190 /* Add group onto cgroup list */
191 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
192 blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td,
193 MKDEV(major, minor), BLKIO_POLICY_THROTL);
194
195 tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
196 tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev);
197
198 hlist_add_head(&tg->tg_node, &td->tg_list);
199 td->nr_undestroyed_grps++;
200done:
201 return tg;
202}
203
204static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
205{
206 struct cgroup *cgroup;
207 struct throtl_grp *tg = NULL;
208
209 rcu_read_lock();
210 cgroup = task_cgroup(current, blkio_subsys_id);
211 tg = throtl_find_alloc_tg(td, cgroup);
212 if (!tg)
213 tg = &td->root_tg;
214 rcu_read_unlock();
215 return tg;
216}
217
218static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root)
219{
220 /* Service tree is empty */
221 if (!root->count)
222 return NULL;
223
224 if (!root->left)
225 root->left = rb_first(&root->rb);
226
227 if (root->left)
228 return rb_entry_tg(root->left);
229
230 return NULL;
231}
232
233static void rb_erase_init(struct rb_node *n, struct rb_root *root)
234{
235 rb_erase(n, root);
236 RB_CLEAR_NODE(n);
237}
238
239static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root)
240{
241 if (root->left == n)
242 root->left = NULL;
243 rb_erase_init(n, &root->rb);
244 --root->count;
245}
246
247static void update_min_dispatch_time(struct throtl_rb_root *st)
248{
249 struct throtl_grp *tg;
250
251 tg = throtl_rb_first(st);
252 if (!tg)
253 return;
254
255 st->min_disptime = tg->disptime;
256}
257
258static void
259tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
260{
261 struct rb_node **node = &st->rb.rb_node;
262 struct rb_node *parent = NULL;
263 struct throtl_grp *__tg;
264 unsigned long key = tg->disptime;
265 int left = 1;
266
267 while (*node != NULL) {
268 parent = *node;
269 __tg = rb_entry_tg(parent);
270
271 if (time_before(key, __tg->disptime))
272 node = &parent->rb_left;
273 else {
274 node = &parent->rb_right;
275 left = 0;
276 }
277 }
278
279 if (left)
280 st->left = &tg->rb_node;
281
282 rb_link_node(&tg->rb_node, parent, node);
283 rb_insert_color(&tg->rb_node, &st->rb);
284}
285
286static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
287{
288 struct throtl_rb_root *st = &td->tg_service_tree;
289
290 tg_service_tree_add(st, tg);
291 throtl_mark_tg_on_rr(tg);
292 st->count++;
293}
294
295static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
296{
297 if (!throtl_tg_on_rr(tg))
298 __throtl_enqueue_tg(td, tg);
299}
300
301static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
302{
303 throtl_rb_erase(&tg->rb_node, &td->tg_service_tree);
304 throtl_clear_tg_on_rr(tg);
305}
306
307static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
308{
309 if (throtl_tg_on_rr(tg))
310 __throtl_dequeue_tg(td, tg);
311}
312
313static void throtl_schedule_next_dispatch(struct throtl_data *td)
314{
315 struct throtl_rb_root *st = &td->tg_service_tree;
316
317 /*
318 * If there are more bios pending, schedule more work.
319 */
320 if (!total_nr_queued(td))
321 return;
322
323 BUG_ON(!st->count);
324
325 update_min_dispatch_time(st);
326
327 if (time_before_eq(st->min_disptime, jiffies))
328 throtl_schedule_delayed_work(td->queue, 0);
329 else
330 throtl_schedule_delayed_work(td->queue,
331 (st->min_disptime - jiffies));
332}
333
334static inline void
335throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
336{
337 tg->bytes_disp[rw] = 0;
338 tg->slice_start[rw] = jiffies;
339 tg->slice_end[rw] = jiffies + throtl_slice;
340 throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
341 rw == READ ? 'R' : 'W', tg->slice_start[rw],
342 tg->slice_end[rw], jiffies);
343}
344
345static inline void throtl_extend_slice(struct throtl_data *td,
346 struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
347{
348 tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
349 throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
350 rw == READ ? 'R' : 'W', tg->slice_start[rw],
351 tg->slice_end[rw], jiffies);
352}
353
354/* Determine if previously allocated or extended slice is complete or not */
355static bool
356throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
357{
358 if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
359 return 0;
360
361 return 1;
362}
363
364/* Trim the used slices and adjust slice start accordingly */
365static inline void
366throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
367{
368 unsigned long nr_slices, bytes_trim, time_elapsed;
369
370 BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));
371
372 /*
373 * If bps are unlimited (-1), then time slice don't get
374 * renewed. Don't try to trim the slice if slice is used. A new
375 * slice will start when appropriate.
376 */
377 if (throtl_slice_used(td, tg, rw))
378 return;
379
380 time_elapsed = jiffies - tg->slice_start[rw];
381
382 nr_slices = time_elapsed / throtl_slice;
383
384 if (!nr_slices)
385 return;
386
387 bytes_trim = (tg->bps[rw] * throtl_slice * nr_slices)/HZ;
388
389 if (!bytes_trim)
390 return;
391
392 if (tg->bytes_disp[rw] >= bytes_trim)
393 tg->bytes_disp[rw] -= bytes_trim;
394 else
395 tg->bytes_disp[rw] = 0;
396
397 tg->slice_start[rw] += nr_slices * throtl_slice;
398
399 throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%lu"
400 " start=%lu end=%lu jiffies=%lu",
401 rw == READ ? 'R' : 'W', nr_slices, bytes_trim,
402 tg->slice_start[rw], tg->slice_end[rw], jiffies);
403}
404
405/*
406 * Returns whether one can dispatch a bio or not. Also returns approx number
407 * of jiffies to wait before this bio is with-in IO rate and can be dispatched
408 */
409static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
410 struct bio *bio, unsigned long *wait)
411{
412 bool rw = bio_data_dir(bio);
413 u64 bytes_allowed, extra_bytes;
414 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
415
416 /*
417 * Currently whole state machine of group depends on first bio
418 * queued in the group bio list. So one should not be calling
419 * this function with a different bio if there are other bios
420 * queued.
421 */
422 BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw]));
423
424 /* If tg->bps = -1, then BW is unlimited */
425 if (tg->bps[rw] == -1) {
426 if (wait)
427 *wait = 0;
428 return 1;
429 }
430
431 /*
432 * If previous slice expired, start a new one otherwise renew/extend
433 * existing slice to make sure it is at least throtl_slice interval
434 * long since now.
435 */
436 if (throtl_slice_used(td, tg, rw))
437 throtl_start_new_slice(td, tg, rw);
438 else {
439 if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
440 throtl_extend_slice(td, tg, rw, jiffies + throtl_slice);
441 }
442
443 jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
444
445 /* Slice has just started. Consider one slice interval */
446 if (!jiffy_elapsed)
447 jiffy_elapsed_rnd = throtl_slice;
448
449 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
450
451 bytes_allowed = (tg->bps[rw] * jiffies_to_msecs(jiffy_elapsed_rnd))
452 / MSEC_PER_SEC;
453
454 if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) {
455 if (wait)
456 *wait = 0;
457 return 1;
458 }
459
460 /* Calc approx time to dispatch */
461 extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed;
462 jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);
463
464 if (!jiffy_wait)
465 jiffy_wait = 1;
466
467 /*
468 * This wait time is without taking into consideration the rounding
469 * up we did. Add that time also.
470 */
471 jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
472
473 if (wait)
474 *wait = jiffy_wait;
475
476 if (time_before(tg->slice_end[rw], jiffies + jiffy_wait))
477 throtl_extend_slice(td, tg, rw, jiffies + jiffy_wait);
478
479 return 0;
480}
481
482static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
483{
484 bool rw = bio_data_dir(bio);
485 bool sync = bio->bi_rw & REQ_SYNC;
486
487 /* Charge the bio to the group */
488 tg->bytes_disp[rw] += bio->bi_size;
489
490 /*
491 * TODO: This will take blkg->stats_lock. Figure out a way
492 * to avoid this cost.
493 */
494 blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync);
495
496}
497
498static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
499 struct bio *bio)
500{
501 bool rw = bio_data_dir(bio);
502
503 bio_list_add(&tg->bio_lists[rw], bio);
504 /* Take a bio reference on tg */
505 throtl_ref_get_tg(tg);
506 tg->nr_queued[rw]++;
507 td->nr_queued[rw]++;
508 throtl_enqueue_tg(td, tg);
509}
510
511static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg)
512{
513 unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
514 struct bio *bio;
515
516 if ((bio = bio_list_peek(&tg->bio_lists[READ])))
517 tg_may_dispatch(td, tg, bio, &read_wait);
518
519 if ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
520 tg_may_dispatch(td, tg, bio, &write_wait);
521
522 min_wait = min(read_wait, write_wait);
523 disptime = jiffies + min_wait;
524
525 /*
526 * If group is already on active tree, then update dispatch time
527 * only if it is lesser than existing dispatch time. Otherwise
528 * always update the dispatch time
529 */
530
531 if (throtl_tg_on_rr(tg) && time_before(disptime, tg->disptime))
532 return;
533
534 /* Update dispatch time */
535 throtl_dequeue_tg(td, tg);
536 tg->disptime = disptime;
537 throtl_enqueue_tg(td, tg);
538}
539
540static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
541 bool rw, struct bio_list *bl)
542{
543 struct bio *bio;
544
545 bio = bio_list_pop(&tg->bio_lists[rw]);
546 tg->nr_queued[rw]--;
547 /* Drop bio reference on tg */
548 throtl_put_tg(tg);
549
550 BUG_ON(td->nr_queued[rw] <= 0);
551 td->nr_queued[rw]--;
552
553 throtl_charge_bio(tg, bio);
554 bio_list_add(bl, bio);
555 bio->bi_rw |= REQ_THROTTLED;
556
557 throtl_trim_slice(td, tg, rw);
558}
559
560static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
561 struct bio_list *bl)
562{
563 unsigned int nr_reads = 0, nr_writes = 0;
564 unsigned int max_nr_reads = throtl_grp_quantum*3/4;
565 unsigned int max_nr_writes = throtl_grp_quantum - nr_reads;
566 struct bio *bio;
567
568 /* Try to dispatch 75% READS and 25% WRITES */
569
570 while ((bio = bio_list_peek(&tg->bio_lists[READ]))
571 && tg_may_dispatch(td, tg, bio, NULL)) {
572
573 tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
574 nr_reads++;
575
576 if (nr_reads >= max_nr_reads)
577 break;
578 }
579
580 while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))
581 && tg_may_dispatch(td, tg, bio, NULL)) {
582
583 tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
584 nr_writes++;
585
586 if (nr_writes >= max_nr_writes)
587 break;
588 }
589
590 return nr_reads + nr_writes;
591}
592
593static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
594{
595 unsigned int nr_disp = 0;
596 struct throtl_grp *tg;
597 struct throtl_rb_root *st = &td->tg_service_tree;
598
599 while (1) {
600 tg = throtl_rb_first(st);
601
602 if (!tg)
603 break;
604
605 if (time_before(jiffies, tg->disptime))
606 break;
607
608 throtl_dequeue_tg(td, tg);
609
610 nr_disp += throtl_dispatch_tg(td, tg, bl);
611
612 if (tg->nr_queued[0] || tg->nr_queued[1]) {
613 tg_update_disptime(td, tg);
614 throtl_enqueue_tg(td, tg);
615 }
616
617 if (nr_disp >= throtl_quantum)
618 break;
619 }
620
621 return nr_disp;
622}
623
624/* Dispatch throttled bios. Should be called without queue lock held. */
625static int throtl_dispatch(struct request_queue *q)
626{
627 struct throtl_data *td = q->td;
628 unsigned int nr_disp = 0;
629 struct bio_list bio_list_on_stack;
630 struct bio *bio;
631
632 spin_lock_irq(q->queue_lock);
633
634 if (!total_nr_queued(td))
635 goto out;
636
637 bio_list_init(&bio_list_on_stack);
638
639 throtl_log(td, "dispatch nr_queued=%lu read=%u write=%u",
640 total_nr_queued(td), td->nr_queued[READ],
641 td->nr_queued[WRITE]);
642
643 nr_disp = throtl_select_dispatch(td, &bio_list_on_stack);
644
645 if (nr_disp)
646 throtl_log(td, "bios disp=%u", nr_disp);
647
648 throtl_schedule_next_dispatch(td);
649out:
650 spin_unlock_irq(q->queue_lock);
651
652 /*
653 * If we dispatched some requests, unplug the queue to make sure
654 * immediate dispatch
655 */
656 if (nr_disp) {
657 while((bio = bio_list_pop(&bio_list_on_stack)))
658 generic_make_request(bio);
659 blk_unplug(q);
660 }
661 return nr_disp;
662}
663
664void blk_throtl_work(struct work_struct *work)
665{
666 struct throtl_data *td = container_of(work, struct throtl_data,
667 throtl_work.work);
668 struct request_queue *q = td->queue;
669
670 throtl_dispatch(q);
671}
672
673/* Call with queue lock held */
674void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay)
675{
676
677 struct throtl_data *td = q->td;
678 struct delayed_work *dwork = &td->throtl_work;
679
680 if (total_nr_queued(td) > 0) {
681 /*
682 * We might have a work scheduled to be executed in future.
683 * Cancel that and schedule a new one.
684 */
685 __cancel_delayed_work(dwork);
686 kblockd_schedule_delayed_work(q, dwork, delay);
687 throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
688 delay, jiffies);
689 }
690}
691EXPORT_SYMBOL(throtl_schedule_delayed_work);
692
693static void
694throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg)
695{
696 /* Something wrong if we are trying to remove same group twice */
697 BUG_ON(hlist_unhashed(&tg->tg_node));
698
699 hlist_del_init(&tg->tg_node);
700
701 /*
702 * Put the reference taken at the time of creation so that when all
703 * queues are gone, group can be destroyed.
704 */
705 throtl_put_tg(tg);
706 td->nr_undestroyed_grps--;
707}
708
709static void throtl_release_tgs(struct throtl_data *td)
710{
711 struct hlist_node *pos, *n;
712 struct throtl_grp *tg;
713
714 hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
715 /*
716 * If cgroup removal path got to blk_group first and removed
717 * it from cgroup list, then it will take care of destroying
718 * cfqg also.
719 */
720 if (!blkiocg_del_blkio_group(&tg->blkg))
721 throtl_destroy_tg(td, tg);
722 }
723}
724
725static void throtl_td_free(struct throtl_data *td)
726{
727 kfree(td);
728}
729
730/*
731 * Blk cgroup controller notification saying that blkio_group object is being
732 * delinked as associated cgroup object is going away. That also means that
733 * no new IO will come in this group. So get rid of this group as soon as
734 * any pending IO in the group is finished.
735 *
736 * This function is called under rcu_read_lock(). key is the rcu protected
737 * pointer. That means "key" is a valid throtl_data pointer as long as we are
738 * rcu read lock.
739 *
740 * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
741 * it should not be NULL as even if queue was going away, cgroup deltion
742 * path got to it first.
743 */
744void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg)
745{
746 unsigned long flags;
747 struct throtl_data *td = key;
748
749 spin_lock_irqsave(td->queue->queue_lock, flags);
750 throtl_destroy_tg(td, tg_of_blkg(blkg));
751 spin_unlock_irqrestore(td->queue->queue_lock, flags);
752}
753
754static void throtl_update_blkio_group_read_bps (struct blkio_group *blkg,
755 u64 read_bps)
756{
757 tg_of_blkg(blkg)->bps[READ] = read_bps;
758}
759
760static void throtl_update_blkio_group_write_bps (struct blkio_group *blkg,
761 u64 write_bps)
762{
763 tg_of_blkg(blkg)->bps[WRITE] = write_bps;
764}
765
766void throtl_shutdown_timer_wq(struct request_queue *q)
767{
768 struct throtl_data *td = q->td;
769
770 cancel_delayed_work_sync(&td->throtl_work);
771}
772
773static struct blkio_policy_type blkio_policy_throtl = {
774 .ops = {
775 .blkio_unlink_group_fn = throtl_unlink_blkio_group,
776 .blkio_update_group_read_bps_fn =
777 throtl_update_blkio_group_read_bps,
778 .blkio_update_group_write_bps_fn =
779 throtl_update_blkio_group_write_bps,
780 },
781};
782
783int blk_throtl_bio(struct request_queue *q, struct bio **biop)
784{
785 struct throtl_data *td = q->td;
786 struct throtl_grp *tg;
787 struct bio *bio = *biop;
788 bool rw = bio_data_dir(bio), update_disptime = true;
789
790 if (bio->bi_rw & REQ_THROTTLED) {
791 bio->bi_rw &= ~REQ_THROTTLED;
792 return 0;
793 }
794
795 spin_lock_irq(q->queue_lock);
796 tg = throtl_get_tg(td);
797
798 if (tg->nr_queued[rw]) {
799 /*
800 * There is already another bio queued in same dir. No
801 * need to update dispatch time.
802 */
803 update_disptime = false;
804 goto queue_bio;
805 }
806
807 /* Bio is with-in rate limit of group */
808 if (tg_may_dispatch(td, tg, bio, NULL)) {
809 throtl_charge_bio(tg, bio);
810 goto out;
811 }
812
813queue_bio:
814 throtl_log_tg(td, tg, "[%c] bio. disp=%u sz=%u bps=%llu"
815 " queued=%d/%d", rw == READ ? 'R' : 'W',
816 tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
817 tg->nr_queued[READ], tg->nr_queued[WRITE]);
818
819 throtl_add_bio_tg(q->td, tg, bio);
820 *biop = NULL;
821
822 if (update_disptime) {
823 tg_update_disptime(td, tg);
824 throtl_schedule_next_dispatch(td);
825 }
826
827out:
828 spin_unlock_irq(q->queue_lock);
829 return 0;
830}
831
832int blk_throtl_init(struct request_queue *q)
833{
834 struct throtl_data *td;
835 struct throtl_grp *tg;
836
837 td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
838 if (!td)
839 return -ENOMEM;
840
841 INIT_HLIST_HEAD(&td->tg_list);
842 td->tg_service_tree = THROTL_RB_ROOT;
843
844 /* Init root group */
845 tg = &td->root_tg;
846 INIT_HLIST_NODE(&tg->tg_node);
847 RB_CLEAR_NODE(&tg->rb_node);
848 bio_list_init(&tg->bio_lists[0]);
849 bio_list_init(&tg->bio_lists[1]);
850
851 /* Practically unlimited BW */
852 tg->bps[0] = tg->bps[1] = -1;
853 atomic_set(&tg->ref, 1);
854
855 INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
856
857 rcu_read_lock();
858 blkiocg_add_blkio_group(&blkio_root_cgroup, &tg->blkg, (void *)td,
859 0, BLKIO_POLICY_THROTL);
860 rcu_read_unlock();
861
862 /* Attach throtl data to request queue */
863 td->queue = q;
864 q->td = td;
865 return 0;
866}
867
868void blk_throtl_exit(struct request_queue *q)
869{
870 struct throtl_data *td = q->td;
871 bool wait = false;
872
873 BUG_ON(!td);
874
875 throtl_shutdown_timer_wq(q);
876
877 spin_lock_irq(q->queue_lock);
878 throtl_release_tgs(td);
879 blkiocg_del_blkio_group(&td->root_tg.blkg);
880
881 /* If there are other groups */
882 if (td->nr_undestroyed_grps >= 1)
883 wait = true;
884
885 spin_unlock_irq(q->queue_lock);
886
887 /*
888 * Wait for tg->blkg->key accessors to exit their grace periods.
889 * Do this wait only if there are other undestroyed groups out
890 * there (other than root group). This can happen if cgroup deletion
891 * path claimed the responsibility of cleaning up a group before
892 * queue cleanup code get to the group.
893 *
894 * Do not call synchronize_rcu() unconditionally as there are drivers
895 * which create/delete request queue hundreds of times during scan/boot
896 * and synchronize_rcu() can take significant time and slow down boot.
897 */
898 if (wait)
899 synchronize_rcu();
900 throtl_td_free(td);
901}
902
903static int __init throtl_init(void)
904{
905 blkio_policy_register(&blkio_policy_throtl);
906 return 0;
907}
908
909module_init(throtl_init);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index ca83a97c9715..10a0c291b55a 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -130,6 +130,8 @@ enum rq_flag_bits {
130 /* bio only flags */ 130 /* bio only flags */
131 __REQ_UNPLUG, /* unplug the immediately after submission */ 131 __REQ_UNPLUG, /* unplug the immediately after submission */
132 __REQ_RAHEAD, /* read ahead, can fail anytime */ 132 __REQ_RAHEAD, /* read ahead, can fail anytime */
133 __REQ_THROTTLED, /* This bio has already been subjected to
134 * throttling rules. Don't do it again. */
133 135
134 /* request only flags */ 136 /* request only flags */
135 __REQ_SORTED, /* elevator knows about this request */ 137 __REQ_SORTED, /* elevator knows about this request */
@@ -172,6 +174,7 @@ enum rq_flag_bits {
172 174
173#define REQ_UNPLUG (1 << __REQ_UNPLUG) 175#define REQ_UNPLUG (1 << __REQ_UNPLUG)
174#define REQ_RAHEAD (1 << __REQ_RAHEAD) 176#define REQ_RAHEAD (1 << __REQ_RAHEAD)
177#define REQ_THROTTLED (1 << __REQ_THROTTLED)
175 178
176#define REQ_SORTED (1 << __REQ_SORTED) 179#define REQ_SORTED (1 << __REQ_SORTED)
177#define REQ_SOFTBARRIER (1 << __REQ_SOFTBARRIER) 180#define REQ_SOFTBARRIER (1 << __REQ_SOFTBARRIER)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 780824edac16..1341df5806df 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -371,6 +371,11 @@ struct request_queue
371#if defined(CONFIG_BLK_DEV_BSG) 371#if defined(CONFIG_BLK_DEV_BSG)
372 struct bsg_class_device bsg_dev; 372 struct bsg_class_device bsg_dev;
373#endif 373#endif
374
375#ifdef CONFIG_BLK_DEV_THROTTLING
376 /* Throttle data */
377 struct throtl_data *td;
378#endif
374}; 379};
375 380
376#define QUEUE_FLAG_CLUSTER 0 /* cluster several segments into 1 */ 381#define QUEUE_FLAG_CLUSTER 0 /* cluster several segments into 1 */
@@ -1131,6 +1136,7 @@ static inline void put_dev_sector(Sector p)
1131 1136
1132struct work_struct; 1137struct work_struct;
1133int kblockd_schedule_work(struct request_queue *q, struct work_struct *work); 1138int kblockd_schedule_work(struct request_queue *q, struct work_struct *work);
1139int kblockd_schedule_delayed_work(struct request_queue *q, struct delayed_work *dwork, unsigned long delay);
1134 1140
1135#ifdef CONFIG_BLK_CGROUP 1141#ifdef CONFIG_BLK_CGROUP
1136/* 1142/*
@@ -1174,6 +1180,24 @@ static inline uint64_t rq_io_start_time_ns(struct request *req)
1174} 1180}
1175#endif 1181#endif
1176 1182
1183#ifdef CONFIG_BLK_DEV_THROTTLING
1184extern int blk_throtl_init(struct request_queue *q);
1185extern void blk_throtl_exit(struct request_queue *q);
1186extern int blk_throtl_bio(struct request_queue *q, struct bio **bio);
1187extern void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay);
1188extern void throtl_shutdown_timer_wq(struct request_queue *q);
1189#else /* CONFIG_BLK_DEV_THROTTLING */
1190static inline int blk_throtl_bio(struct request_queue *q, struct bio **bio)
1191{
1192 return 0;
1193}
1194
1195static inline int blk_throtl_init(struct request_queue *q) { return 0; }
1196static inline int blk_throtl_exit(struct request_queue *q) { return 0; }
1197static inline void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay) {}
1198static inline void throtl_shutdown_timer_wq(struct request_queue *q) {}
1199#endif /* CONFIG_BLK_DEV_THROTTLING */
1200
1177#define MODULE_ALIAS_BLOCKDEV(major,minor) \ 1201#define MODULE_ALIAS_BLOCKDEV(major,minor) \
1178 MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor)) 1202 MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor))
1179#define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \ 1203#define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \
diff --git a/init/Kconfig b/init/Kconfig
index 2de5b1cbadd9..950ba26f7233 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -634,11 +634,14 @@ config BLK_CGROUP
634 634
635 Currently, CFQ IO scheduler uses it to recognize task groups and 635 Currently, CFQ IO scheduler uses it to recognize task groups and
636 control disk bandwidth allocation (proportional time slice allocation) 636 control disk bandwidth allocation (proportional time slice allocation)
637 to such task groups. 637 to such task groups. It is also used by bio throttling logic in
638 block layer to implement upper limit in IO rates on a device.
638 639
639 This option only enables generic Block IO controller infrastructure. 640 This option only enables generic Block IO controller infrastructure.
640 One needs to also enable actual IO controlling logic in CFQ for it 641 One needs to also enable actual IO controlling logic/policy. For
641 to take effect. (CONFIG_CFQ_GROUP_IOSCHED=y). 642 enabling proportional weight division of disk bandwidth in CFQ seti
643 CONFIG_CFQ_GROUP_IOSCHED=y and for enabling throttling policy set
644 CONFIG_BLK_THROTTLE=y.
642 645
643 See Documentation/cgroups/blkio-controller.txt for more information. 646 See Documentation/cgroups/blkio-controller.txt for more information.
644 647