-rw-r--r--  block/Kconfig.iosched           4
-rw-r--r--  block/blk-cgroup.c           2100
-rw-r--r--  block/blk-cgroup.h            647
-rw-r--r--  block/blk-core.c              281
-rw-r--r--  block/blk-ioc.c               126
-rw-r--r--  block/blk-sysfs.c               6
-rw-r--r--  block/blk-throttle.c          697
-rw-r--r--  block/blk.h                    32
-rw-r--r--  block/cfq-iosched.c          1072
-rw-r--r--  block/cfq.h                   115
-rw-r--r--  block/deadline-iosched.c        8
-rw-r--r--  block/elevator.c              121
-rw-r--r--  block/noop-iosched.c            8
-rw-r--r--  fs/bio.c                       61
-rw-r--r--  fs/ioprio.c                     2
-rw-r--r--  fs/splice.c                     4
-rw-r--r--  include/linux/bio.h             8
-rw-r--r--  include/linux/blk_types.h      10
-rw-r--r--  include/linux/blkdev.h         20
-rw-r--r--  include/linux/elevator.h        8
-rw-r--r--  include/linux/iocontext.h      39
-rw-r--r--  include/linux/ioprio.h         22
-rw-r--r--  init/Kconfig                    2
-rw-r--r--  kernel/fork.c                   5
24 files changed, 2446 insertions, 2952 deletions
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 3199b76f795d..421bef9c4c48 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -23,8 +23,6 @@ config IOSCHED_DEADLINE
 
 config IOSCHED_CFQ
 	tristate "CFQ I/O scheduler"
-	# If BLK_CGROUP is a module, CFQ has to be built as module.
-	depends on (BLK_CGROUP=m && m) || !BLK_CGROUP || BLK_CGROUP=y
 	default y
 	---help---
 	  The CFQ I/O scheduler tries to distribute bandwidth equally
@@ -34,8 +32,6 @@ config IOSCHED_CFQ
 
 	  This is the default I/O scheduler.
 
-	  Note: If BLK_CGROUP=m, then CFQ can be built only as module.
-
 config CFQ_GROUP_IOSCHED
 	bool "CFQ Group Scheduling support"
 	depends on IOSCHED_CFQ && BLK_CGROUP
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 126c341955de..02cf6335e9bd 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -11,1570 +11,612 @@
11 * Nauman Rafique <nauman@google.com> 11 * Nauman Rafique <nauman@google.com>
12 */ 12 */
13#include <linux/ioprio.h> 13#include <linux/ioprio.h>
14#include <linux/seq_file.h>
15#include <linux/kdev_t.h> 14#include <linux/kdev_t.h>
16#include <linux/module.h> 15#include <linux/module.h>
17#include <linux/err.h> 16#include <linux/err.h>
18#include <linux/blkdev.h> 17#include <linux/blkdev.h>
19#include <linux/slab.h> 18#include <linux/slab.h>
20#include "blk-cgroup.h"
21#include <linux/genhd.h> 19#include <linux/genhd.h>
20#include <linux/delay.h>
21#include <linux/atomic.h>
22#include "blk-cgroup.h"
23#include "blk.h"
22 24
23#define MAX_KEY_LEN 100 25#define MAX_KEY_LEN 100
24 26
25static DEFINE_SPINLOCK(blkio_list_lock); 27static DEFINE_MUTEX(blkcg_pol_mutex);
26static LIST_HEAD(blkio_list);
27 28
28struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT }; 29struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT };
29EXPORT_SYMBOL_GPL(blkio_root_cgroup); 30EXPORT_SYMBOL_GPL(blkcg_root);
30 31
31/* for encoding cft->private value on file */ 32static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
32#define BLKIOFILE_PRIVATE(x, val) (((x) << 16) | (val))
33/* What policy owns the file, proportional or throttle */
34#define BLKIOFILE_POLICY(val) (((val) >> 16) & 0xffff)
35#define BLKIOFILE_ATTR(val) ((val) & 0xffff)
36 33
37static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg, 34struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup)
38 struct blkio_policy_node *pn)
39{ 35{
40 list_add(&pn->node, &blkcg->policy_list); 36 return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
37 struct blkcg, css);
41} 38}
39EXPORT_SYMBOL_GPL(cgroup_to_blkcg);
42 40
43static inline bool cftype_blkg_same_policy(struct cftype *cft, 41static struct blkcg *task_blkcg(struct task_struct *tsk)
44 struct blkio_group *blkg)
45{ 42{
46 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); 43 return container_of(task_subsys_state(tsk, blkio_subsys_id),
47 44 struct blkcg, css);
48 if (blkg->plid == plid)
49 return 1;
50
51 return 0;
52} 45}
53 46
54/* Determines if policy node matches cgroup file being accessed */ 47struct blkcg *bio_blkcg(struct bio *bio)
55static inline bool pn_matches_cftype(struct cftype *cft,
56 struct blkio_policy_node *pn)
57{ 48{
58 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); 49 if (bio && bio->bi_css)
59 int fileid = BLKIOFILE_ATTR(cft->private); 50 return container_of(bio->bi_css, struct blkcg, css);
60 51 return task_blkcg(current);
61 return (plid == pn->plid && fileid == pn->fileid);
62} 52}
53EXPORT_SYMBOL_GPL(bio_blkcg);
63 54
64/* Must be called with blkcg->lock held */ 55static bool blkcg_policy_enabled(struct request_queue *q,
65static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) 56 const struct blkcg_policy *pol)
66{ 57{
67 list_del(&pn->node); 58 return pol && test_bit(pol->plid, q->blkcg_pols);
68} 59}
69 60
70/* Must be called with blkcg->lock held */ 61/**
71static struct blkio_policy_node * 62 * blkg_free - free a blkg
72blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev, 63 * @blkg: blkg to free
73 enum blkio_policy_id plid, int fileid) 64 *
65 * Free @blkg which may be partially allocated.
66 */
67static void blkg_free(struct blkcg_gq *blkg)
74{ 68{
75 struct blkio_policy_node *pn; 69 int i;
76
77 list_for_each_entry(pn, &blkcg->policy_list, node) {
78 if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid)
79 return pn;
80 }
81 70
82 return NULL; 71 if (!blkg)
83} 72 return;
84 73
85struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) 74 for (i = 0; i < BLKCG_MAX_POLS; i++) {
86{ 75 struct blkcg_policy *pol = blkcg_policy[i];
87 return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), 76 struct blkg_policy_data *pd = blkg->pd[i];
88 struct blkio_cgroup, css);
89}
90EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
91 77
92struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk) 78 if (!pd)
93{ 79 continue;
94 return container_of(task_subsys_state(tsk, blkio_subsys_id),
95 struct blkio_cgroup, css);
96}
97EXPORT_SYMBOL_GPL(task_blkio_cgroup);
98 80
99static inline void 81 if (pol && pol->pd_exit_fn)
100blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight) 82 pol->pd_exit_fn(blkg);
101{
102 struct blkio_policy_type *blkiop;
103 83
104 list_for_each_entry(blkiop, &blkio_list, list) { 84 kfree(pd);
105 /* If this policy does not own the blkg, do not send updates */
106 if (blkiop->plid != blkg->plid)
107 continue;
108 if (blkiop->ops.blkio_update_group_weight_fn)
109 blkiop->ops.blkio_update_group_weight_fn(blkg->key,
110 blkg, weight);
111 } 85 }
86
87 kfree(blkg);
112} 88}
113 89
114static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps, 90/**
115 int fileid) 91 * blkg_alloc - allocate a blkg
92 * @blkcg: block cgroup the new blkg is associated with
93 * @q: request_queue the new blkg is associated with
94 *
 95 * Allocate a new blkg associating @blkcg and @q.
96 */
97static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q)
116{ 98{
117 struct blkio_policy_type *blkiop; 99 struct blkcg_gq *blkg;
118 100 int i;
119 list_for_each_entry(blkiop, &blkio_list, list) {
120
121 /* If this policy does not own the blkg, do not send updates */
122 if (blkiop->plid != blkg->plid)
123 continue;
124
125 if (fileid == BLKIO_THROTL_read_bps_device
126 && blkiop->ops.blkio_update_group_read_bps_fn)
127 blkiop->ops.blkio_update_group_read_bps_fn(blkg->key,
128 blkg, bps);
129 101
130 if (fileid == BLKIO_THROTL_write_bps_device 102 /* alloc and init base part */
131 && blkiop->ops.blkio_update_group_write_bps_fn) 103 blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node);
132 blkiop->ops.blkio_update_group_write_bps_fn(blkg->key, 104 if (!blkg)
133 blkg, bps); 105 return NULL;
134 }
135}
136 106
137static inline void blkio_update_group_iops(struct blkio_group *blkg, 107 blkg->q = q;
138 unsigned int iops, int fileid) 108 INIT_LIST_HEAD(&blkg->q_node);
139{ 109 blkg->blkcg = blkcg;
140 struct blkio_policy_type *blkiop; 110 blkg->refcnt = 1;
141 111
142 list_for_each_entry(blkiop, &blkio_list, list) { 112 for (i = 0; i < BLKCG_MAX_POLS; i++) {
113 struct blkcg_policy *pol = blkcg_policy[i];
114 struct blkg_policy_data *pd;
143 115
144 /* If this policy does not own the blkg, do not send updates */ 116 if (!blkcg_policy_enabled(q, pol))
145 if (blkiop->plid != blkg->plid)
146 continue; 117 continue;
147 118
148 if (fileid == BLKIO_THROTL_read_iops_device 119 /* alloc per-policy data and attach it to blkg */
149 && blkiop->ops.blkio_update_group_read_iops_fn) 120 pd = kzalloc_node(pol->pd_size, GFP_ATOMIC, q->node);
150 blkiop->ops.blkio_update_group_read_iops_fn(blkg->key, 121 if (!pd) {
151 blkg, iops); 122 blkg_free(blkg);
123 return NULL;
124 }
152 125
153 if (fileid == BLKIO_THROTL_write_iops_device 126 blkg->pd[i] = pd;
154 && blkiop->ops.blkio_update_group_write_iops_fn) 127 pd->blkg = blkg;
155 blkiop->ops.blkio_update_group_write_iops_fn(blkg->key,
156 blkg,iops);
157 } 128 }
158}
159 129
160/* 130 /* invoke per-policy init */
161 * Add to the appropriate stat variable depending on the request type. 131 for (i = 0; i < BLKCG_MAX_POLS; i++) {
162 * This should be called with the blkg->stats_lock held. 132 struct blkcg_policy *pol = blkcg_policy[i];
163 */
164static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
165 bool sync)
166{
167 if (direction)
168 stat[BLKIO_STAT_WRITE] += add;
169 else
170 stat[BLKIO_STAT_READ] += add;
171 if (sync)
172 stat[BLKIO_STAT_SYNC] += add;
173 else
174 stat[BLKIO_STAT_ASYNC] += add;
175}
176 133
177/* 134 if (blkcg_policy_enabled(blkg->q, pol))
178 * Decrements the appropriate stat variable if non-zero depending on the 135 pol->pd_init_fn(blkg);
179 * request type. Panics on value being zero.
180 * This should be called with the blkg->stats_lock held.
181 */
182static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
183{
184 if (direction) {
185 BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
186 stat[BLKIO_STAT_WRITE]--;
187 } else {
188 BUG_ON(stat[BLKIO_STAT_READ] == 0);
189 stat[BLKIO_STAT_READ]--;
190 }
191 if (sync) {
192 BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
193 stat[BLKIO_STAT_SYNC]--;
194 } else {
195 BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
196 stat[BLKIO_STAT_ASYNC]--;
197 } 136 }
198}
199 137
200#ifdef CONFIG_DEBUG_BLK_CGROUP 138 return blkg;
201/* This should be called with the blkg->stats_lock held. */
202static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
203 struct blkio_group *curr_blkg)
204{
205 if (blkio_blkg_waiting(&blkg->stats))
206 return;
207 if (blkg == curr_blkg)
208 return;
209 blkg->stats.start_group_wait_time = sched_clock();
210 blkio_mark_blkg_waiting(&blkg->stats);
211} 139}
212 140
213/* This should be called with the blkg->stats_lock held. */ 141static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
214static void blkio_update_group_wait_time(struct blkio_group_stats *stats) 142 struct request_queue *q)
215{ 143{
216 unsigned long long now; 144 struct blkcg_gq *blkg;
217 145
218 if (!blkio_blkg_waiting(stats)) 146 blkg = rcu_dereference(blkcg->blkg_hint);
219 return; 147 if (blkg && blkg->q == q)
148 return blkg;
220 149
221 now = sched_clock(); 150 /*
222 if (time_after64(now, stats->start_group_wait_time)) 151 * Hint didn't match. Look up from the radix tree. Note that we
223 stats->group_wait_time += now - stats->start_group_wait_time; 152 * may not be holding queue_lock and thus are not sure whether
224 blkio_clear_blkg_waiting(stats); 153 * @blkg from blkg_tree has already been removed or not, so we
154 * can't update hint to the lookup result. Leave it to the caller.
155 */
156 blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
157 if (blkg && blkg->q == q)
158 return blkg;
159
160 return NULL;
225} 161}
226 162
227/* This should be called with the blkg->stats_lock held. */ 163/**
228static void blkio_end_empty_time(struct blkio_group_stats *stats) 164 * blkg_lookup - lookup blkg for the specified blkcg - q pair
165 * @blkcg: blkcg of interest
166 * @q: request_queue of interest
167 *
168 * Lookup blkg for the @blkcg - @q pair. This function should be called
169 * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
170 * - see blk_queue_bypass_start() for details.
171 */
172struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q)
229{ 173{
230 unsigned long long now; 174 WARN_ON_ONCE(!rcu_read_lock_held());
231
232 if (!blkio_blkg_empty(stats))
233 return;
234 175
235 now = sched_clock(); 176 if (unlikely(blk_queue_bypass(q)))
236 if (time_after64(now, stats->start_empty_time)) 177 return NULL;
237 stats->empty_time += now - stats->start_empty_time; 178 return __blkg_lookup(blkcg, q);
238 blkio_clear_blkg_empty(stats);
239} 179}
180EXPORT_SYMBOL_GPL(blkg_lookup);
240 181
241void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) 182static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
183 struct request_queue *q)
184 __releases(q->queue_lock) __acquires(q->queue_lock)
242{ 185{
243 unsigned long flags; 186 struct blkcg_gq *blkg;
187 int ret;
244 188
245 spin_lock_irqsave(&blkg->stats_lock, flags); 189 WARN_ON_ONCE(!rcu_read_lock_held());
246 BUG_ON(blkio_blkg_idling(&blkg->stats)); 190 lockdep_assert_held(q->queue_lock);
247 blkg->stats.start_idle_time = sched_clock();
248 blkio_mark_blkg_idling(&blkg->stats);
249 spin_unlock_irqrestore(&blkg->stats_lock, flags);
250}
251EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);
252 191
253void blkiocg_update_idle_time_stats(struct blkio_group *blkg) 192 /* lookup and update hint on success, see __blkg_lookup() for details */
254{ 193 blkg = __blkg_lookup(blkcg, q);
255 unsigned long flags; 194 if (blkg) {
256 unsigned long long now; 195 rcu_assign_pointer(blkcg->blkg_hint, blkg);
257 struct blkio_group_stats *stats; 196 return blkg;
258
259 spin_lock_irqsave(&blkg->stats_lock, flags);
260 stats = &blkg->stats;
261 if (blkio_blkg_idling(stats)) {
262 now = sched_clock();
263 if (time_after64(now, stats->start_idle_time))
264 stats->idle_time += now - stats->start_idle_time;
265 blkio_clear_blkg_idling(stats);
266 } 197 }
267 spin_unlock_irqrestore(&blkg->stats_lock, flags);
268}
269EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);
270 198
271void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) 199 /* blkg holds a reference to blkcg */
272{ 200 if (!css_tryget(&blkcg->css))
273 unsigned long flags; 201 return ERR_PTR(-EINVAL);
274 struct blkio_group_stats *stats;
275
276 spin_lock_irqsave(&blkg->stats_lock, flags);
277 stats = &blkg->stats;
278 stats->avg_queue_size_sum +=
279 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
280 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
281 stats->avg_queue_size_samples++;
282 blkio_update_group_wait_time(stats);
283 spin_unlock_irqrestore(&blkg->stats_lock, flags);
284}
285EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);
286 202
287void blkiocg_set_start_empty_time(struct blkio_group *blkg) 203 /* allocate */
288{ 204 ret = -ENOMEM;
289 unsigned long flags; 205 blkg = blkg_alloc(blkcg, q);
290 struct blkio_group_stats *stats; 206 if (unlikely(!blkg))
207 goto err_put;
291 208
292 spin_lock_irqsave(&blkg->stats_lock, flags); 209 /* insert */
293 stats = &blkg->stats; 210 ret = radix_tree_preload(GFP_ATOMIC);
211 if (ret)
212 goto err_free;
294 213
295 if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] || 214 spin_lock(&blkcg->lock);
296 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) { 215 ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
297 spin_unlock_irqrestore(&blkg->stats_lock, flags); 216 if (likely(!ret)) {
298 return; 217 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
218 list_add(&blkg->q_node, &q->blkg_list);
299 } 219 }
220 spin_unlock(&blkcg->lock);
300 221
301 /* 222 radix_tree_preload_end();
302 * group is already marked empty. This can happen if cfqq got new
303 * request in parent group and moved to this group while being added
304 * to service tree. Just ignore the event and move on.
305 */
306 if(blkio_blkg_empty(stats)) {
307 spin_unlock_irqrestore(&blkg->stats_lock, flags);
308 return;
309 }
310 223
311 stats->start_empty_time = sched_clock(); 224 if (!ret)
312 blkio_mark_blkg_empty(stats); 225 return blkg;
313 spin_unlock_irqrestore(&blkg->stats_lock, flags); 226err_free:
227 blkg_free(blkg);
228err_put:
229 css_put(&blkcg->css);
230 return ERR_PTR(ret);
314} 231}
315EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);
316 232
317void blkiocg_update_dequeue_stats(struct blkio_group *blkg, 233struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
318 unsigned long dequeue) 234 struct request_queue *q)
319{ 235{
320 blkg->stats.dequeue += dequeue; 236 /*
321} 237 * This could be the first entry point of blkcg implementation and
322EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats); 238 * we shouldn't allow anything to go through for a bypassing queue.
323#else 239 */
324static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg, 240 if (unlikely(blk_queue_bypass(q)))
325 struct blkio_group *curr_blkg) {} 241 return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);
326static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {} 242 return __blkg_lookup_create(blkcg, q);
327#endif
328
329void blkiocg_update_io_add_stats(struct blkio_group *blkg,
330 struct blkio_group *curr_blkg, bool direction,
331 bool sync)
332{
333 unsigned long flags;
334
335 spin_lock_irqsave(&blkg->stats_lock, flags);
336 blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
337 sync);
338 blkio_end_empty_time(&blkg->stats);
339 blkio_set_start_group_wait_time(blkg, curr_blkg);
340 spin_unlock_irqrestore(&blkg->stats_lock, flags);
341} 243}
342EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats); 244EXPORT_SYMBOL_GPL(blkg_lookup_create);
343 245
344void blkiocg_update_io_remove_stats(struct blkio_group *blkg, 246static void blkg_destroy(struct blkcg_gq *blkg)
345 bool direction, bool sync)
346{ 247{
347 unsigned long flags; 248 struct request_queue *q = blkg->q;
249 struct blkcg *blkcg = blkg->blkcg;
348 250
349 spin_lock_irqsave(&blkg->stats_lock, flags); 251 lockdep_assert_held(q->queue_lock);
350 blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 252 lockdep_assert_held(&blkcg->lock);
351 direction, sync);
352 spin_unlock_irqrestore(&blkg->stats_lock, flags);
353}
354EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
355 253
356void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time, 254 /* Something wrong if we are trying to remove same group twice */
357 unsigned long unaccounted_time) 255 WARN_ON_ONCE(list_empty(&blkg->q_node));
358{ 256 WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
359 unsigned long flags;
360
361 spin_lock_irqsave(&blkg->stats_lock, flags);
362 blkg->stats.time += time;
363#ifdef CONFIG_DEBUG_BLK_CGROUP
364 blkg->stats.unaccounted_time += unaccounted_time;
365#endif
366 spin_unlock_irqrestore(&blkg->stats_lock, flags);
367}
368EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
369 257
370/* 258 radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
371 * should be called under rcu read lock or queue lock to make sure blkg pointer 259 list_del_init(&blkg->q_node);
372 * is valid. 260 hlist_del_init_rcu(&blkg->blkcg_node);
373 */
374void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
375 uint64_t bytes, bool direction, bool sync)
376{
377 struct blkio_group_stats_cpu *stats_cpu;
378 unsigned long flags;
379 261
380 /* 262 /*
381 * Disabling interrupts to provide mutual exclusion between two 263 * Both setting lookup hint to and clearing it from @blkg are done
382 * writes on same cpu. It probably is not needed for 64bit. Not 264 * under queue_lock. If it's not pointing to @blkg now, it never
383 * optimizing that case yet. 265 * will. Hint assignment itself can race safely.
384 */ 266 */
385 local_irq_save(flags); 267 if (rcu_dereference_raw(blkcg->blkg_hint) == blkg)
386 268 rcu_assign_pointer(blkcg->blkg_hint, NULL);
387 stats_cpu = this_cpu_ptr(blkg->stats_cpu);
388
389 u64_stats_update_begin(&stats_cpu->syncp);
390 stats_cpu->sectors += bytes >> 9;
391 blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED],
392 1, direction, sync);
393 blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES],
394 bytes, direction, sync);
395 u64_stats_update_end(&stats_cpu->syncp);
396 local_irq_restore(flags);
397}
398EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
399
400void blkiocg_update_completion_stats(struct blkio_group *blkg,
401 uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
402{
403 struct blkio_group_stats *stats;
404 unsigned long flags;
405 unsigned long long now = sched_clock();
406
407 spin_lock_irqsave(&blkg->stats_lock, flags);
408 stats = &blkg->stats;
409 if (time_after64(now, io_start_time))
410 blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
411 now - io_start_time, direction, sync);
412 if (time_after64(io_start_time, start_time))
413 blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
414 io_start_time - start_time, direction, sync);
415 spin_unlock_irqrestore(&blkg->stats_lock, flags);
416}
417EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
418
419/* Merged stats are per cpu. */
420void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
421 bool sync)
422{
423 struct blkio_group_stats_cpu *stats_cpu;
424 unsigned long flags;
425 269
426 /* 270 /*
427 * Disabling interrupts to provide mutual exclusion between two 271 * Put the reference taken at the time of creation so that when all
428 * writes on same cpu. It probably is not needed for 64bit. Not 272 * queues are gone, group can be destroyed.
429 * optimizing that case yet.
430 */ 273 */
431 local_irq_save(flags); 274 blkg_put(blkg);
432
433 stats_cpu = this_cpu_ptr(blkg->stats_cpu);
434
435 u64_stats_update_begin(&stats_cpu->syncp);
436 blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1,
437 direction, sync);
438 u64_stats_update_end(&stats_cpu->syncp);
439 local_irq_restore(flags);
440} 275}
441EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
442 276
443/* 277/**
444 * This function allocates the per cpu stats for blkio_group. Should be called 278 * blkg_destroy_all - destroy all blkgs associated with a request_queue
445 * from sleepable context as alloc_per_cpu() requires that. 279 * @q: request_queue of interest
280 *
281 * Destroy all blkgs associated with @q.
446 */ 282 */
447int blkio_alloc_blkg_stats(struct blkio_group *blkg) 283static void blkg_destroy_all(struct request_queue *q)
448{ 284{
449 /* Allocate memory for per cpu stats */ 285 struct blkcg_gq *blkg, *n;
450 blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
451 if (!blkg->stats_cpu)
452 return -ENOMEM;
453 return 0;
454}
455EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats);
456 286
457void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 287 lockdep_assert_held(q->queue_lock);
458 struct blkio_group *blkg, void *key, dev_t dev,
459 enum blkio_policy_id plid)
460{
461 unsigned long flags;
462
463 spin_lock_irqsave(&blkcg->lock, flags);
464 spin_lock_init(&blkg->stats_lock);
465 rcu_assign_pointer(blkg->key, key);
466 blkg->blkcg_id = css_id(&blkcg->css);
467 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
468 blkg->plid = plid;
469 spin_unlock_irqrestore(&blkcg->lock, flags);
470 /* Need to take css reference ? */
471 cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
472 blkg->dev = dev;
473}
474EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);
475 288
476static void __blkiocg_del_blkio_group(struct blkio_group *blkg) 289 list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
477{ 290 struct blkcg *blkcg = blkg->blkcg;
478 hlist_del_init_rcu(&blkg->blkcg_node);
479 blkg->blkcg_id = 0;
480}
481 291
482/* 292 spin_lock(&blkcg->lock);
483 * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1 293 blkg_destroy(blkg);
484 * indicating that blk_group was unhashed by the time we got to it. 294 spin_unlock(&blkcg->lock);
485 */
486int blkiocg_del_blkio_group(struct blkio_group *blkg)
487{
488 struct blkio_cgroup *blkcg;
489 unsigned long flags;
490 struct cgroup_subsys_state *css;
491 int ret = 1;
492
493 rcu_read_lock();
494 css = css_lookup(&blkio_subsys, blkg->blkcg_id);
495 if (css) {
496 blkcg = container_of(css, struct blkio_cgroup, css);
497 spin_lock_irqsave(&blkcg->lock, flags);
498 if (!hlist_unhashed(&blkg->blkcg_node)) {
499 __blkiocg_del_blkio_group(blkg);
500 ret = 0;
501 }
502 spin_unlock_irqrestore(&blkcg->lock, flags);
503 } 295 }
504
505 rcu_read_unlock();
506 return ret;
507} 296}
508EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);
509 297
510/* called under rcu_read_lock(). */ 298static void blkg_rcu_free(struct rcu_head *rcu_head)
511struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
512{ 299{
513 struct blkio_group *blkg; 300 blkg_free(container_of(rcu_head, struct blkcg_gq, rcu_head));
514 struct hlist_node *n;
515 void *__key;
516
517 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
518 __key = blkg->key;
519 if (__key == key)
520 return blkg;
521 }
522
523 return NULL;
524} 301}
525EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
526 302
527static void blkio_reset_stats_cpu(struct blkio_group *blkg) 303void __blkg_release(struct blkcg_gq *blkg)
528{ 304{
529 struct blkio_group_stats_cpu *stats_cpu; 305 /* release the extra blkcg reference this blkg has been holding */
530 int i, j, k; 306 css_put(&blkg->blkcg->css);
307
531 /* 308 /*
532 * Note: On 64 bit arch this should not be an issue. This has the 309 * A group is freed in rcu manner. But having an rcu lock does not
533 * possibility of returning some inconsistent value on 32bit arch 310 * mean that one can access all the fields of blkg and assume these
534 * as 64bit update on 32bit is non atomic. Taking care of this 311 * are valid. For example, don't try to follow throtl_data and
535 * corner case makes code very complicated, like sending IPIs to 312 * request queue links.
536 * cpus, taking care of stats of offline cpus etc.
537 * 313 *
 538 * reset stats is anyway more of a debug feature and this sounds a 314 * Having a reference to blkg under an rcu allows access to only
539 * corner case. So I am not complicating the code yet until and 315 * values local to groups like group stats and group rate limits
540 * unless this becomes a real issue.
541 */ 316 */
542 for_each_possible_cpu(i) { 317 call_rcu(&blkg->rcu_head, blkg_rcu_free);
543 stats_cpu = per_cpu_ptr(blkg->stats_cpu, i);
544 stats_cpu->sectors = 0;
545 for(j = 0; j < BLKIO_STAT_CPU_NR; j++)
546 for (k = 0; k < BLKIO_STAT_TOTAL; k++)
547 stats_cpu->stat_arr_cpu[j][k] = 0;
548 }
549} 318}
319EXPORT_SYMBOL_GPL(__blkg_release);
550 320
551static int 321static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype,
552blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) 322 u64 val)
553{ 323{
554 struct blkio_cgroup *blkcg; 324 struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
555 struct blkio_group *blkg; 325 struct blkcg_gq *blkg;
556 struct blkio_group_stats *stats;
557 struct hlist_node *n; 326 struct hlist_node *n;
558 uint64_t queued[BLKIO_STAT_TOTAL];
559 int i; 327 int i;
560#ifdef CONFIG_DEBUG_BLK_CGROUP
561 bool idling, waiting, empty;
562 unsigned long long now = sched_clock();
563#endif
564 328
565 blkcg = cgroup_to_blkio_cgroup(cgroup); 329 mutex_lock(&blkcg_pol_mutex);
566 spin_lock_irq(&blkcg->lock); 330 spin_lock_irq(&blkcg->lock);
567 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
568 spin_lock(&blkg->stats_lock);
569 stats = &blkg->stats;
570#ifdef CONFIG_DEBUG_BLK_CGROUP
571 idling = blkio_blkg_idling(stats);
572 waiting = blkio_blkg_waiting(stats);
573 empty = blkio_blkg_empty(stats);
574#endif
575 for (i = 0; i < BLKIO_STAT_TOTAL; i++)
576 queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
577 memset(stats, 0, sizeof(struct blkio_group_stats));
578 for (i = 0; i < BLKIO_STAT_TOTAL; i++)
579 stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
580#ifdef CONFIG_DEBUG_BLK_CGROUP
581 if (idling) {
582 blkio_mark_blkg_idling(stats);
583 stats->start_idle_time = now;
584 }
585 if (waiting) {
586 blkio_mark_blkg_waiting(stats);
587 stats->start_group_wait_time = now;
588 }
589 if (empty) {
590 blkio_mark_blkg_empty(stats);
591 stats->start_empty_time = now;
592 }
593#endif
594 spin_unlock(&blkg->stats_lock);
595
596 /* Reset Per cpu stats which don't take blkg->stats_lock */
597 blkio_reset_stats_cpu(blkg);
598 }
599
600 spin_unlock_irq(&blkcg->lock);
601 return 0;
602}
603
604static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str,
605 int chars_left, bool diskname_only)
606{
607 snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev));
608 chars_left -= strlen(str);
609 if (chars_left <= 0) {
610 printk(KERN_WARNING
611 "Possibly incorrect cgroup stat display format");
612 return;
613 }
614 if (diskname_only)
615 return;
616 switch (type) {
617 case BLKIO_STAT_READ:
618 strlcat(str, " Read", chars_left);
619 break;
620 case BLKIO_STAT_WRITE:
621 strlcat(str, " Write", chars_left);
622 break;
623 case BLKIO_STAT_SYNC:
624 strlcat(str, " Sync", chars_left);
625 break;
626 case BLKIO_STAT_ASYNC:
627 strlcat(str, " Async", chars_left);
628 break;
629 case BLKIO_STAT_TOTAL:
630 strlcat(str, " Total", chars_left);
631 break;
632 default:
633 strlcat(str, " Invalid", chars_left);
634 }
635}
636
637static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
638 struct cgroup_map_cb *cb, dev_t dev)
639{
640 blkio_get_key_name(0, dev, str, chars_left, true);
641 cb->fill(cb, str, val);
642 return val;
643}
644
645
646static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg,
647 enum stat_type_cpu type, enum stat_sub_type sub_type)
648{
649 int cpu;
650 struct blkio_group_stats_cpu *stats_cpu;
651 u64 val = 0, tval;
652
653 for_each_possible_cpu(cpu) {
654 unsigned int start;
655 stats_cpu = per_cpu_ptr(blkg->stats_cpu, cpu);
656
657 do {
658 start = u64_stats_fetch_begin(&stats_cpu->syncp);
659 if (type == BLKIO_STAT_CPU_SECTORS)
660 tval = stats_cpu->sectors;
661 else
662 tval = stats_cpu->stat_arr_cpu[type][sub_type];
663 } while(u64_stats_fetch_retry(&stats_cpu->syncp, start));
664
665 val += tval;
666 }
667
668 return val;
669}
670
671static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg,
672 struct cgroup_map_cb *cb, dev_t dev, enum stat_type_cpu type)
673{
674 uint64_t disk_total, val;
675 char key_str[MAX_KEY_LEN];
676 enum stat_sub_type sub_type;
677 331
678 if (type == BLKIO_STAT_CPU_SECTORS) { 332 /*
679 val = blkio_read_stat_cpu(blkg, type, 0); 333 * Note that stat reset is racy - it doesn't synchronize against
680 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb, dev); 334 * stat updates. This is a debug feature which shouldn't exist
681 } 335 * anyway. If you get hit by a race, retry.
682 336 */
683 for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL; 337 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
684 sub_type++) { 338 for (i = 0; i < BLKCG_MAX_POLS; i++) {
685 blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false); 339 struct blkcg_policy *pol = blkcg_policy[i];
686 val = blkio_read_stat_cpu(blkg, type, sub_type);
687 cb->fill(cb, key_str, val);
688 }
689
690 disk_total = blkio_read_stat_cpu(blkg, type, BLKIO_STAT_READ) +
691 blkio_read_stat_cpu(blkg, type, BLKIO_STAT_WRITE);
692
693 blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
694 cb->fill(cb, key_str, disk_total);
695 return disk_total;
696}
697
698/* This should be called with blkg->stats_lock held */
699static uint64_t blkio_get_stat(struct blkio_group *blkg,
700 struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
701{
702 uint64_t disk_total;
703 char key_str[MAX_KEY_LEN];
704 enum stat_sub_type sub_type;
705
706 if (type == BLKIO_STAT_TIME)
707 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
708 blkg->stats.time, cb, dev);
709#ifdef CONFIG_DEBUG_BLK_CGROUP
710 if (type == BLKIO_STAT_UNACCOUNTED_TIME)
711 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
712 blkg->stats.unaccounted_time, cb, dev);
713 if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
714 uint64_t sum = blkg->stats.avg_queue_size_sum;
715 uint64_t samples = blkg->stats.avg_queue_size_samples;
716 if (samples)
717 do_div(sum, samples);
718 else
719 sum = 0;
720 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev);
721 }
722 if (type == BLKIO_STAT_GROUP_WAIT_TIME)
723 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
724 blkg->stats.group_wait_time, cb, dev);
725 if (type == BLKIO_STAT_IDLE_TIME)
726 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
727 blkg->stats.idle_time, cb, dev);
728 if (type == BLKIO_STAT_EMPTY_TIME)
729 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
730 blkg->stats.empty_time, cb, dev);
731 if (type == BLKIO_STAT_DEQUEUE)
732 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
733 blkg->stats.dequeue, cb, dev);
734#endif
735
736 for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
737 sub_type++) {
738 blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
739 cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
740 }
741 disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
742 blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
743 blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
744 cb->fill(cb, key_str, disk_total);
745 return disk_total;
746}
747
748static int blkio_policy_parse_and_set(char *buf,
749 struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid)
750{
751 struct gendisk *disk = NULL;
752 char *s[4], *p, *major_s = NULL, *minor_s = NULL;
753 unsigned long major, minor;
754 int i = 0, ret = -EINVAL;
755 int part;
756 dev_t dev;
757 u64 temp;
758
759 memset(s, 0, sizeof(s));
760
761 while ((p = strsep(&buf, " ")) != NULL) {
762 if (!*p)
763 continue;
764
765 s[i++] = p;
766
767 /* Prevent from inputing too many things */
768 if (i == 3)
769 break;
770 }
771
772 if (i != 2)
773 goto out;
774
775 p = strsep(&s[0], ":");
776 if (p != NULL)
777 major_s = p;
778 else
779 goto out;
780
781 minor_s = s[0];
782 if (!minor_s)
783 goto out;
784
785 if (strict_strtoul(major_s, 10, &major))
786 goto out;
787
788 if (strict_strtoul(minor_s, 10, &minor))
789 goto out;
790
791 dev = MKDEV(major, minor);
792
793 if (strict_strtoull(s[1], 10, &temp))
794 goto out;
795
796 /* For rule removal, do not check for device presence. */
797 if (temp) {
798 disk = get_gendisk(dev, &part);
799 if (!disk || part) {
800 ret = -ENODEV;
801 goto out;
802 }
803 }
804
805 newpn->dev = dev;
806
807 switch (plid) {
808 case BLKIO_POLICY_PROP:
809 if ((temp < BLKIO_WEIGHT_MIN && temp > 0) ||
810 temp > BLKIO_WEIGHT_MAX)
811 goto out;
812
813 newpn->plid = plid;
814 newpn->fileid = fileid;
815 newpn->val.weight = temp;
816 break;
817 case BLKIO_POLICY_THROTL:
818 switch(fileid) {
819 case BLKIO_THROTL_read_bps_device:
820 case BLKIO_THROTL_write_bps_device:
821 newpn->plid = plid;
822 newpn->fileid = fileid;
823 newpn->val.bps = temp;
824 break;
825 case BLKIO_THROTL_read_iops_device:
826 case BLKIO_THROTL_write_iops_device:
827 if (temp > THROTL_IOPS_MAX)
828 goto out;
829
830 newpn->plid = plid;
831 newpn->fileid = fileid;
832 newpn->val.iops = (unsigned int)temp;
833 break;
834 }
835 break;
836 default:
837 BUG();
838 }
839 ret = 0;
840out:
841 put_disk(disk);
842 return ret;
843}
844
845unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
846 dev_t dev)
847{
848 struct blkio_policy_node *pn;
849 unsigned long flags;
850 unsigned int weight;
851
852 spin_lock_irqsave(&blkcg->lock, flags);
853
854 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP,
855 BLKIO_PROP_weight_device);
856 if (pn)
857 weight = pn->val.weight;
858 else
859 weight = blkcg->weight;
860
861 spin_unlock_irqrestore(&blkcg->lock, flags);
862
863 return weight;
864}
865EXPORT_SYMBOL_GPL(blkcg_get_weight);
866
867uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
868{
869 struct blkio_policy_node *pn;
870 unsigned long flags;
871 uint64_t bps = -1;
872
873 spin_lock_irqsave(&blkcg->lock, flags);
874 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
875 BLKIO_THROTL_read_bps_device);
876 if (pn)
877 bps = pn->val.bps;
878 spin_unlock_irqrestore(&blkcg->lock, flags);
879
880 return bps;
881}
882
883uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev)
884{
885 struct blkio_policy_node *pn;
886 unsigned long flags;
887 uint64_t bps = -1;
888
889 spin_lock_irqsave(&blkcg->lock, flags);
890 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
891 BLKIO_THROTL_write_bps_device);
892 if (pn)
893 bps = pn->val.bps;
894 spin_unlock_irqrestore(&blkcg->lock, flags);
895
896 return bps;
897}
898
899unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev)
900{
901 struct blkio_policy_node *pn;
902 unsigned long flags;
903 unsigned int iops = -1;
904
905 spin_lock_irqsave(&blkcg->lock, flags);
906 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
907 BLKIO_THROTL_read_iops_device);
908 if (pn)
909 iops = pn->val.iops;
910 spin_unlock_irqrestore(&blkcg->lock, flags);
911
912 return iops;
913}
914
915unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev)
916{
917 struct blkio_policy_node *pn;
918 unsigned long flags;
919 unsigned int iops = -1;
920
921 spin_lock_irqsave(&blkcg->lock, flags);
922 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
923 BLKIO_THROTL_write_iops_device);
924 if (pn)
925 iops = pn->val.iops;
926 spin_unlock_irqrestore(&blkcg->lock, flags);
927
928 return iops;
929}
930 340
931/* Checks whether user asked for deleting a policy rule */ 341 if (blkcg_policy_enabled(blkg->q, pol) &&
932static bool blkio_delete_rule_command(struct blkio_policy_node *pn) 342 pol->pd_reset_stats_fn)
933{ 343 pol->pd_reset_stats_fn(blkg);
934 switch(pn->plid) {
935 case BLKIO_POLICY_PROP:
936 if (pn->val.weight == 0)
937 return 1;
938 break;
939 case BLKIO_POLICY_THROTL:
940 switch(pn->fileid) {
941 case BLKIO_THROTL_read_bps_device:
942 case BLKIO_THROTL_write_bps_device:
943 if (pn->val.bps == 0)
944 return 1;
945 break;
946 case BLKIO_THROTL_read_iops_device:
947 case BLKIO_THROTL_write_iops_device:
948 if (pn->val.iops == 0)
949 return 1;
950 } 344 }
951 break;
952 default:
953 BUG();
954 } 345 }
955 346
347 spin_unlock_irq(&blkcg->lock);
348 mutex_unlock(&blkcg_pol_mutex);
956 return 0; 349 return 0;
957} 350}
958 351
959static void blkio_update_policy_rule(struct blkio_policy_node *oldpn, 352static const char *blkg_dev_name(struct blkcg_gq *blkg)
960 struct blkio_policy_node *newpn)
961{
962 switch(oldpn->plid) {
963 case BLKIO_POLICY_PROP:
964 oldpn->val.weight = newpn->val.weight;
965 break;
966 case BLKIO_POLICY_THROTL:
967 switch(newpn->fileid) {
968 case BLKIO_THROTL_read_bps_device:
969 case BLKIO_THROTL_write_bps_device:
970 oldpn->val.bps = newpn->val.bps;
971 break;
972 case BLKIO_THROTL_read_iops_device:
973 case BLKIO_THROTL_write_iops_device:
974 oldpn->val.iops = newpn->val.iops;
975 }
976 break;
977 default:
978 BUG();
979 }
980}
981
982/*
983 * Some rules/values in blkg have changed. Propagate those to respective
984 * policies.
985 */
986static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg,
987 struct blkio_group *blkg, struct blkio_policy_node *pn)
988{ 353{
989 unsigned int weight, iops; 354 /* some drivers (floppy) instantiate a queue w/o disk registered */
990 u64 bps; 355 if (blkg->q->backing_dev_info.dev)
991 356 return dev_name(blkg->q->backing_dev_info.dev);
992 switch(pn->plid) { 357 return NULL;
993 case BLKIO_POLICY_PROP:
994 weight = pn->val.weight ? pn->val.weight :
995 blkcg->weight;
996 blkio_update_group_weight(blkg, weight);
997 break;
998 case BLKIO_POLICY_THROTL:
999 switch(pn->fileid) {
1000 case BLKIO_THROTL_read_bps_device:
1001 case BLKIO_THROTL_write_bps_device:
1002 bps = pn->val.bps ? pn->val.bps : (-1);
1003 blkio_update_group_bps(blkg, bps, pn->fileid);
1004 break;
1005 case BLKIO_THROTL_read_iops_device:
1006 case BLKIO_THROTL_write_iops_device:
1007 iops = pn->val.iops ? pn->val.iops : (-1);
1008 blkio_update_group_iops(blkg, iops, pn->fileid);
1009 break;
1010 }
1011 break;
1012 default:
1013 BUG();
1014 }
1015} 358}
1016 359
1017/* 360/**
1018 * A policy node rule has been updated. Propagate this update to all the 361 * blkcg_print_blkgs - helper for printing per-blkg data
1019 * block groups which might be affected by this update. 362 * @sf: seq_file to print to
363 * @blkcg: blkcg of interest
364 * @prfill: fill function to print out a blkg
365 * @pol: policy in question
366 * @data: data to be passed to @prfill
367 * @show_total: to print out sum of prfill return values or not
368 *
369 * This function invokes @prfill on each blkg of @blkcg if pd for the
370 * policy specified by @pol exists. @prfill is invoked with @sf, the
371 * policy data and @data. If @show_total is %true, the sum of the return
372 * values from @prfill is printed with "Total" label at the end.
373 *
374 * This is to be used to construct print functions for
375 * cftype->read_seq_string method.
1020 */ 376 */
1021static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg, 377void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
1022 struct blkio_policy_node *pn) 378 u64 (*prfill)(struct seq_file *,
379 struct blkg_policy_data *, int),
380 const struct blkcg_policy *pol, int data,
381 bool show_total)
1023{ 382{
1024 struct blkio_group *blkg; 383 struct blkcg_gq *blkg;
1025 struct hlist_node *n; 384 struct hlist_node *n;
385 u64 total = 0;
1026 386
1027 spin_lock(&blkio_list_lock);
1028 spin_lock_irq(&blkcg->lock); 387 spin_lock_irq(&blkcg->lock);
1029 388 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
1030 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { 389 if (blkcg_policy_enabled(blkg->q, pol))
1031 if (pn->dev != blkg->dev || pn->plid != blkg->plid) 390 total += prfill(sf, blkg->pd[pol->plid], data);
1032 continue;
1033 blkio_update_blkg_policy(blkcg, blkg, pn);
1034 }
1035
1036 spin_unlock_irq(&blkcg->lock); 391 spin_unlock_irq(&blkcg->lock);
1037 spin_unlock(&blkio_list_lock); 392
393 if (show_total)
394 seq_printf(sf, "Total %llu\n", (unsigned long long)total);
1038} 395}
396EXPORT_SYMBOL_GPL(blkcg_print_blkgs);
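
A minimal usage sketch, assuming a hypothetical policy "foo" (blkcg_policy_foo and foo_print_rwstat are illustrative names, not part of this patch): a cftype read handler hands a prfill callback such as blkg_prfill_rwstat, defined further below, to blkcg_print_blkgs() and lets it walk the cgroup's blkgs and emit the trailing Total line.

static int foo_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
			    struct seq_file *sf)
{
	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);

	/* cft->private is assumed to hold the blkg_rwstat offset in foo's pd */
	blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkcg_policy_foo,
			  cft->private, true);
	return 0;
}
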
1039 397
1040static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft, 398/**
1041 const char *buffer) 399 * __blkg_prfill_u64 - prfill helper for a single u64 value
400 * @sf: seq_file to print to
401 * @pd: policy private data of interest
402 * @v: value to print
403 *
 404 * Print @v to @sf for the device associated with @pd.
405 */
406u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
1042{ 407{
1043 int ret = 0; 408 const char *dname = blkg_dev_name(pd->blkg);
1044 char *buf;
1045 struct blkio_policy_node *newpn, *pn;
1046 struct blkio_cgroup *blkcg;
1047 int keep_newpn = 0;
1048 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1049 int fileid = BLKIOFILE_ATTR(cft->private);
1050
1051 buf = kstrdup(buffer, GFP_KERNEL);
1052 if (!buf)
1053 return -ENOMEM;
1054
1055 newpn = kzalloc(sizeof(*newpn), GFP_KERNEL);
1056 if (!newpn) {
1057 ret = -ENOMEM;
1058 goto free_buf;
1059 }
1060
1061 ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid);
1062 if (ret)
1063 goto free_newpn;
1064
1065 blkcg = cgroup_to_blkio_cgroup(cgrp);
1066
1067 spin_lock_irq(&blkcg->lock);
1068
1069 pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid);
1070 if (!pn) {
1071 if (!blkio_delete_rule_command(newpn)) {
1072 blkio_policy_insert_node(blkcg, newpn);
1073 keep_newpn = 1;
1074 }
1075 spin_unlock_irq(&blkcg->lock);
1076 goto update_io_group;
1077 }
1078
1079 if (blkio_delete_rule_command(newpn)) {
1080 blkio_policy_delete_node(pn);
1081 kfree(pn);
1082 spin_unlock_irq(&blkcg->lock);
1083 goto update_io_group;
1084 }
1085 spin_unlock_irq(&blkcg->lock);
1086 409
1087 blkio_update_policy_rule(pn, newpn); 410 if (!dname)
411 return 0;
1088 412
1089update_io_group: 413 seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
1090 blkio_update_policy_node_blkg(blkcg, newpn); 414 return v;
1091
1092free_newpn:
1093 if (!keep_newpn)
1094 kfree(newpn);
1095free_buf:
1096 kfree(buf);
1097 return ret;
1098} 415}
416EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
1099 417
1100static void 418/**
1101blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn) 419 * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
1102{ 420 * @sf: seq_file to print to
1103 switch(pn->plid) { 421 * @pd: policy private data of interest
1104 case BLKIO_POLICY_PROP: 422 * @rwstat: rwstat to print
1105 if (pn->fileid == BLKIO_PROP_weight_device) 423 *
1106 seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), 424 * Print @rwstat to @sf for the device assocaited with @pd.
 1107 MINOR(pn->dev), pn->val.weight); 424 * Print @rwstat to @sf for the device associated with @pd.
1108 break; 426u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
1109 case BLKIO_POLICY_THROTL: 427 const struct blkg_rwstat *rwstat)
1110 switch(pn->fileid) { 428{
1111 case BLKIO_THROTL_read_bps_device: 429 static const char *rwstr[] = {
1112 case BLKIO_THROTL_write_bps_device: 430 [BLKG_RWSTAT_READ] = "Read",
1113 seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev), 431 [BLKG_RWSTAT_WRITE] = "Write",
1114 MINOR(pn->dev), pn->val.bps); 432 [BLKG_RWSTAT_SYNC] = "Sync",
1115 break; 433 [BLKG_RWSTAT_ASYNC] = "Async",
1116 case BLKIO_THROTL_read_iops_device: 434 };
1117 case BLKIO_THROTL_write_iops_device: 435 const char *dname = blkg_dev_name(pd->blkg);
1118 seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), 436 u64 v;
1119 MINOR(pn->dev), pn->val.iops); 437 int i;
1120 break;
1121 }
1122 break;
1123 default:
1124 BUG();
1125 }
1126}
1127 438
1128/* cgroup files which read their data from policy nodes end up here */ 439 if (!dname)
1129static void blkio_read_policy_node_files(struct cftype *cft, 440 return 0;
1130 struct blkio_cgroup *blkcg, struct seq_file *m)
1131{
1132 struct blkio_policy_node *pn;
1133
1134 if (!list_empty(&blkcg->policy_list)) {
1135 spin_lock_irq(&blkcg->lock);
1136 list_for_each_entry(pn, &blkcg->policy_list, node) {
1137 if (!pn_matches_cftype(cft, pn))
1138 continue;
1139 blkio_print_policy_node(m, pn);
1140 }
1141 spin_unlock_irq(&blkcg->lock);
1142 }
1143}
1144 441
1145static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft, 442 for (i = 0; i < BLKG_RWSTAT_NR; i++)
1146 struct seq_file *m) 443 seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
1147{ 444 (unsigned long long)rwstat->cnt[i]);
1148 struct blkio_cgroup *blkcg;
1149 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1150 int name = BLKIOFILE_ATTR(cft->private);
1151
1152 blkcg = cgroup_to_blkio_cgroup(cgrp);
1153
1154 switch(plid) {
1155 case BLKIO_POLICY_PROP:
1156 switch(name) {
1157 case BLKIO_PROP_weight_device:
1158 blkio_read_policy_node_files(cft, blkcg, m);
1159 return 0;
1160 default:
1161 BUG();
1162 }
1163 break;
1164 case BLKIO_POLICY_THROTL:
1165 switch(name){
1166 case BLKIO_THROTL_read_bps_device:
1167 case BLKIO_THROTL_write_bps_device:
1168 case BLKIO_THROTL_read_iops_device:
1169 case BLKIO_THROTL_write_iops_device:
1170 blkio_read_policy_node_files(cft, blkcg, m);
1171 return 0;
1172 default:
1173 BUG();
1174 }
1175 break;
1176 default:
1177 BUG();
1178 }
1179 445
1180 return 0; 446 v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE];
447 seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
448 return v;
1181} 449}
1182 450
1183static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg, 451/**
1184 struct cftype *cft, struct cgroup_map_cb *cb, 452 * blkg_prfill_stat - prfill callback for blkg_stat
1185 enum stat_type type, bool show_total, bool pcpu) 453 * @sf: seq_file to print to
454 * @pd: policy private data of interest
455 * @off: offset to the blkg_stat in @pd
456 *
457 * prfill callback for printing a blkg_stat.
458 */
459u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off)
1186{ 460{
1187 struct blkio_group *blkg; 461 return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off));
1188 struct hlist_node *n;
1189 uint64_t cgroup_total = 0;
1190
1191 rcu_read_lock();
1192 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
1193 if (blkg->dev) {
1194 if (!cftype_blkg_same_policy(cft, blkg))
1195 continue;
1196 if (pcpu)
1197 cgroup_total += blkio_get_stat_cpu(blkg, cb,
1198 blkg->dev, type);
1199 else {
1200 spin_lock_irq(&blkg->stats_lock);
1201 cgroup_total += blkio_get_stat(blkg, cb,
1202 blkg->dev, type);
1203 spin_unlock_irq(&blkg->stats_lock);
1204 }
1205 }
1206 }
1207 if (show_total)
1208 cb->fill(cb, "Total", cgroup_total);
1209 rcu_read_unlock();
1210 return 0;
1211} 462}
463EXPORT_SYMBOL_GPL(blkg_prfill_stat);
1212 464
1213/* All map kind of cgroup file get serviced by this function */ 465/**
1214static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft, 466 * blkg_prfill_rwstat - prfill callback for blkg_rwstat
1215 struct cgroup_map_cb *cb) 467 * @sf: seq_file to print to
468 * @pd: policy private data of interest
469 * @off: offset to the blkg_rwstat in @pd
470 *
471 * prfill callback for printing a blkg_rwstat.
472 */
473u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
474 int off)
1216{ 475{
1217 struct blkio_cgroup *blkcg; 476 struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off);
1218 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1219 int name = BLKIOFILE_ATTR(cft->private);
1220
1221 blkcg = cgroup_to_blkio_cgroup(cgrp);
1222
1223 switch(plid) {
1224 case BLKIO_POLICY_PROP:
1225 switch(name) {
1226 case BLKIO_PROP_time:
1227 return blkio_read_blkg_stats(blkcg, cft, cb,
1228 BLKIO_STAT_TIME, 0, 0);
1229 case BLKIO_PROP_sectors:
1230 return blkio_read_blkg_stats(blkcg, cft, cb,
1231 BLKIO_STAT_CPU_SECTORS, 0, 1);
1232 case BLKIO_PROP_io_service_bytes:
1233 return blkio_read_blkg_stats(blkcg, cft, cb,
1234 BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
1235 case BLKIO_PROP_io_serviced:
1236 return blkio_read_blkg_stats(blkcg, cft, cb,
1237 BLKIO_STAT_CPU_SERVICED, 1, 1);
1238 case BLKIO_PROP_io_service_time:
1239 return blkio_read_blkg_stats(blkcg, cft, cb,
1240 BLKIO_STAT_SERVICE_TIME, 1, 0);
1241 case BLKIO_PROP_io_wait_time:
1242 return blkio_read_blkg_stats(blkcg, cft, cb,
1243 BLKIO_STAT_WAIT_TIME, 1, 0);
1244 case BLKIO_PROP_io_merged:
1245 return blkio_read_blkg_stats(blkcg, cft, cb,
1246 BLKIO_STAT_CPU_MERGED, 1, 1);
1247 case BLKIO_PROP_io_queued:
1248 return blkio_read_blkg_stats(blkcg, cft, cb,
1249 BLKIO_STAT_QUEUED, 1, 0);
1250#ifdef CONFIG_DEBUG_BLK_CGROUP
1251 case BLKIO_PROP_unaccounted_time:
1252 return blkio_read_blkg_stats(blkcg, cft, cb,
1253 BLKIO_STAT_UNACCOUNTED_TIME, 0, 0);
1254 case BLKIO_PROP_dequeue:
1255 return blkio_read_blkg_stats(blkcg, cft, cb,
1256 BLKIO_STAT_DEQUEUE, 0, 0);
1257 case BLKIO_PROP_avg_queue_size:
1258 return blkio_read_blkg_stats(blkcg, cft, cb,
1259 BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0);
1260 case BLKIO_PROP_group_wait_time:
1261 return blkio_read_blkg_stats(blkcg, cft, cb,
1262 BLKIO_STAT_GROUP_WAIT_TIME, 0, 0);
1263 case BLKIO_PROP_idle_time:
1264 return blkio_read_blkg_stats(blkcg, cft, cb,
1265 BLKIO_STAT_IDLE_TIME, 0, 0);
1266 case BLKIO_PROP_empty_time:
1267 return blkio_read_blkg_stats(blkcg, cft, cb,
1268 BLKIO_STAT_EMPTY_TIME, 0, 0);
1269#endif
1270 default:
1271 BUG();
1272 }
1273 break;
1274 case BLKIO_POLICY_THROTL:
1275 switch(name){
1276 case BLKIO_THROTL_io_service_bytes:
1277 return blkio_read_blkg_stats(blkcg, cft, cb,
1278 BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
1279 case BLKIO_THROTL_io_serviced:
1280 return blkio_read_blkg_stats(blkcg, cft, cb,
1281 BLKIO_STAT_CPU_SERVICED, 1, 1);
1282 default:
1283 BUG();
1284 }
1285 break;
1286 default:
1287 BUG();
1288 }
1289 477
1290 return 0; 478 return __blkg_prfill_rwstat(sf, pd, &rwstat);
1291} 479}
480EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
1292 481
1293static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val) 482/**
483 * blkg_conf_prep - parse and prepare for per-blkg config update
484 * @blkcg: target block cgroup
485 * @pol: target policy
486 * @input: input string
487 * @ctx: blkg_conf_ctx to be filled
488 *
489 * Parse per-blkg config update from @input and initialize @ctx with the
490 * result. @ctx->blkg points to the blkg to be updated and @ctx->v the new
491 * value. This function returns with RCU read lock and queue lock held and
492 * must be paired with blkg_conf_finish().
493 */
494int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
495 const char *input, struct blkg_conf_ctx *ctx)
496 __acquires(rcu) __acquires(disk->queue->queue_lock)
1294{ 497{
1295 struct blkio_group *blkg; 498 struct gendisk *disk;
1296 struct hlist_node *n; 499 struct blkcg_gq *blkg;
1297 struct blkio_policy_node *pn; 500 unsigned int major, minor;
501 unsigned long long v;
502 int part, ret;
1298 503
1299 if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) 504 if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3)
1300 return -EINVAL; 505 return -EINVAL;
1301 506
1302 spin_lock(&blkio_list_lock); 507 disk = get_gendisk(MKDEV(major, minor), &part);
1303 spin_lock_irq(&blkcg->lock); 508 if (!disk || part)
1304 blkcg->weight = (unsigned int)val; 509 return -EINVAL;
1305
1306 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
1307 pn = blkio_policy_search_node(blkcg, blkg->dev,
1308 BLKIO_POLICY_PROP, BLKIO_PROP_weight_device);
1309 if (pn)
1310 continue;
1311
1312 blkio_update_group_weight(blkg, blkcg->weight);
1313 }
1314 spin_unlock_irq(&blkcg->lock);
1315 spin_unlock(&blkio_list_lock);
1316 return 0;
1317}
1318 510
1319static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) { 511 rcu_read_lock();
1320 struct blkio_cgroup *blkcg; 512 spin_lock_irq(disk->queue->queue_lock);
1321 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1322 int name = BLKIOFILE_ATTR(cft->private);
1323 513
1324 blkcg = cgroup_to_blkio_cgroup(cgrp); 514 if (blkcg_policy_enabled(disk->queue, pol))
515 blkg = blkg_lookup_create(blkcg, disk->queue);
516 else
517 blkg = ERR_PTR(-EINVAL);
1325 518
1326 switch(plid) { 519 if (IS_ERR(blkg)) {
1327 case BLKIO_POLICY_PROP: 520 ret = PTR_ERR(blkg);
1328 switch(name) { 521 rcu_read_unlock();
1329 case BLKIO_PROP_weight: 522 spin_unlock_irq(disk->queue->queue_lock);
1330 return (u64)blkcg->weight; 523 put_disk(disk);
524 /*
525 * If queue was bypassing, we should retry. Do so after a
526 * short msleep(). It isn't strictly necessary but queue
527 * can be bypassing for some time and it's always nice to
528 * avoid busy looping.
529 */
530 if (ret == -EBUSY) {
531 msleep(10);
532 ret = restart_syscall();
1331 } 533 }
1332 break; 534 return ret;
1333 default:
1334 BUG();
1335 } 535 }
536
537 ctx->disk = disk;
538 ctx->blkg = blkg;
539 ctx->v = v;
1336 return 0; 540 return 0;
1337} 541}
542EXPORT_SYMBOL_GPL(blkg_conf_prep);
1338 543
1339static int 544/**
1340blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) 545 * blkg_conf_finish - finish up per-blkg config update
 546 * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
547 *
548 * Finish up after per-blkg config update. This function must be paired
549 * with blkg_conf_prep().
550 */
551void blkg_conf_finish(struct blkg_conf_ctx *ctx)
552 __releases(ctx->disk->queue->queue_lock) __releases(rcu)
1341{ 553{
1342 struct blkio_cgroup *blkcg; 554 spin_unlock_irq(ctx->disk->queue->queue_lock);
1343 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); 555 rcu_read_unlock();
1344 int name = BLKIOFILE_ATTR(cft->private); 556 put_disk(ctx->disk);
1345
1346 blkcg = cgroup_to_blkio_cgroup(cgrp);
1347
1348 switch(plid) {
1349 case BLKIO_POLICY_PROP:
1350 switch(name) {
1351 case BLKIO_PROP_weight:
1352 return blkio_weight_write(blkcg, val);
1353 }
1354 break;
1355 default:
1356 BUG();
1357 }
1358
1359 return 0;
1360} 557}
558EXPORT_SYMBOL_GPL(blkg_conf_finish);
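
A sketch of the intended pairing, again for a hypothetical policy "foo" (blkcg_policy_foo, foo_pd() and its limit field are illustrative): a cftype write handler parses "MAJ:MIN VAL" via blkg_conf_prep(), updates its per-blkg data while the queue lock is held, then drops the locks with blkg_conf_finish().

static int foo_set_limit(struct cgroup *cgrp, struct cftype *cft,
			 const char *buf)
{
	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
	struct blkg_conf_ctx ctx;
	int ret;

	/* on success, returns with rcu_read_lock() and queue_lock held */
	ret = blkg_conf_prep(blkcg, &blkcg_policy_foo, buf, &ctx);
	if (ret)
		return ret;

	/* ctx.blkg and ctx.v are valid here; foo_pd() maps blkg to foo's pd */
	foo_pd(ctx.blkg)->limit = ctx.v;

	blkg_conf_finish(&ctx);	/* drops the locks taken by blkg_conf_prep() */
	return 0;
}
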
1361 559
1362struct cftype blkio_files[] = { 560struct cftype blkcg_files[] = {
1363 {
1364 .name = "weight_device",
1365 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1366 BLKIO_PROP_weight_device),
1367 .read_seq_string = blkiocg_file_read,
1368 .write_string = blkiocg_file_write,
1369 .max_write_len = 256,
1370 },
1371 {
1372 .name = "weight",
1373 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1374 BLKIO_PROP_weight),
1375 .read_u64 = blkiocg_file_read_u64,
1376 .write_u64 = blkiocg_file_write_u64,
1377 },
1378 {
1379 .name = "time",
1380 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1381 BLKIO_PROP_time),
1382 .read_map = blkiocg_file_read_map,
1383 },
1384 {
1385 .name = "sectors",
1386 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1387 BLKIO_PROP_sectors),
1388 .read_map = blkiocg_file_read_map,
1389 },
1390 {
1391 .name = "io_service_bytes",
1392 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1393 BLKIO_PROP_io_service_bytes),
1394 .read_map = blkiocg_file_read_map,
1395 },
1396 {
1397 .name = "io_serviced",
1398 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1399 BLKIO_PROP_io_serviced),
1400 .read_map = blkiocg_file_read_map,
1401 },
1402 {
1403 .name = "io_service_time",
1404 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1405 BLKIO_PROP_io_service_time),
1406 .read_map = blkiocg_file_read_map,
1407 },
1408 {
1409 .name = "io_wait_time",
1410 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1411 BLKIO_PROP_io_wait_time),
1412 .read_map = blkiocg_file_read_map,
1413 },
1414 {
1415 .name = "io_merged",
1416 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1417 BLKIO_PROP_io_merged),
1418 .read_map = blkiocg_file_read_map,
1419 },
1420 {
1421 .name = "io_queued",
1422 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1423 BLKIO_PROP_io_queued),
1424 .read_map = blkiocg_file_read_map,
1425 },
1426 { 561 {
1427 .name = "reset_stats", 562 .name = "reset_stats",
1428 .write_u64 = blkiocg_reset_stats, 563 .write_u64 = blkcg_reset_stats,
1429 },
1430#ifdef CONFIG_BLK_DEV_THROTTLING
1431 {
1432 .name = "throttle.read_bps_device",
1433 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1434 BLKIO_THROTL_read_bps_device),
1435 .read_seq_string = blkiocg_file_read,
1436 .write_string = blkiocg_file_write,
1437 .max_write_len = 256,
1438 },
1439
1440 {
1441 .name = "throttle.write_bps_device",
1442 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1443 BLKIO_THROTL_write_bps_device),
1444 .read_seq_string = blkiocg_file_read,
1445 .write_string = blkiocg_file_write,
1446 .max_write_len = 256,
1447 },
1448
1449 {
1450 .name = "throttle.read_iops_device",
1451 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1452 BLKIO_THROTL_read_iops_device),
1453 .read_seq_string = blkiocg_file_read,
1454 .write_string = blkiocg_file_write,
1455 .max_write_len = 256,
1456 },
1457
1458 {
1459 .name = "throttle.write_iops_device",
1460 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1461 BLKIO_THROTL_write_iops_device),
1462 .read_seq_string = blkiocg_file_read,
1463 .write_string = blkiocg_file_write,
1464 .max_write_len = 256,
1465 },
1466 {
1467 .name = "throttle.io_service_bytes",
1468 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1469 BLKIO_THROTL_io_service_bytes),
1470 .read_map = blkiocg_file_read_map,
1471 },
1472 {
1473 .name = "throttle.io_serviced",
1474 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1475 BLKIO_THROTL_io_serviced),
1476 .read_map = blkiocg_file_read_map,
1477 },
1478#endif /* CONFIG_BLK_DEV_THROTTLING */
1479
1480#ifdef CONFIG_DEBUG_BLK_CGROUP
1481 {
1482 .name = "avg_queue_size",
1483 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1484 BLKIO_PROP_avg_queue_size),
1485 .read_map = blkiocg_file_read_map,
1486 }, 564 },
1487 {
1488 .name = "group_wait_time",
1489 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1490 BLKIO_PROP_group_wait_time),
1491 .read_map = blkiocg_file_read_map,
1492 },
1493 {
1494 .name = "idle_time",
1495 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1496 BLKIO_PROP_idle_time),
1497 .read_map = blkiocg_file_read_map,
1498 },
1499 {
1500 .name = "empty_time",
1501 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1502 BLKIO_PROP_empty_time),
1503 .read_map = blkiocg_file_read_map,
1504 },
1505 {
1506 .name = "dequeue",
1507 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1508 BLKIO_PROP_dequeue),
1509 .read_map = blkiocg_file_read_map,
1510 },
1511 {
1512 .name = "unaccounted_time",
1513 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1514 BLKIO_PROP_unaccounted_time),
1515 .read_map = blkiocg_file_read_map,
1516 },
1517#endif
1518 { } /* terminate */ 565 { } /* terminate */
1519}; 566};
1520 567
1521static void blkiocg_destroy(struct cgroup *cgroup) 568/**
569 * blkcg_pre_destroy - cgroup pre_destroy callback
570 * @cgroup: cgroup of interest
571 *
572 * This function is called when @cgroup is about to go away and is responsible
573 * for shooting down all blkgs associated with @cgroup. blkgs should be
574 * removed while holding both q and blkcg locks. As blkcg lock is nested
575 * inside q lock, this function performs reverse double lock dancing.
576 *
577 * This is the blkcg counterpart of ioc_release_fn().
578 */
579static int blkcg_pre_destroy(struct cgroup *cgroup)
1522{ 580{
1523 struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); 581 struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
1524 unsigned long flags;
1525 struct blkio_group *blkg;
1526 void *key;
1527 struct blkio_policy_type *blkiop;
1528 struct blkio_policy_node *pn, *pntmp;
1529 582
1530 rcu_read_lock(); 583 spin_lock_irq(&blkcg->lock);
1531 do {
1532 spin_lock_irqsave(&blkcg->lock, flags);
1533 584
1534 if (hlist_empty(&blkcg->blkg_list)) { 585 while (!hlist_empty(&blkcg->blkg_list)) {
1535 spin_unlock_irqrestore(&blkcg->lock, flags); 586 struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
1536 break; 587 struct blkcg_gq, blkcg_node);
588 struct request_queue *q = blkg->q;
589
590 if (spin_trylock(q->queue_lock)) {
591 blkg_destroy(blkg);
592 spin_unlock(q->queue_lock);
593 } else {
594 spin_unlock_irq(&blkcg->lock);
595 cpu_relax();
596 spin_lock_irq(&blkcg->lock);
1537 } 597 }
598 }
1538 599
1539 blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group, 600 spin_unlock_irq(&blkcg->lock);
1540 blkcg_node); 601 return 0;
1541 key = rcu_dereference(blkg->key); 602}
1542 __blkiocg_del_blkio_group(blkg);
1543
1544 spin_unlock_irqrestore(&blkcg->lock, flags);
1545
1546 /*
1547 * This blkio_group is being unlinked as associated cgroup is
1548 * going away. Let all the IO controlling policies know about
1549 * this event.
1550 */
1551 spin_lock(&blkio_list_lock);
1552 list_for_each_entry(blkiop, &blkio_list, list) {
1553 if (blkiop->plid != blkg->plid)
1554 continue;
1555 blkiop->ops.blkio_unlink_group_fn(key, blkg);
1556 }
1557 spin_unlock(&blkio_list_lock);
1558 } while (1);
1559 603
1560 list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) { 604static void blkcg_destroy(struct cgroup *cgroup)
1561 blkio_policy_delete_node(pn); 605{
1562 kfree(pn); 606 struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
1563 }
1564 607
1565 free_css_id(&blkio_subsys, &blkcg->css); 608 if (blkcg != &blkcg_root)
1566 rcu_read_unlock();
1567 if (blkcg != &blkio_root_cgroup)
1568 kfree(blkcg); 609 kfree(blkcg);
1569} 610}
1570 611
1571static struct cgroup_subsys_state *blkiocg_create(struct cgroup *cgroup) 612static struct cgroup_subsys_state *blkcg_create(struct cgroup *cgroup)
1572{ 613{
1573 struct blkio_cgroup *blkcg; 614 static atomic64_t id_seq = ATOMIC64_INIT(0);
615 struct blkcg *blkcg;
1574 struct cgroup *parent = cgroup->parent; 616 struct cgroup *parent = cgroup->parent;
1575 617
1576 if (!parent) { 618 if (!parent) {
1577 blkcg = &blkio_root_cgroup; 619 blkcg = &blkcg_root;
1578 goto done; 620 goto done;
1579 } 621 }
1580 622
@@ -1582,22 +624,68 @@ static struct cgroup_subsys_state *blkiocg_create(struct cgroup *cgroup)
1582 if (!blkcg) 624 if (!blkcg)
1583 return ERR_PTR(-ENOMEM); 625 return ERR_PTR(-ENOMEM);
1584 626
1585 blkcg->weight = BLKIO_WEIGHT_DEFAULT; 627 blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT;
628 blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
1586done: 629done:
1587 spin_lock_init(&blkcg->lock); 630 spin_lock_init(&blkcg->lock);
631 INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
1588 INIT_HLIST_HEAD(&blkcg->blkg_list); 632 INIT_HLIST_HEAD(&blkcg->blkg_list);
1589 633
1590 INIT_LIST_HEAD(&blkcg->policy_list);
1591 return &blkcg->css; 634 return &blkcg->css;
1592} 635}
1593 636
637/**
638 * blkcg_init_queue - initialize blkcg part of request queue
639 * @q: request_queue to initialize
640 *
641 * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
642 * part of new request_queue @q.
643 *
644 * RETURNS:
645 * 0 on success, -errno on failure.
646 */
647int blkcg_init_queue(struct request_queue *q)
648{
649 might_sleep();
650
651 return blk_throtl_init(q);
652}
653
654/**
655 * blkcg_drain_queue - drain blkcg part of request_queue
656 * @q: request_queue to drain
657 *
658 * Called from blk_drain_queue(). Responsible for draining blkcg part.
659 */
660void blkcg_drain_queue(struct request_queue *q)
661{
662 lockdep_assert_held(q->queue_lock);
663
664 blk_throtl_drain(q);
665}
666
667/**
668 * blkcg_exit_queue - exit and release blkcg part of request_queue
669 * @q: request_queue being released
670 *
671 * Called from blk_release_queue(). Responsible for exiting blkcg part.
672 */
673void blkcg_exit_queue(struct request_queue *q)
674{
675 spin_lock_irq(q->queue_lock);
676 blkg_destroy_all(q);
677 spin_unlock_irq(q->queue_lock);
678
679 blk_throtl_exit(q);
680}
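
The three hooks above encode different locking expectations: init may sleep, drain runs with the queue lock already held, and exit takes the lock itself. A rough sketch of the expected call sites follows; the wrapper names are made up, and the real wiring is in the blk-core.c hunks further down.

static int example_alloc_queue(struct request_queue *q)
{
	return blkcg_init_queue(q);	/* from blk_alloc_queue_node(), may sleep */
}

static void example_drain_queue(struct request_queue *q)
{
	spin_lock_irq(q->queue_lock);
	blkcg_drain_queue(q);		/* from blk_drain_queue(), lock held */
	spin_unlock_irq(q->queue_lock);
}

static void example_release_queue(struct request_queue *q)
{
	blkcg_exit_queue(q);		/* from blk_release_queue(), takes the lock */
}
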
681
1594/* 682/*
1595 * We cannot support shared io contexts, as we have no mean to support 683 * We cannot support shared io contexts, as we have no mean to support
1596 * two tasks with the same ioc in two different groups without major rework 684 * two tasks with the same ioc in two different groups without major rework
1597 * of the main cic data structures. For now we allow a task to change 685 * of the main cic data structures. For now we allow a task to change
1598 * its cgroup only if it's the only owner of its ioc. 686 * its cgroup only if it's the only owner of its ioc.
1599 */ 687 */
1600static int blkiocg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 688static int blkcg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1601{ 689{
1602 struct task_struct *task; 690 struct task_struct *task;
1603 struct io_context *ioc; 691 struct io_context *ioc;
@@ -1616,63 +704,213 @@ static int blkiocg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1616 return ret; 704 return ret;
1617} 705}
1618 706
1619static void blkiocg_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1620{
1621 struct task_struct *task;
1622 struct io_context *ioc;
1623
1624 cgroup_taskset_for_each(task, cgrp, tset) {
1625 /* we don't lose anything even if ioc allocation fails */
1626 ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
1627 if (ioc) {
1628 ioc_cgroup_changed(ioc);
1629 put_io_context(ioc);
1630 }
1631 }
1632}
1633
1634struct cgroup_subsys blkio_subsys = { 707struct cgroup_subsys blkio_subsys = {
1635 .name = "blkio", 708 .name = "blkio",
1636 .create = blkiocg_create, 709 .create = blkcg_create,
1637 .can_attach = blkiocg_can_attach, 710 .can_attach = blkcg_can_attach,
1638 .attach = blkiocg_attach, 711 .pre_destroy = blkcg_pre_destroy,
1639 .destroy = blkiocg_destroy, 712 .destroy = blkcg_destroy,
1640#ifdef CONFIG_BLK_CGROUP
1641 /* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */
1642 .subsys_id = blkio_subsys_id, 713 .subsys_id = blkio_subsys_id,
1643#endif 714 .base_cftypes = blkcg_files,
1644 .base_cftypes = blkio_files,
1645 .use_id = 1,
1646 .module = THIS_MODULE, 715 .module = THIS_MODULE,
1647}; 716};
1648EXPORT_SYMBOL_GPL(blkio_subsys); 717EXPORT_SYMBOL_GPL(blkio_subsys);
1649 718
1650void blkio_policy_register(struct blkio_policy_type *blkiop) 719/**
720 * blkcg_activate_policy - activate a blkcg policy on a request_queue
721 * @q: request_queue of interest
722 * @pol: blkcg policy to activate
723 *
724 * Activate @pol on @q. Requires %GFP_KERNEL context. @q goes through
725 * bypass mode to populate its blkgs with policy_data for @pol.
726 *
727 * Activation happens with @q bypassed, so nobody would be accessing blkgs
728 * from IO path. Update of each blkg is protected by both queue and blkcg
729 * locks so that holding either lock and testing blkcg_policy_enabled() is
730 * always enough for dereferencing policy data.
731 *
732 * The caller is responsible for synchronizing [de]activations and policy
733 * [un]registrations. Returns 0 on success, -errno on failure.
734 */
735int blkcg_activate_policy(struct request_queue *q,
736 const struct blkcg_policy *pol)
1651{ 737{
1652 spin_lock(&blkio_list_lock); 738 LIST_HEAD(pds);
1653 list_add_tail(&blkiop->list, &blkio_list); 739 struct blkcg_gq *blkg;
1654 spin_unlock(&blkio_list_lock); 740 struct blkg_policy_data *pd, *n;
741 int cnt = 0, ret;
742
743 if (blkcg_policy_enabled(q, pol))
744 return 0;
745
746 blk_queue_bypass_start(q);
747
748 /* make sure the root blkg exists and count the existing blkgs */
749 spin_lock_irq(q->queue_lock);
750
751 rcu_read_lock();
752 blkg = __blkg_lookup_create(&blkcg_root, q);
753 rcu_read_unlock();
754
755 if (IS_ERR(blkg)) {
756 ret = PTR_ERR(blkg);
757 goto out_unlock;
758 }
759 q->root_blkg = blkg;
760
761 list_for_each_entry(blkg, &q->blkg_list, q_node)
762 cnt++;
763
764 spin_unlock_irq(q->queue_lock);
765
766 /* allocate policy_data for all existing blkgs */
767 while (cnt--) {
768 pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
769 if (!pd) {
770 ret = -ENOMEM;
771 goto out_free;
772 }
773 list_add_tail(&pd->alloc_node, &pds);
774 }
775
776 /*
777 * Install the allocated pds. With @q bypassing, no new blkg
778 * should have been created while the queue lock was dropped.
779 */
780 spin_lock_irq(q->queue_lock);
781
782 list_for_each_entry(blkg, &q->blkg_list, q_node) {
783 if (WARN_ON(list_empty(&pds))) {
784 /* umm... this shouldn't happen, just abort */
785 ret = -ENOMEM;
786 goto out_unlock;
787 }
788 pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node);
789 list_del_init(&pd->alloc_node);
790
791 /* grab blkcg lock too while installing @pd on @blkg */
792 spin_lock(&blkg->blkcg->lock);
793
794 blkg->pd[pol->plid] = pd;
795 pd->blkg = blkg;
796 pol->pd_init_fn(blkg);
797
798 spin_unlock(&blkg->blkcg->lock);
799 }
800
801 __set_bit(pol->plid, q->blkcg_pols);
802 ret = 0;
803out_unlock:
804 spin_unlock_irq(q->queue_lock);
805out_free:
806 blk_queue_bypass_end(q);
807 list_for_each_entry_safe(pd, n, &pds, alloc_node)
808 kfree(pd);
809 return ret;
1655} 810}
1656EXPORT_SYMBOL_GPL(blkio_policy_register); 811EXPORT_SYMBOL_GPL(blkcg_activate_policy);
1657 812
1658void blkio_policy_unregister(struct blkio_policy_type *blkiop) 813/**
814 * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue
815 * @q: request_queue of interest
816 * @pol: blkcg policy to deactivate
817 *
818 * Deactivate @pol on @q. Follows the same synchronization rules as
819 * blkcg_activate_policy().
820 */
821void blkcg_deactivate_policy(struct request_queue *q,
822 const struct blkcg_policy *pol)
1659{ 823{
1660 spin_lock(&blkio_list_lock); 824 struct blkcg_gq *blkg;
1661 list_del_init(&blkiop->list); 825
1662 spin_unlock(&blkio_list_lock); 826 if (!blkcg_policy_enabled(q, pol))
827 return;
828
829 blk_queue_bypass_start(q);
830 spin_lock_irq(q->queue_lock);
831
832 __clear_bit(pol->plid, q->blkcg_pols);
833
834 /* if no policy is left, no need for blkgs - shoot them down */
835 if (bitmap_empty(q->blkcg_pols, BLKCG_MAX_POLS))
836 blkg_destroy_all(q);
837
838 list_for_each_entry(blkg, &q->blkg_list, q_node) {
839 /* grab blkcg lock too while removing @pd from @blkg */
840 spin_lock(&blkg->blkcg->lock);
841
842 if (pol->pd_exit_fn)
843 pol->pd_exit_fn(blkg);
844
845 kfree(blkg->pd[pol->plid]);
846 blkg->pd[pol->plid] = NULL;
847
848 spin_unlock(&blkg->blkcg->lock);
849 }
850
851 spin_unlock_irq(q->queue_lock);
852 blk_queue_bypass_end(q);
1663} 853}
1664EXPORT_SYMBOL_GPL(blkio_policy_unregister); 854EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
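
Taken together, activation and deactivation give a policy a simple bracket around its per-queue lifetime. A hedged sketch of how a policy's queue init/exit path might use the pair; the example names are hypothetical, blk-throttle and cfq are the in-tree users.

#include <linux/blkdev.h>
#include "blk-cgroup.h"

static struct blkcg_policy blkcg_policy_example;	/* hypothetical policy */

static int example_init_queue(struct request_queue *q)
{
	int ret;

	/* allocate pd for every existing blkg and set the enable bit */
	ret = blkcg_activate_policy(q, &blkcg_policy_example);
	if (ret)
		return ret;

	/* ... policy specific per-queue setup ... */
	return 0;
}

static void example_exit_queue(struct request_queue *q)
{
	/* run pd_exit_fn(), free pd and clear the enable bit */
	blkcg_deactivate_policy(q, &blkcg_policy_example);
}
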
1665 855
1666static int __init init_cgroup_blkio(void) 856/**
857 * blkcg_policy_register - register a blkcg policy
858 * @pol: blkcg policy to register
859 *
860 * Register @pol with blkcg core. Might sleep and @pol may be modified on
861 * successful registration. Returns 0 on success and -errno on failure.
862 */
863int blkcg_policy_register(struct blkcg_policy *pol)
1667{ 864{
1668 return cgroup_load_subsys(&blkio_subsys); 865 int i, ret;
866
867 if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data)))
868 return -EINVAL;
869
870 mutex_lock(&blkcg_pol_mutex);
871
872 /* find an empty slot */
873 ret = -ENOSPC;
874 for (i = 0; i < BLKCG_MAX_POLS; i++)
875 if (!blkcg_policy[i])
876 break;
877 if (i >= BLKCG_MAX_POLS)
878 goto out_unlock;
879
880 /* register and update blkgs */
881 pol->plid = i;
882 blkcg_policy[i] = pol;
883
884 /* everything is in place, add intf files for the new policy */
885 if (pol->cftypes)
886 WARN_ON(cgroup_add_cftypes(&blkio_subsys, pol->cftypes));
887 ret = 0;
888out_unlock:
889 mutex_unlock(&blkcg_pol_mutex);
890 return ret;
1669} 891}
892EXPORT_SYMBOL_GPL(blkcg_policy_register);
1670 893
1671static void __exit exit_cgroup_blkio(void) 894/**
895 * blkcg_policy_unregister - unregister a blkcg policy
896 * @pol: blkcg policy to unregister
897 *
898 * Undo blkcg_policy_register(@pol). Might sleep.
899 */
900void blkcg_policy_unregister(struct blkcg_policy *pol)
1672{ 901{
1673 cgroup_unload_subsys(&blkio_subsys); 902 mutex_lock(&blkcg_pol_mutex);
1674}
1675 903
1676module_init(init_cgroup_blkio); 904 if (WARN_ON(blkcg_policy[pol->plid] != pol))
1677module_exit(exit_cgroup_blkio); 905 goto out_unlock;
1678MODULE_LICENSE("GPL"); 906
907 /* kill the intf files first */
908 if (pol->cftypes)
909 cgroup_rm_cftypes(&blkio_subsys, pol->cftypes);
910
911 /* unregister and update blkgs */
912 blkcg_policy[pol->plid] = NULL;
913out_unlock:
914 mutex_unlock(&blkcg_pol_mutex);
915}
916EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
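
For completeness, a sketch of the registration side from a policy module's point of view. struct example_pd, the init function and the module hooks below are illustrative assumptions, not part of this patch.

#include <linux/module.h>
#include "blk-cgroup.h"

struct example_pd {
	struct blkg_policy_data pd;	/* must be the first member */
	u64 limit;
};

static struct blkcg_policy blkcg_policy_example;

static void example_pd_init(struct blkcg_gq *blkg)
{
	struct example_pd *epd =
		container_of(blkg_to_pd(blkg, &blkcg_policy_example),
			     struct example_pd, pd);

	epd->limit = 0;			/* per-blkg defaults */
}

static struct blkcg_policy blkcg_policy_example = {
	.pd_size	= sizeof(struct example_pd),
	.pd_init_fn	= example_pd_init,
	/* .cftypes would point at the policy's cgroup files, if any */
};

static int __init example_init(void)
{
	/* assigns ->plid and adds the cgroup files */
	return blkcg_policy_register(&blkcg_policy_example);
}
module_init(example_init);

static void __exit example_exit(void)
{
	blkcg_policy_unregister(&blkcg_policy_example);
}
module_exit(example_exit);
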
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 6f3ace7e792f..8ac457ce7783 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -15,350 +15,371 @@
15 15
16#include <linux/cgroup.h> 16#include <linux/cgroup.h>
17#include <linux/u64_stats_sync.h> 17#include <linux/u64_stats_sync.h>
18 18#include <linux/seq_file.h>
19enum blkio_policy_id { 19#include <linux/radix-tree.h>
20 BLKIO_POLICY_PROP = 0, /* Proportional Bandwidth division */
21 BLKIO_POLICY_THROTL, /* Throttling */
22};
23 20
24/* Max limits for throttle policy */ 21/* Max limits for throttle policy */
25#define THROTL_IOPS_MAX UINT_MAX 22#define THROTL_IOPS_MAX UINT_MAX
26 23
27#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) 24/* CFQ specific, out here for blkcg->cfq_weight */
28 25#define CFQ_WEIGHT_MIN 10
29#ifndef CONFIG_BLK_CGROUP 26#define CFQ_WEIGHT_MAX 1000
30/* When blk-cgroup is a module, its subsys_id isn't a compile-time constant */ 27#define CFQ_WEIGHT_DEFAULT 500
31extern struct cgroup_subsys blkio_subsys;
32#define blkio_subsys_id blkio_subsys.subsys_id
33#endif
34
35enum stat_type {
36 /* Total time spent (in ns) between request dispatch to the driver and
37 * request completion for IOs doen by this cgroup. This may not be
38 * accurate when NCQ is turned on. */
39 BLKIO_STAT_SERVICE_TIME = 0,
40 /* Total time spent waiting in scheduler queue in ns */
41 BLKIO_STAT_WAIT_TIME,
42 /* Number of IOs queued up */
43 BLKIO_STAT_QUEUED,
44 /* All the single valued stats go below this */
45 BLKIO_STAT_TIME,
46#ifdef CONFIG_DEBUG_BLK_CGROUP
47 /* Time not charged to this cgroup */
48 BLKIO_STAT_UNACCOUNTED_TIME,
49 BLKIO_STAT_AVG_QUEUE_SIZE,
50 BLKIO_STAT_IDLE_TIME,
51 BLKIO_STAT_EMPTY_TIME,
52 BLKIO_STAT_GROUP_WAIT_TIME,
53 BLKIO_STAT_DEQUEUE
54#endif
55};
56 28
57/* Per cpu stats */ 29#ifdef CONFIG_BLK_CGROUP
58enum stat_type_cpu {
59 BLKIO_STAT_CPU_SECTORS,
60 /* Total bytes transferred */
61 BLKIO_STAT_CPU_SERVICE_BYTES,
62 /* Total IOs serviced, post merge */
63 BLKIO_STAT_CPU_SERVICED,
64 /* Number of IOs merged */
65 BLKIO_STAT_CPU_MERGED,
66 BLKIO_STAT_CPU_NR
67};
68 30
69enum stat_sub_type { 31enum blkg_rwstat_type {
70 BLKIO_STAT_READ = 0, 32 BLKG_RWSTAT_READ,
71 BLKIO_STAT_WRITE, 33 BLKG_RWSTAT_WRITE,
72 BLKIO_STAT_SYNC, 34 BLKG_RWSTAT_SYNC,
73 BLKIO_STAT_ASYNC, 35 BLKG_RWSTAT_ASYNC,
74 BLKIO_STAT_TOTAL
75};
76 36
77/* blkg state flags */ 37 BLKG_RWSTAT_NR,
78enum blkg_state_flags { 38 BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR,
79 BLKG_waiting = 0,
80 BLKG_idling,
81 BLKG_empty,
82}; 39};
83 40
84/* cgroup files owned by proportional weight policy */ 41struct blkcg_gq;
85enum blkcg_file_name_prop {
86 BLKIO_PROP_weight = 1,
87 BLKIO_PROP_weight_device,
88 BLKIO_PROP_io_service_bytes,
89 BLKIO_PROP_io_serviced,
90 BLKIO_PROP_time,
91 BLKIO_PROP_sectors,
92 BLKIO_PROP_unaccounted_time,
93 BLKIO_PROP_io_service_time,
94 BLKIO_PROP_io_wait_time,
95 BLKIO_PROP_io_merged,
96 BLKIO_PROP_io_queued,
97 BLKIO_PROP_avg_queue_size,
98 BLKIO_PROP_group_wait_time,
99 BLKIO_PROP_idle_time,
100 BLKIO_PROP_empty_time,
101 BLKIO_PROP_dequeue,
102};
103 42
104/* cgroup files owned by throttle policy */ 43struct blkcg {
105enum blkcg_file_name_throtl { 44 struct cgroup_subsys_state css;
106 BLKIO_THROTL_read_bps_device, 45 spinlock_t lock;
107 BLKIO_THROTL_write_bps_device,
108 BLKIO_THROTL_read_iops_device,
109 BLKIO_THROTL_write_iops_device,
110 BLKIO_THROTL_io_service_bytes,
111 BLKIO_THROTL_io_serviced,
112};
113 46
114struct blkio_cgroup { 47 struct radix_tree_root blkg_tree;
115 struct cgroup_subsys_state css; 48 struct blkcg_gq *blkg_hint;
116 unsigned int weight; 49 struct hlist_head blkg_list;
117 spinlock_t lock; 50
118 struct hlist_head blkg_list; 51 /* for policies to test whether associated blkcg has changed */
119 struct list_head policy_list; /* list of blkio_policy_node */ 52 uint64_t id;
120};
121 53
122struct blkio_group_stats { 54 /* TODO: per-policy storage in blkcg */
123 /* total disk time and nr sectors dispatched by this group */ 55 unsigned int cfq_weight; /* belongs to cfq */
124 uint64_t time;
125 uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL];
126#ifdef CONFIG_DEBUG_BLK_CGROUP
127 /* Time not charged to this cgroup */
128 uint64_t unaccounted_time;
129
130 /* Sum of number of IOs queued across all samples */
131 uint64_t avg_queue_size_sum;
132 /* Count of samples taken for average */
133 uint64_t avg_queue_size_samples;
134 /* How many times this group has been removed from service tree */
135 unsigned long dequeue;
136
137 /* Total time spent waiting for it to be assigned a timeslice. */
138 uint64_t group_wait_time;
139 uint64_t start_group_wait_time;
140
141 /* Time spent idling for this blkio_group */
142 uint64_t idle_time;
143 uint64_t start_idle_time;
144 /*
145 * Total time when we have requests queued and do not contain the
146 * current active queue.
147 */
148 uint64_t empty_time;
149 uint64_t start_empty_time;
150 uint16_t flags;
151#endif
152}; 56};
153 57
154/* Per cpu blkio group stats */ 58struct blkg_stat {
155struct blkio_group_stats_cpu { 59 struct u64_stats_sync syncp;
156 uint64_t sectors; 60 uint64_t cnt;
157 uint64_t stat_arr_cpu[BLKIO_STAT_CPU_NR][BLKIO_STAT_TOTAL];
158 struct u64_stats_sync syncp;
159}; 61};
160 62
161struct blkio_group { 63struct blkg_rwstat {
162 /* An rcu protected unique identifier for the group */ 64 struct u64_stats_sync syncp;
163 void *key; 65 uint64_t cnt[BLKG_RWSTAT_NR];
164 struct hlist_node blkcg_node;
165 unsigned short blkcg_id;
166 /* Store cgroup path */
167 char path[128];
168 /* The device MKDEV(major, minor), this group has been created for */
169 dev_t dev;
170 /* policy which owns this blk group */
171 enum blkio_policy_id plid;
172
173 /* Need to serialize the stats in the case of reset/update */
174 spinlock_t stats_lock;
175 struct blkio_group_stats stats;
176 /* Per cpu stats pointer */
177 struct blkio_group_stats_cpu __percpu *stats_cpu;
178}; 66};
179 67
180struct blkio_policy_node { 68/*
181 struct list_head node; 69 * A blkcg_gq (blkg) is an association between a block cgroup (blkcg) and a
182 dev_t dev; 70 * request_queue (q). This is used by blkcg policies which need to track
183 /* This node belongs to max bw policy or porportional weight policy */ 71 * information per blkcg - q pair.
184 enum blkio_policy_id plid; 72 *
185 /* cgroup file to which this rule belongs to */ 73 * There can be multiple active blkcg policies and each has its private
186 int fileid; 74 * data on each blkg, the size of which is determined by
187 75 * blkcg_policy->pd_size. blkcg core allocates and frees such areas
188 union { 76 * together with blkg and invokes pd_init/exit_fn() methods.
189 unsigned int weight; 77 *
190 /* 78 * Such private data must embed struct blkg_policy_data (pd) at the
191 * Rate read/write in terms of bytes per second 79 * beginning and pd_size can't be smaller than pd.
192 * Whether this rate represents read or write is determined 80 */
193 * by file type "fileid". 81struct blkg_policy_data {
194 */ 82 /* the blkg this per-policy data belongs to */
195 u64 bps; 83 struct blkcg_gq *blkg;
196 unsigned int iops; 84
197 } val; 85 /* used during policy activation */
86 struct list_head alloc_node;
198}; 87};
199 88
200extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, 89/* association between a blk cgroup and a request queue */
201 dev_t dev); 90struct blkcg_gq {
202extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, 91 /* Pointer to the associated request_queue */
203 dev_t dev); 92 struct request_queue *q;
204extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, 93 struct list_head q_node;
205 dev_t dev); 94 struct hlist_node blkcg_node;
206extern unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, 95 struct blkcg *blkcg;
207 dev_t dev); 96 /* reference count */
208extern unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, 97 int refcnt;
209 dev_t dev); 98
210 99 struct blkg_policy_data *pd[BLKCG_MAX_POLS];
211typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); 100
212 101 struct rcu_head rcu_head;
213typedef void (blkio_update_group_weight_fn) (void *key,
214 struct blkio_group *blkg, unsigned int weight);
215typedef void (blkio_update_group_read_bps_fn) (void * key,
216 struct blkio_group *blkg, u64 read_bps);
217typedef void (blkio_update_group_write_bps_fn) (void *key,
218 struct blkio_group *blkg, u64 write_bps);
219typedef void (blkio_update_group_read_iops_fn) (void *key,
220 struct blkio_group *blkg, unsigned int read_iops);
221typedef void (blkio_update_group_write_iops_fn) (void *key,
222 struct blkio_group *blkg, unsigned int write_iops);
223
224struct blkio_policy_ops {
225 blkio_unlink_group_fn *blkio_unlink_group_fn;
226 blkio_update_group_weight_fn *blkio_update_group_weight_fn;
227 blkio_update_group_read_bps_fn *blkio_update_group_read_bps_fn;
228 blkio_update_group_write_bps_fn *blkio_update_group_write_bps_fn;
229 blkio_update_group_read_iops_fn *blkio_update_group_read_iops_fn;
230 blkio_update_group_write_iops_fn *blkio_update_group_write_iops_fn;
231}; 102};
232 103
233struct blkio_policy_type { 104typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg);
234 struct list_head list; 105typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg);
235 struct blkio_policy_ops ops; 106typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg);
236 enum blkio_policy_id plid; 107
108struct blkcg_policy {
109 int plid;
110 /* policy specific private data size */
111 size_t pd_size;
112 /* cgroup files for the policy */
113 struct cftype *cftypes;
114
115 /* operations */
116 blkcg_pol_init_pd_fn *pd_init_fn;
117 blkcg_pol_exit_pd_fn *pd_exit_fn;
118 blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn;
237}; 119};
238 120
121extern struct blkcg blkcg_root;
122
123struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup);
124struct blkcg *bio_blkcg(struct bio *bio);
125struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q);
126struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
127 struct request_queue *q);
128int blkcg_init_queue(struct request_queue *q);
129void blkcg_drain_queue(struct request_queue *q);
130void blkcg_exit_queue(struct request_queue *q);
131
239/* Blkio controller policy registration */ 132/* Blkio controller policy registration */
240extern void blkio_policy_register(struct blkio_policy_type *); 133int blkcg_policy_register(struct blkcg_policy *pol);
241extern void blkio_policy_unregister(struct blkio_policy_type *); 134void blkcg_policy_unregister(struct blkcg_policy *pol);
135int blkcg_activate_policy(struct request_queue *q,
136 const struct blkcg_policy *pol);
137void blkcg_deactivate_policy(struct request_queue *q,
138 const struct blkcg_policy *pol);
139
140void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
141 u64 (*prfill)(struct seq_file *,
142 struct blkg_policy_data *, int),
143 const struct blkcg_policy *pol, int data,
144 bool show_total);
145u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v);
146u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
147 const struct blkg_rwstat *rwstat);
148u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off);
149u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
150 int off);
151
152struct blkg_conf_ctx {
153 struct gendisk *disk;
154 struct blkcg_gq *blkg;
155 u64 v;
156};
157
158int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
159 const char *input, struct blkg_conf_ctx *ctx);
160void blkg_conf_finish(struct blkg_conf_ctx *ctx);
161
162
163/**
164 * blkg_to_pdata - get policy private data
165 * @blkg: blkg of interest
166 * @pol: policy of interest
167 *
168 * Return pointer to private data associated with the @blkg-@pol pair.
169 */
170static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
171 struct blkcg_policy *pol)
172{
173 return blkg ? blkg->pd[pol->plid] : NULL;
174}
175
176/**
177 * pdata_to_blkg - get blkg associated with policy private data
178 * @pd: policy private data of interest
179 *
180 * @pd is policy private data. Determine the blkg it's associated with.
181 */
182static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
183{
184 return pd ? pd->blkg : NULL;
185}
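
The two converters above only get a policy halfway: per the layout rule documented for struct blkg_policy_data, each policy embeds pd at the start of its own per-blkg structure and layers its own converters on top. A sketch under that assumption, with a hypothetical example_pd layout:

struct example_pd {
	struct blkg_policy_data pd;	/* first member, covered by pd_size */
	u64 limit;
};

static inline struct example_pd *pd_to_epd(struct blkg_policy_data *pd)
{
	return pd ? container_of(pd, struct example_pd, pd) : NULL;
}

static inline struct example_pd *blkg_to_epd(struct blkcg_gq *blkg,
					     struct blkcg_policy *pol)
{
	return pd_to_epd(blkg_to_pd(blkg, pol));
}
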
186
187/**
188 * blkg_path - format cgroup path of blkg
189 * @blkg: blkg of interest
190 * @buf: target buffer
191 * @buflen: target buffer length
192 *
193 * Format the path of the cgroup of @blkg into @buf.
194 */
195static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
196{
197 int ret;
198
199 rcu_read_lock();
200 ret = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
201 rcu_read_unlock();
202 if (ret)
203 strncpy(buf, "<unavailable>", buflen);
204 return ret;
205}
242 206
243static inline char *blkg_path(struct blkio_group *blkg) 207/**
208 * blkg_get - get a blkg reference
209 * @blkg: blkg to get
210 *
211 * The caller should be holding queue_lock and an existing reference.
212 */
213static inline void blkg_get(struct blkcg_gq *blkg)
244{ 214{
245 return blkg->path; 215 lockdep_assert_held(blkg->q->queue_lock);
216 WARN_ON_ONCE(!blkg->refcnt);
217 blkg->refcnt++;
246} 218}
247 219
248#else 220void __blkg_release(struct blkcg_gq *blkg);
249 221
250struct blkio_group { 222/**
223 * blkg_put - put a blkg reference
224 * @blkg: blkg to put
225 *
226 * The caller should be holding queue_lock.
227 */
228static inline void blkg_put(struct blkcg_gq *blkg)
229{
230 lockdep_assert_held(blkg->q->queue_lock);
231 WARN_ON_ONCE(blkg->refcnt <= 0);
232 if (!--blkg->refcnt)
233 __blkg_release(blkg);
234}
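
Both helpers above assume the queue lock, so the typical pattern for caching a blkg pointer beyond the current critical section looks roughly like the sketch below; the function is illustrative and not from the patch.

static void example_cache_blkg(struct blkcg_gq *blkg, struct blkcg_gq **cache)
{
	lockdep_assert_held(blkg->q->queue_lock);

	if (*cache)
		blkg_put(*cache);	/* drop the previously cached reference */
	blkg_get(blkg);			/* caller already holds a reference */
	*cache = blkg;
}
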
235
236/**
237 * blkg_stat_add - add a value to a blkg_stat
238 * @stat: target blkg_stat
239 * @val: value to add
240 *
241 * Add @val to @stat. The caller is responsible for synchronizing calls to
242 * this function.
243 */
244static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val)
245{
246 u64_stats_update_begin(&stat->syncp);
247 stat->cnt += val;
248 u64_stats_update_end(&stat->syncp);
249}
250
251/**
252 * blkg_stat_read - read the current value of a blkg_stat
253 * @stat: blkg_stat to read
254 *
255 * Read the current value of @stat. This function can be called without
256 * synchronization and takes care of u64 atomicity.
257 */
258static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
259{
260 unsigned int start;
261 uint64_t v;
262
263 do {
264 start = u64_stats_fetch_begin(&stat->syncp);
265 v = stat->cnt;
266 } while (u64_stats_fetch_retry(&stat->syncp, start));
267
268 return v;
269}
270
271/**
272 * blkg_stat_reset - reset a blkg_stat
273 * @stat: blkg_stat to reset
274 */
275static inline void blkg_stat_reset(struct blkg_stat *stat)
276{
277 stat->cnt = 0;
278}
279
280/**
281 * blkg_rwstat_add - add a value to a blkg_rwstat
282 * @rwstat: target blkg_rwstat
283 * @rw: mask of REQ_{WRITE|SYNC}
284 * @val: value to add
285 *
286 * Add @val to @rwstat. The counters are chosen according to @rw. The
287 * caller is responsible for synchronizing calls to this function.
288 */
289static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
290 int rw, uint64_t val)
291{
292 u64_stats_update_begin(&rwstat->syncp);
293
294 if (rw & REQ_WRITE)
295 rwstat->cnt[BLKG_RWSTAT_WRITE] += val;
296 else
297 rwstat->cnt[BLKG_RWSTAT_READ] += val;
298 if (rw & REQ_SYNC)
299 rwstat->cnt[BLKG_RWSTAT_SYNC] += val;
300 else
301 rwstat->cnt[BLKG_RWSTAT_ASYNC] += val;
302
303 u64_stats_update_end(&rwstat->syncp);
304}
305
306/**
307 * blkg_rwstat_read - read the current values of a blkg_rwstat
308 * @rwstat: blkg_rwstat to read
309 *
310 * Read the current snapshot of @rwstat and return it as the return value.
311 * This function can be called without synchronization and takes care of
312 * u64 atomicity.
313 */
314static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat)
315{
316 unsigned int start;
317 struct blkg_rwstat tmp;
318
319 do {
320 start = u64_stats_fetch_begin(&rwstat->syncp);
321 tmp = *rwstat;
322 } while (u64_stats_fetch_retry(&rwstat->syncp, start));
323
324 return tmp;
325}
326
327/**
328 * blkg_rwstat_sum - read the total count of a blkg_rwstat
329 * @rwstat: blkg_rwstat to read
330 *
331 * Return the total count of @rwstat regardless of the IO direction. This
332 * function can be called without synchronization and takes care of u64
333 * atomicity.
334 */
335static inline uint64_t blkg_rwstat_sum(struct blkg_rwstat *rwstat)
336{
337 struct blkg_rwstat tmp = blkg_rwstat_read(rwstat);
338
339 return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE];
340}
341
342/**
343 * blkg_rwstat_reset - reset a blkg_rwstat
344 * @rwstat: blkg_rwstat to reset
345 */
346static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
347{
348 memset(rwstat->cnt, 0, sizeof(rwstat->cnt));
349}
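
As a rough usage sketch of the stat helpers above: updates are serialized by the policy (typically under the queue lock), while reads and sums are lock-free via u64_stats_sync. The example_stats container and its field names are assumptions for illustration.

struct example_stats {
	struct blkg_stat	service_time;	/* single u64 counter */
	struct blkg_rwstat	serviced;	/* split by rw/sync direction */
};

static void example_account(struct example_stats *stats, struct bio *bio,
			    u64 ns)
{
	/* callers serialize per-blkg updates, e.g. under the queue lock */
	blkg_stat_add(&stats->service_time, ns);
	blkg_rwstat_add(&stats->serviced, bio->bi_rw, 1);
}

static u64 example_total_serviced(struct example_stats *stats)
{
	/* lock-free read */
	return blkg_rwstat_sum(&stats->serviced);
}
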
350
351#else /* CONFIG_BLK_CGROUP */
352
353struct cgroup;
354
355struct blkg_policy_data {
251}; 356};
252 357
253struct blkio_policy_type { 358struct blkcg_gq {
254}; 359};
255 360
256static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { } 361struct blkcg_policy {
257static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { } 362};
258 363
259static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } 364static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; }
260 365static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; }
261#endif 366static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
262 367static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
263#define BLKIO_WEIGHT_MIN 10 368static inline void blkcg_drain_queue(struct request_queue *q) { }
264#define BLKIO_WEIGHT_MAX 1000 369static inline void blkcg_exit_queue(struct request_queue *q) { }
265#define BLKIO_WEIGHT_DEFAULT 500 370static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; }
266 371static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { }
267#ifdef CONFIG_DEBUG_BLK_CGROUP 372static inline int blkcg_activate_policy(struct request_queue *q,
268void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg); 373 const struct blkcg_policy *pol) { return 0; }
269void blkiocg_update_dequeue_stats(struct blkio_group *blkg, 374static inline void blkcg_deactivate_policy(struct request_queue *q,
270 unsigned long dequeue); 375 const struct blkcg_policy *pol) { }
271void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg); 376
272void blkiocg_update_idle_time_stats(struct blkio_group *blkg); 377static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
273void blkiocg_set_start_empty_time(struct blkio_group *blkg); 378 struct blkcg_policy *pol) { return NULL; }
274 379static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; }
275#define BLKG_FLAG_FNS(name) \ 380static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
276static inline void blkio_mark_blkg_##name( \ 381static inline void blkg_get(struct blkcg_gq *blkg) { }
277 struct blkio_group_stats *stats) \ 382static inline void blkg_put(struct blkcg_gq *blkg) { }
278{ \ 383
279 stats->flags |= (1 << BLKG_##name); \ 384#endif /* CONFIG_BLK_CGROUP */
280} \ 385#endif /* _BLK_CGROUP_H */
281static inline void blkio_clear_blkg_##name( \
282 struct blkio_group_stats *stats) \
283{ \
284 stats->flags &= ~(1 << BLKG_##name); \
285} \
286static inline int blkio_blkg_##name(struct blkio_group_stats *stats) \
287{ \
288 return (stats->flags & (1 << BLKG_##name)) != 0; \
289} \
290
291BLKG_FLAG_FNS(waiting)
292BLKG_FLAG_FNS(idling)
293BLKG_FLAG_FNS(empty)
294#undef BLKG_FLAG_FNS
295#else
296static inline void blkiocg_update_avg_queue_size_stats(
297 struct blkio_group *blkg) {}
298static inline void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
299 unsigned long dequeue) {}
300static inline void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
301{}
302static inline void blkiocg_update_idle_time_stats(struct blkio_group *blkg) {}
303static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {}
304#endif
305
306#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
307extern struct blkio_cgroup blkio_root_cgroup;
308extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
309extern struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk);
310extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
311 struct blkio_group *blkg, void *key, dev_t dev,
312 enum blkio_policy_id plid);
313extern int blkio_alloc_blkg_stats(struct blkio_group *blkg);
314extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
315extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
316 void *key);
317void blkiocg_update_timeslice_used(struct blkio_group *blkg,
318 unsigned long time,
319 unsigned long unaccounted_time);
320void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes,
321 bool direction, bool sync);
322void blkiocg_update_completion_stats(struct blkio_group *blkg,
323 uint64_t start_time, uint64_t io_start_time, bool direction, bool sync);
324void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
325 bool sync);
326void blkiocg_update_io_add_stats(struct blkio_group *blkg,
327 struct blkio_group *curr_blkg, bool direction, bool sync);
328void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
329 bool direction, bool sync);
330#else
331struct cgroup;
332static inline struct blkio_cgroup *
333cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
334static inline struct blkio_cgroup *
335task_blkio_cgroup(struct task_struct *tsk) { return NULL; }
336
337static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
338 struct blkio_group *blkg, void *key, dev_t dev,
339 enum blkio_policy_id plid) {}
340
341static inline int blkio_alloc_blkg_stats(struct blkio_group *blkg) { return 0; }
342
343static inline int
344blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
345
346static inline struct blkio_group *
347blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; }
348static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg,
349 unsigned long time,
350 unsigned long unaccounted_time)
351{}
352static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
353 uint64_t bytes, bool direction, bool sync) {}
354static inline void blkiocg_update_completion_stats(struct blkio_group *blkg,
355 uint64_t start_time, uint64_t io_start_time, bool direction,
356 bool sync) {}
357static inline void blkiocg_update_io_merged_stats(struct blkio_group *blkg,
358 bool direction, bool sync) {}
359static inline void blkiocg_update_io_add_stats(struct blkio_group *blkg,
360 struct blkio_group *curr_blkg, bool direction, bool sync) {}
361static inline void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
362 bool direction, bool sync) {}
363#endif
364#endif /* _BLK_CGROUP_H */
diff --git a/block/blk-core.c b/block/blk-core.c
index 1f61b74867e4..3c923a7aeb56 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -29,11 +29,13 @@
29#include <linux/fault-inject.h> 29#include <linux/fault-inject.h>
30#include <linux/list_sort.h> 30#include <linux/list_sort.h>
31#include <linux/delay.h> 31#include <linux/delay.h>
32#include <linux/ratelimit.h>
32 33
33#define CREATE_TRACE_POINTS 34#define CREATE_TRACE_POINTS
34#include <trace/events/block.h> 35#include <trace/events/block.h>
35 36
36#include "blk.h" 37#include "blk.h"
38#include "blk-cgroup.h"
37 39
38EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); 40EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
39EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); 41EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
@@ -280,7 +282,7 @@ EXPORT_SYMBOL(blk_stop_queue);
280 * 282 *
281 * This function does not cancel any asynchronous activity arising 283 * This function does not cancel any asynchronous activity arising
282 * out of elevator or throttling code. That would require elevaotor_exit() 284 * out of elevator or throttling code. That would require elevaotor_exit()
283 * and blk_throtl_exit() to be called with queue lock initialized. 285 * and blkcg_exit_queue() to be called with queue lock initialized.
284 * 286 *
285 */ 287 */
286void blk_sync_queue(struct request_queue *q) 288void blk_sync_queue(struct request_queue *q)
@@ -365,17 +367,23 @@ void blk_drain_queue(struct request_queue *q, bool drain_all)
365 367
366 spin_lock_irq(q->queue_lock); 368 spin_lock_irq(q->queue_lock);
367 369
368 elv_drain_elevator(q); 370 /*
369 if (drain_all) 371 * The caller might be trying to drain @q before its
370 blk_throtl_drain(q); 372 * elevator is initialized.
373 */
374 if (q->elevator)
375 elv_drain_elevator(q);
376
377 blkcg_drain_queue(q);
371 378
372 /* 379 /*
373 * This function might be called on a queue which failed 380 * This function might be called on a queue which failed
374 * driver init after queue creation. Some drivers 381 * driver init after queue creation or is not yet fully
375 * (e.g. fd) get unhappy in such cases. Kick queue iff 382 * active yet. Some drivers (e.g. fd and loop) get unhappy
376 * dispatch queue has something on it. 383 * in such cases. Kick queue iff dispatch queue has
384 * something on it and @q has request_fn set.
377 */ 385 */
378 if (!list_empty(&q->queue_head)) 386 if (!list_empty(&q->queue_head) && q->request_fn)
379 __blk_run_queue(q); 387 __blk_run_queue(q);
380 388
381 drain |= q->rq.elvpriv; 389 drain |= q->rq.elvpriv;
@@ -403,6 +411,49 @@ void blk_drain_queue(struct request_queue *q, bool drain_all)
403} 411}
404 412
405/** 413/**
414 * blk_queue_bypass_start - enter queue bypass mode
415 * @q: queue of interest
416 *
417 * In bypass mode, only the dispatch FIFO queue of @q is used. This
418 * function makes @q enter bypass mode and drains all requests which were
419 * throttled or issued before. On return, it's guaranteed that no request
420 * is being throttled or has ELVPRIV set and blk_queue_bypass() returns %true
421 * inside queue or RCU read lock.
422 */
423void blk_queue_bypass_start(struct request_queue *q)
424{
425 bool drain;
426
427 spin_lock_irq(q->queue_lock);
428 drain = !q->bypass_depth++;
429 queue_flag_set(QUEUE_FLAG_BYPASS, q);
430 spin_unlock_irq(q->queue_lock);
431
432 if (drain) {
433 blk_drain_queue(q, false);
434 /* ensure blk_queue_bypass() is %true inside RCU read lock */
435 synchronize_rcu();
436 }
437}
438EXPORT_SYMBOL_GPL(blk_queue_bypass_start);
439
440/**
441 * blk_queue_bypass_end - leave queue bypass mode
442 * @q: queue of interest
443 *
444 * Leave bypass mode and restore the normal queueing behavior.
445 */
446void blk_queue_bypass_end(struct request_queue *q)
447{
448 spin_lock_irq(q->queue_lock);
449 if (!--q->bypass_depth)
450 queue_flag_clear(QUEUE_FLAG_BYPASS, q);
451 WARN_ON_ONCE(q->bypass_depth < 0);
452 spin_unlock_irq(q->queue_lock);
453}
454EXPORT_SYMBOL_GPL(blk_queue_bypass_end);
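
The pair above is meant to be used as a bracket: enter bypass, perform a queue-wide update that must not race with request allocation, leave bypass. A hedged sketch of that shape, assuming the block-layer headers that declare the pair are in scope; the helper is illustrative, and blkcg_activate_policy() earlier in this patch is the real pattern.

static void example_update_under_bypass(struct request_queue *q,
					void (*update)(struct request_queue *q))
{
	blk_queue_bypass_start(q);	/* drain + synchronize_rcu() */

	spin_lock_irq(q->queue_lock);
	update(q);			/* no new ELVPRIV requests can appear */
	spin_unlock_irq(q->queue_lock);

	blk_queue_bypass_end(q);
}
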
455
456/**
406 * blk_cleanup_queue - shutdown a request queue 457 * blk_cleanup_queue - shutdown a request queue
407 * @q: request queue to shutdown 458 * @q: request queue to shutdown
408 * 459 *
@@ -418,6 +469,19 @@ void blk_cleanup_queue(struct request_queue *q)
418 queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q); 469 queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
419 470
420 spin_lock_irq(lock); 471 spin_lock_irq(lock);
472
473 /*
474 * Dead queue is permanently in bypass mode till released. Note
475 * that, unlike blk_queue_bypass_start(), we aren't performing
476 * synchronize_rcu() after entering bypass mode to avoid the delay
477 * as some drivers create and destroy a lot of queues while
478 * probing. This is still safe because blk_release_queue() will be
479 * called only after the queue refcnt drops to zero and nothing,
480 * RCU or not, would be traversing the queue by then.
481 */
482 q->bypass_depth++;
483 queue_flag_set(QUEUE_FLAG_BYPASS, q);
484
421 queue_flag_set(QUEUE_FLAG_NOMERGES, q); 485 queue_flag_set(QUEUE_FLAG_NOMERGES, q);
422 queue_flag_set(QUEUE_FLAG_NOXMERGES, q); 486 queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
423 queue_flag_set(QUEUE_FLAG_DEAD, q); 487 queue_flag_set(QUEUE_FLAG_DEAD, q);
@@ -428,13 +492,8 @@ void blk_cleanup_queue(struct request_queue *q)
428 spin_unlock_irq(lock); 492 spin_unlock_irq(lock);
429 mutex_unlock(&q->sysfs_lock); 493 mutex_unlock(&q->sysfs_lock);
430 494
431 /* 495 /* drain all requests queued before DEAD marking */
432 * Drain all requests queued before DEAD marking. The caller might 496 blk_drain_queue(q, true);
433 * be trying to tear down @q before its elevator is initialized, in
434 * which case we don't want to call into draining.
435 */
436 if (q->elevator)
437 blk_drain_queue(q, true);
438 497
439 /* @q won't process any more request, flush async actions */ 498 /* @q won't process any more request, flush async actions */
440 del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); 499 del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
@@ -498,14 +557,15 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
498 if (err) 557 if (err)
499 goto fail_id; 558 goto fail_id;
500 559
501 if (blk_throtl_init(q))
502 goto fail_id;
503
504 setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, 560 setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
505 laptop_mode_timer_fn, (unsigned long) q); 561 laptop_mode_timer_fn, (unsigned long) q);
506 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); 562 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
563 INIT_LIST_HEAD(&q->queue_head);
507 INIT_LIST_HEAD(&q->timeout_list); 564 INIT_LIST_HEAD(&q->timeout_list);
508 INIT_LIST_HEAD(&q->icq_list); 565 INIT_LIST_HEAD(&q->icq_list);
566#ifdef CONFIG_BLK_CGROUP
567 INIT_LIST_HEAD(&q->blkg_list);
568#endif
509 INIT_LIST_HEAD(&q->flush_queue[0]); 569 INIT_LIST_HEAD(&q->flush_queue[0]);
510 INIT_LIST_HEAD(&q->flush_queue[1]); 570 INIT_LIST_HEAD(&q->flush_queue[1]);
511 INIT_LIST_HEAD(&q->flush_data_in_flight); 571 INIT_LIST_HEAD(&q->flush_data_in_flight);
@@ -522,6 +582,18 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
522 */ 582 */
523 q->queue_lock = &q->__queue_lock; 583 q->queue_lock = &q->__queue_lock;
524 584
585 /*
586 * A queue starts its life with bypass turned on to avoid
587 * unnecessary bypass on/off overhead and nasty surprises during
588 * init. The initial bypass will be finished at the end of
589 * blk_init_allocated_queue().
590 */
591 q->bypass_depth = 1;
592 __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
593
594 if (blkcg_init_queue(q))
595 goto fail_id;
596
525 return q; 597 return q;
526 598
527fail_id: 599fail_id:
@@ -614,15 +686,15 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
614 686
615 q->sg_reserved_size = INT_MAX; 687 q->sg_reserved_size = INT_MAX;
616 688
617 /* 689 /* init elevator */
618 * all done 690 if (elevator_init(q, NULL))
619 */ 691 return NULL;
620 if (!elevator_init(q, NULL)) {
621 blk_queue_congestion_threshold(q);
622 return q;
623 }
624 692
625 return NULL; 693 blk_queue_congestion_threshold(q);
694
695 /* all done, end the initial bypass */
696 blk_queue_bypass_end(q);
697 return q;
626} 698}
627EXPORT_SYMBOL(blk_init_allocated_queue); 699EXPORT_SYMBOL(blk_init_allocated_queue);
628 700
@@ -648,33 +720,6 @@ static inline void blk_free_request(struct request_queue *q, struct request *rq)
648 mempool_free(rq, q->rq.rq_pool); 720 mempool_free(rq, q->rq.rq_pool);
649} 721}
650 722
651static struct request *
652blk_alloc_request(struct request_queue *q, struct io_cq *icq,
653 unsigned int flags, gfp_t gfp_mask)
654{
655 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
656
657 if (!rq)
658 return NULL;
659
660 blk_rq_init(q, rq);
661
662 rq->cmd_flags = flags | REQ_ALLOCED;
663
664 if (flags & REQ_ELVPRIV) {
665 rq->elv.icq = icq;
666 if (unlikely(elv_set_request(q, rq, gfp_mask))) {
667 mempool_free(rq, q->rq.rq_pool);
668 return NULL;
669 }
670 /* @rq->elv.icq holds on to io_context until @rq is freed */
671 if (icq)
672 get_io_context(icq->ioc);
673 }
674
675 return rq;
676}
677
678/* 723/*
679 * ioc_batching returns true if the ioc is a valid batching request and 724 * ioc_batching returns true if the ioc is a valid batching request and
680 * should be given priority access to a request. 725 * should be given priority access to a request.
@@ -763,6 +808,22 @@ static bool blk_rq_should_init_elevator(struct bio *bio)
763} 808}
764 809
765/** 810/**
811 * rq_ioc - determine io_context for request allocation
812 * @bio: request being allocated is for this bio (can be %NULL)
813 *
814 * Determine io_context to use for request allocation for @bio. May return
815 * %NULL if %current->io_context doesn't exist.
816 */
817static struct io_context *rq_ioc(struct bio *bio)
818{
819#ifdef CONFIG_BLK_CGROUP
820 if (bio && bio->bi_ioc)
821 return bio->bi_ioc;
822#endif
823 return current->io_context;
824}
825
826/**
766 * get_request - get a free request 827 * get_request - get a free request
767 * @q: request_queue to allocate request from 828 * @q: request_queue to allocate request from
768 * @rw_flags: RW and SYNC flags 829 * @rw_flags: RW and SYNC flags
@@ -779,7 +840,7 @@ static bool blk_rq_should_init_elevator(struct bio *bio)
779static struct request *get_request(struct request_queue *q, int rw_flags, 840static struct request *get_request(struct request_queue *q, int rw_flags,
780 struct bio *bio, gfp_t gfp_mask) 841 struct bio *bio, gfp_t gfp_mask)
781{ 842{
782 struct request *rq = NULL; 843 struct request *rq;
783 struct request_list *rl = &q->rq; 844 struct request_list *rl = &q->rq;
784 struct elevator_type *et; 845 struct elevator_type *et;
785 struct io_context *ioc; 846 struct io_context *ioc;
@@ -789,7 +850,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
789 int may_queue; 850 int may_queue;
790retry: 851retry:
791 et = q->elevator->type; 852 et = q->elevator->type;
792 ioc = current->io_context; 853 ioc = rq_ioc(bio);
793 854
794 if (unlikely(blk_queue_dead(q))) 855 if (unlikely(blk_queue_dead(q)))
795 return NULL; 856 return NULL;
@@ -808,7 +869,7 @@ retry:
808 */ 869 */
809 if (!ioc && !retried) { 870 if (!ioc && !retried) {
810 spin_unlock_irq(q->queue_lock); 871 spin_unlock_irq(q->queue_lock);
811 create_io_context(current, gfp_mask, q->node); 872 create_io_context(gfp_mask, q->node);
812 spin_lock_irq(q->queue_lock); 873 spin_lock_irq(q->queue_lock);
813 retried = true; 874 retried = true;
814 goto retry; 875 goto retry;
@@ -831,7 +892,7 @@ retry:
831 * process is not a "batcher", and not 892 * process is not a "batcher", and not
832 * exempted by the IO scheduler 893 * exempted by the IO scheduler
833 */ 894 */
834 goto out; 895 return NULL;
835 } 896 }
836 } 897 }
837 } 898 }
@@ -844,7 +905,7 @@ retry:
844 * allocated with any setting of ->nr_requests 905 * allocated with any setting of ->nr_requests
845 */ 906 */
846 if (rl->count[is_sync] >= (3 * q->nr_requests / 2)) 907 if (rl->count[is_sync] >= (3 * q->nr_requests / 2))
847 goto out; 908 return NULL;
848 909
849 rl->count[is_sync]++; 910 rl->count[is_sync]++;
850 rl->starved[is_sync] = 0; 911 rl->starved[is_sync] = 0;
@@ -859,8 +920,7 @@ retry:
859 * Also, lookup icq while holding queue_lock. If it doesn't exist, 920 * Also, lookup icq while holding queue_lock. If it doesn't exist,
860 * it will be created after releasing queue_lock. 921 * it will be created after releasing queue_lock.
861 */ 922 */
862 if (blk_rq_should_init_elevator(bio) && 923 if (blk_rq_should_init_elevator(bio) && !blk_queue_bypass(q)) {
863 !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags)) {
864 rw_flags |= REQ_ELVPRIV; 924 rw_flags |= REQ_ELVPRIV;
865 rl->elvpriv++; 925 rl->elvpriv++;
866 if (et->icq_cache && ioc) 926 if (et->icq_cache && ioc)
@@ -871,41 +931,36 @@ retry:
871 rw_flags |= REQ_IO_STAT; 931 rw_flags |= REQ_IO_STAT;
872 spin_unlock_irq(q->queue_lock); 932 spin_unlock_irq(q->queue_lock);
873 933
874 /* create icq if missing */ 934 /* allocate and init request */
875 if ((rw_flags & REQ_ELVPRIV) && unlikely(et->icq_cache && !icq)) { 935 rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
876 icq = ioc_create_icq(q, gfp_mask); 936 if (!rq)
877 if (!icq) 937 goto fail_alloc;
878 goto fail_icq;
879 }
880
881 rq = blk_alloc_request(q, icq, rw_flags, gfp_mask);
882 938
883fail_icq: 939 blk_rq_init(q, rq);
884 if (unlikely(!rq)) { 940 rq->cmd_flags = rw_flags | REQ_ALLOCED;
885 /* 941
886 * Allocation failed presumably due to memory. Undo anything 942 /* init elvpriv */
887 * we might have messed up. 943 if (rw_flags & REQ_ELVPRIV) {
888 * 944 if (unlikely(et->icq_cache && !icq)) {
889 * Allocating task should really be put onto the front of the 945 create_io_context(gfp_mask, q->node);
890 * wait queue, but this is pretty rare. 946 ioc = rq_ioc(bio);
891 */ 947 if (!ioc)
892 spin_lock_irq(q->queue_lock); 948 goto fail_elvpriv;
893 freed_request(q, rw_flags); 949
950 icq = ioc_create_icq(ioc, q, gfp_mask);
951 if (!icq)
952 goto fail_elvpriv;
953 }
894 954
895 /* 955 rq->elv.icq = icq;
896 * in the very unlikely event that allocation failed and no 956 if (unlikely(elv_set_request(q, rq, bio, gfp_mask)))
897 * requests for this direction was pending, mark us starved 957 goto fail_elvpriv;
898 * so that freeing of a request in the other direction will
899 * notice us. another possible fix would be to split the
900 * rq mempool into READ and WRITE
901 */
902rq_starved:
903 if (unlikely(rl->count[is_sync] == 0))
904 rl->starved[is_sync] = 1;
905 958
906 goto out; 959 /* @rq->elv.icq holds io_context until @rq is freed */
960 if (icq)
961 get_io_context(icq->ioc);
907 } 962 }
908 963out:
909 /* 964 /*
910 * ioc may be NULL here, and ioc_batching will be false. That's 965 * ioc may be NULL here, and ioc_batching will be false. That's
911 * OK, if the queue is under the request limit then requests need 966 * OK, if the queue is under the request limit then requests need
@@ -916,8 +971,48 @@ rq_starved:
916 ioc->nr_batch_requests--; 971 ioc->nr_batch_requests--;
917 972
918 trace_block_getrq(q, bio, rw_flags & 1); 973 trace_block_getrq(q, bio, rw_flags & 1);
919out:
920 return rq; 974 return rq;
975
976fail_elvpriv:
977 /*
978 * elvpriv init failed. ioc, icq and elvpriv aren't mempool backed
979 * and may fail indefinitely under memory pressure and thus
980 * shouldn't stall IO. Treat this request as !elvpriv. This will
981 * disturb iosched and blkcg but weird is better than dead.
982 */
983 printk_ratelimited(KERN_WARNING "%s: request aux data allocation failed, iosched may be disturbed\n",
984 dev_name(q->backing_dev_info.dev));
985
986 rq->cmd_flags &= ~REQ_ELVPRIV;
987 rq->elv.icq = NULL;
988
989 spin_lock_irq(q->queue_lock);
990 rl->elvpriv--;
991 spin_unlock_irq(q->queue_lock);
992 goto out;
993
994fail_alloc:
995 /*
996 * Allocation failed presumably due to memory. Undo anything we
997 * might have messed up.
998 *
999 * Allocating task should really be put onto the front of the wait
1000 * queue, but this is pretty rare.
1001 */
1002 spin_lock_irq(q->queue_lock);
1003 freed_request(q, rw_flags);
1004
1005 /*
 1006	 * in the very unlikely event that allocation failed and no
 1007	 * requests for this direction were pending, mark us starved so that
 1008	 * freeing of a request in the other direction will notice
 1009	 * us. Another possible fix would be to split the rq mempool into
 1010	 * READ and WRITE
1011 */
1012rq_starved:
1013 if (unlikely(rl->count[is_sync] == 0))
1014 rl->starved[is_sync] = 1;
1015 return NULL;
921} 1016}
922 1017
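The fail_elvpriv path above degrades the request to !elvpriv instead of failing the allocation: the ioc/icq/elvpriv data is not mempool backed, so insisting on it could stall IO indefinitely under memory pressure. A minimal userspace sketch of the same "degrade rather than fail" shape (names and sizes are illustrative, not kernel APIs):

#include <stdio.h>
#include <stdlib.h>

struct request {
	int flags;		/* REQ_ELVPRIV-style flag bit */
	void *elv_private;	/* optional aux data, not mempool backed */
};

#define REQ_ELVPRIV 0x1

/* Allocate a request; aux data may fail without failing the request. */
static struct request *alloc_request(int want_elvpriv)
{
	struct request *rq = malloc(sizeof(*rq));	/* "mempool" analogue */

	if (!rq)
		return NULL;				/* hard failure */

	rq->flags = 0;
	rq->elv_private = NULL;

	if (want_elvpriv) {
		rq->elv_private = malloc(64);		/* aux data, may fail */
		if (rq->elv_private)
			rq->flags |= REQ_ELVPRIV;
		else
			fprintf(stderr, "aux data allocation failed, degrading\n");
	}
	return rq;					/* usable either way */
}

int main(void)
{
	struct request *rq = alloc_request(1);

	if (rq) {
		printf("elvpriv=%d\n", !!(rq->flags & REQ_ELVPRIV));
		free(rq->elv_private);
		free(rq);
	}
	return 0;
}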
923/** 1018/**
@@ -961,7 +1056,7 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags,
961 * up to a big batch of them for a small period time. 1056 * up to a big batch of them for a small period time.
962 * See ioc_batching, ioc_set_batching 1057 * See ioc_batching, ioc_set_batching
963 */ 1058 */
964 create_io_context(current, GFP_NOIO, q->node); 1059 create_io_context(GFP_NOIO, q->node);
965 ioc_set_batching(q, current->io_context); 1060 ioc_set_batching(q, current->io_context);
966 1061
967 spin_lock_irq(q->queue_lock); 1062 spin_lock_irq(q->queue_lock);
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index fb95dd2f889a..1e2d53b04858 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -155,20 +155,20 @@ void put_io_context(struct io_context *ioc)
155} 155}
156EXPORT_SYMBOL(put_io_context); 156EXPORT_SYMBOL(put_io_context);
157 157
158/* Called by the exiting task */ 158/**
159void exit_io_context(struct task_struct *task) 159 * put_io_context_active - put active reference on ioc
160 * @ioc: ioc of interest
161 *
162 * Undo get_io_context_active(). If active reference reaches zero after
163 * put, @ioc can never issue further IOs and ioscheds are notified.
164 */
165void put_io_context_active(struct io_context *ioc)
160{ 166{
161 struct io_context *ioc;
162 struct io_cq *icq;
163 struct hlist_node *n; 167 struct hlist_node *n;
164 unsigned long flags; 168 unsigned long flags;
169 struct io_cq *icq;
165 170
166 task_lock(task); 171 if (!atomic_dec_and_test(&ioc->active_ref)) {
167 ioc = task->io_context;
168 task->io_context = NULL;
169 task_unlock(task);
170
171 if (!atomic_dec_and_test(&ioc->nr_tasks)) {
172 put_io_context(ioc); 172 put_io_context(ioc);
173 return; 173 return;
174 } 174 }
@@ -197,6 +197,20 @@ retry:
197 put_io_context(ioc); 197 put_io_context(ioc);
198} 198}
199 199
200/* Called by the exiting task */
201void exit_io_context(struct task_struct *task)
202{
203 struct io_context *ioc;
204
205 task_lock(task);
206 ioc = task->io_context;
207 task->io_context = NULL;
208 task_unlock(task);
209
210 atomic_dec(&ioc->nr_tasks);
211 put_io_context_active(ioc);
212}
213
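The new put_io_context_active()/exit_io_context() split keeps two counters on the ioc: refcount pins the memory itself, while active_ref tracks tasks that may still issue IO, and dropping the last active reference is what notifies the ioscheds. A hedged userspace sketch of that two-counter idea (C11 atomics, illustrative names only):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct ioc {
	atomic_long refcount;	/* lifetime: last put frees the object */
	atomic_int active_ref;	/* issuers: last put means "no more IO" */
};

static void ioc_free(struct ioc *ioc) { free(ioc); }

static void put_ioc(struct ioc *ioc)
{
	if (atomic_fetch_sub(&ioc->refcount, 1) == 1)
		ioc_free(ioc);
}

static void put_ioc_active(struct ioc *ioc)
{
	if (atomic_fetch_sub(&ioc->active_ref, 1) == 1)
		printf("no active issuers left, notify ioscheds here\n");
	put_ioc(ioc);		/* the active ref also held a plain ref */
}

int main(void)
{
	struct ioc *ioc = calloc(1, sizeof(*ioc));

	if (!ioc)
		return 1;
	atomic_init(&ioc->refcount, 1);
	atomic_init(&ioc->active_ref, 1);
	put_ioc_active(ioc);	/* exit_io_context() analogue */
	return 0;
}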
200/** 214/**
201 * ioc_clear_queue - break any ioc association with the specified queue 215 * ioc_clear_queue - break any ioc association with the specified queue
202 * @q: request_queue being cleared 216 * @q: request_queue being cleared
@@ -218,19 +232,18 @@ void ioc_clear_queue(struct request_queue *q)
218 } 232 }
219} 233}
220 234
221void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_flags, 235int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node)
222 int node)
223{ 236{
224 struct io_context *ioc; 237 struct io_context *ioc;
225 238
226 ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO, 239 ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO,
227 node); 240 node);
228 if (unlikely(!ioc)) 241 if (unlikely(!ioc))
229 return; 242 return -ENOMEM;
230 243
231 /* initialize */ 244 /* initialize */
232 atomic_long_set(&ioc->refcount, 1); 245 atomic_long_set(&ioc->refcount, 1);
233 atomic_set(&ioc->nr_tasks, 1); 246 atomic_set(&ioc->active_ref, 1);
234 spin_lock_init(&ioc->lock); 247 spin_lock_init(&ioc->lock);
235 INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH); 248 INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH);
236 INIT_HLIST_HEAD(&ioc->icq_list); 249 INIT_HLIST_HEAD(&ioc->icq_list);
@@ -250,6 +263,8 @@ void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_flags,
250 else 263 else
251 kmem_cache_free(iocontext_cachep, ioc); 264 kmem_cache_free(iocontext_cachep, ioc);
252 task_unlock(task); 265 task_unlock(task);
266
267 return 0;
253} 268}
254 269
255/** 270/**
@@ -281,7 +296,7 @@ struct io_context *get_task_io_context(struct task_struct *task,
281 return ioc; 296 return ioc;
282 } 297 }
283 task_unlock(task); 298 task_unlock(task);
284 } while (create_io_context(task, gfp_flags, node)); 299 } while (!create_task_io_context(task, gfp_flags, node));
285 300
286 return NULL; 301 return NULL;
287} 302}
@@ -325,26 +340,23 @@ EXPORT_SYMBOL(ioc_lookup_icq);
325 340
326/** 341/**
327 * ioc_create_icq - create and link io_cq 342 * ioc_create_icq - create and link io_cq
343 * @ioc: io_context of interest
328 * @q: request_queue of interest 344 * @q: request_queue of interest
329 * @gfp_mask: allocation mask 345 * @gfp_mask: allocation mask
330 * 346 *
331 * Make sure io_cq linking %current->io_context and @q exists. If either 347 * Make sure io_cq linking @ioc and @q exists. If icq doesn't exist, they
332 * io_context and/or icq don't exist, they will be created using @gfp_mask. 348 * will be created using @gfp_mask.
333 * 349 *
334 * The caller is responsible for ensuring @ioc won't go away and @q is 350 * The caller is responsible for ensuring @ioc won't go away and @q is
335 * alive and will stay alive until this function returns. 351 * alive and will stay alive until this function returns.
336 */ 352 */
337struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask) 353struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
354 gfp_t gfp_mask)
338{ 355{
339 struct elevator_type *et = q->elevator->type; 356 struct elevator_type *et = q->elevator->type;
340 struct io_context *ioc;
341 struct io_cq *icq; 357 struct io_cq *icq;
342 358
343 /* allocate stuff */ 359 /* allocate stuff */
344 ioc = create_io_context(current, gfp_mask, q->node);
345 if (!ioc)
346 return NULL;
347
348 icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO, 360 icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO,
349 q->node); 361 q->node);
350 if (!icq) 362 if (!icq)
@@ -382,74 +394,6 @@ struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask)
382 return icq; 394 return icq;
383} 395}
384 396
385void ioc_set_icq_flags(struct io_context *ioc, unsigned int flags)
386{
387 struct io_cq *icq;
388 struct hlist_node *n;
389
390 hlist_for_each_entry(icq, n, &ioc->icq_list, ioc_node)
391 icq->flags |= flags;
392}
393
394/**
395 * ioc_ioprio_changed - notify ioprio change
396 * @ioc: io_context of interest
397 * @ioprio: new ioprio
398 *
399 * @ioc's ioprio has changed to @ioprio. Set %ICQ_IOPRIO_CHANGED for all
400 * icq's. iosched is responsible for checking the bit and applying it on
401 * request issue path.
402 */
403void ioc_ioprio_changed(struct io_context *ioc, int ioprio)
404{
405 unsigned long flags;
406
407 spin_lock_irqsave(&ioc->lock, flags);
408 ioc->ioprio = ioprio;
409 ioc_set_icq_flags(ioc, ICQ_IOPRIO_CHANGED);
410 spin_unlock_irqrestore(&ioc->lock, flags);
411}
412
413/**
414 * ioc_cgroup_changed - notify cgroup change
415 * @ioc: io_context of interest
416 *
417 * @ioc's cgroup has changed. Set %ICQ_CGROUP_CHANGED for all icq's.
418 * iosched is responsible for checking the bit and applying it on request
419 * issue path.
420 */
421void ioc_cgroup_changed(struct io_context *ioc)
422{
423 unsigned long flags;
424
425 spin_lock_irqsave(&ioc->lock, flags);
426 ioc_set_icq_flags(ioc, ICQ_CGROUP_CHANGED);
427 spin_unlock_irqrestore(&ioc->lock, flags);
428}
429EXPORT_SYMBOL(ioc_cgroup_changed);
430
431/**
432 * icq_get_changed - fetch and clear icq changed mask
433 * @icq: icq of interest
434 *
435 * Fetch and clear ICQ_*_CHANGED bits from @icq. Grabs and releases
436 * @icq->ioc->lock.
437 */
438unsigned icq_get_changed(struct io_cq *icq)
439{
440 unsigned int changed = 0;
441 unsigned long flags;
442
443 if (unlikely(icq->flags & ICQ_CHANGED_MASK)) {
444 spin_lock_irqsave(&icq->ioc->lock, flags);
445 changed = icq->flags & ICQ_CHANGED_MASK;
446 icq->flags &= ~ICQ_CHANGED_MASK;
447 spin_unlock_irqrestore(&icq->ioc->lock, flags);
448 }
449 return changed;
450}
451EXPORT_SYMBOL(icq_get_changed);
452
453static int __init blk_ioc_init(void) 397static int __init blk_ioc_init(void)
454{ 398{
455 iocontext_cachep = kmem_cache_create("blkdev_ioc", 399 iocontext_cachep = kmem_cache_create("blkdev_ioc",
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index cf150011d808..aa41b47c22d2 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -9,6 +9,7 @@
9#include <linux/blktrace_api.h> 9#include <linux/blktrace_api.h>
10 10
11#include "blk.h" 11#include "blk.h"
12#include "blk-cgroup.h"
12 13
13struct queue_sysfs_entry { 14struct queue_sysfs_entry {
14 struct attribute attr; 15 struct attribute attr;
@@ -479,6 +480,8 @@ static void blk_release_queue(struct kobject *kobj)
479 480
480 blk_sync_queue(q); 481 blk_sync_queue(q);
481 482
483 blkcg_exit_queue(q);
484
482 if (q->elevator) { 485 if (q->elevator) {
483 spin_lock_irq(q->queue_lock); 486 spin_lock_irq(q->queue_lock);
484 ioc_clear_queue(q); 487 ioc_clear_queue(q);
@@ -486,15 +489,12 @@ static void blk_release_queue(struct kobject *kobj)
486 elevator_exit(q->elevator); 489 elevator_exit(q->elevator);
487 } 490 }
488 491
489 blk_throtl_exit(q);
490
491 if (rl->rq_pool) 492 if (rl->rq_pool)
492 mempool_destroy(rl->rq_pool); 493 mempool_destroy(rl->rq_pool);
493 494
494 if (q->queue_tags) 495 if (q->queue_tags)
495 __blk_queue_free_tags(q); 496 __blk_queue_free_tags(q);
496 497
497 blk_throtl_release(q);
498 blk_trace_shutdown(q); 498 blk_trace_shutdown(q);
499 499
500 bdi_destroy(&q->backing_dev_info); 500 bdi_destroy(&q->backing_dev_info);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index f2ddb94626bd..5b0659512047 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -21,6 +21,8 @@ static int throtl_quantum = 32;
21/* Throttling is performed over 100ms slice and after that slice is renewed */ 21/* Throttling is performed over 100ms slice and after that slice is renewed */
22static unsigned long throtl_slice = HZ/10; /* 100 ms */ 22static unsigned long throtl_slice = HZ/10; /* 100 ms */
23 23
24static struct blkcg_policy blkcg_policy_throtl;
25
24/* A workqueue to queue throttle related work */ 26/* A workqueue to queue throttle related work */
25static struct workqueue_struct *kthrotld_workqueue; 27static struct workqueue_struct *kthrotld_workqueue;
26static void throtl_schedule_delayed_work(struct throtl_data *td, 28static void throtl_schedule_delayed_work(struct throtl_data *td,
@@ -38,9 +40,17 @@ struct throtl_rb_root {
38 40
39#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) 41#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
40 42
43/* Per-cpu group stats */
44struct tg_stats_cpu {
45 /* total bytes transferred */
46 struct blkg_rwstat service_bytes;
47 /* total IOs serviced, post merge */
48 struct blkg_rwstat serviced;
49};
50
41struct throtl_grp { 51struct throtl_grp {
42 /* List of throtl groups on the request queue*/ 52 /* must be the first member */
43 struct hlist_node tg_node; 53 struct blkg_policy_data pd;
44 54
45 /* active throtl group service_tree member */ 55 /* active throtl group service_tree member */
46 struct rb_node rb_node; 56 struct rb_node rb_node;
@@ -52,8 +62,6 @@ struct throtl_grp {
52 */ 62 */
53 unsigned long disptime; 63 unsigned long disptime;
54 64
55 struct blkio_group blkg;
56 atomic_t ref;
57 unsigned int flags; 65 unsigned int flags;
58 66
59 /* Two lists for READ and WRITE */ 67 /* Two lists for READ and WRITE */
@@ -80,18 +88,18 @@ struct throtl_grp {
80 /* Some throttle limits got updated for the group */ 88 /* Some throttle limits got updated for the group */
81 int limits_changed; 89 int limits_changed;
82 90
83 struct rcu_head rcu_head; 91 /* Per cpu stats pointer */
92 struct tg_stats_cpu __percpu *stats_cpu;
93
94 /* List of tgs waiting for per cpu stats memory to be allocated */
95 struct list_head stats_alloc_node;
84}; 96};
85 97
86struct throtl_data 98struct throtl_data
87{ 99{
88 /* List of throtl groups */
89 struct hlist_head tg_list;
90
91 /* service tree for active throtl groups */ 100 /* service tree for active throtl groups */
92 struct throtl_rb_root tg_service_tree; 101 struct throtl_rb_root tg_service_tree;
93 102
94 struct throtl_grp *root_tg;
95 struct request_queue *queue; 103 struct request_queue *queue;
96 104
97 /* Total Number of queued bios on READ and WRITE lists */ 105 /* Total Number of queued bios on READ and WRITE lists */
@@ -108,6 +116,33 @@ struct throtl_data
108 int limits_changed; 116 int limits_changed;
109}; 117};
110 118
119/* list and work item to allocate percpu group stats */
120static DEFINE_SPINLOCK(tg_stats_alloc_lock);
121static LIST_HEAD(tg_stats_alloc_list);
122
123static void tg_stats_alloc_fn(struct work_struct *);
124static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);
125
126static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
127{
128 return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
129}
130
131static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
132{
133 return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
134}
135
136static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
137{
138 return pd_to_blkg(&tg->pd);
139}
140
141static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
142{
143 return blkg_to_tg(td->queue->root_blkg);
144}
145
111enum tg_state_flags { 146enum tg_state_flags {
112 THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */ 147 THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */
113}; 148};
@@ -128,244 +163,150 @@ static inline int throtl_tg_##name(const struct throtl_grp *tg) \
128 163
129THROTL_TG_FNS(on_rr); 164THROTL_TG_FNS(on_rr);
130 165
131#define throtl_log_tg(td, tg, fmt, args...) \ 166#define throtl_log_tg(td, tg, fmt, args...) do { \
132 blk_add_trace_msg((td)->queue, "throtl %s " fmt, \ 167 char __pbuf[128]; \
133 blkg_path(&(tg)->blkg), ##args); \ 168 \
169 blkg_path(tg_to_blkg(tg), __pbuf, sizeof(__pbuf)); \
170 blk_add_trace_msg((td)->queue, "throtl %s " fmt, __pbuf, ##args); \
171} while (0)
134 172
135#define throtl_log(td, fmt, args...) \ 173#define throtl_log(td, fmt, args...) \
136 blk_add_trace_msg((td)->queue, "throtl " fmt, ##args) 174 blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
137 175
138static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg)
139{
140 if (blkg)
141 return container_of(blkg, struct throtl_grp, blkg);
142
143 return NULL;
144}
145
146static inline unsigned int total_nr_queued(struct throtl_data *td) 176static inline unsigned int total_nr_queued(struct throtl_data *td)
147{ 177{
148 return td->nr_queued[0] + td->nr_queued[1]; 178 return td->nr_queued[0] + td->nr_queued[1];
149} 179}
150 180
151static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg) 181/*
152{ 182 * Worker for allocating per cpu stat for tgs. This is scheduled on the
153 atomic_inc(&tg->ref); 183 * system_nrt_wq once there are some groups on the alloc_list waiting for
154 return tg; 184 * allocation.
155} 185 */
156 186static void tg_stats_alloc_fn(struct work_struct *work)
157static void throtl_free_tg(struct rcu_head *head)
158{ 187{
159 struct throtl_grp *tg; 188 static struct tg_stats_cpu *stats_cpu; /* this fn is non-reentrant */
189 struct delayed_work *dwork = to_delayed_work(work);
190 bool empty = false;
191
192alloc_stats:
193 if (!stats_cpu) {
194 stats_cpu = alloc_percpu(struct tg_stats_cpu);
195 if (!stats_cpu) {
196 /* allocation failed, try again after some time */
197 queue_delayed_work(system_nrt_wq, dwork,
198 msecs_to_jiffies(10));
199 return;
200 }
201 }
160 202
161 tg = container_of(head, struct throtl_grp, rcu_head); 203 spin_lock_irq(&tg_stats_alloc_lock);
162 free_percpu(tg->blkg.stats_cpu);
163 kfree(tg);
164}
165 204
166static void throtl_put_tg(struct throtl_grp *tg) 205 if (!list_empty(&tg_stats_alloc_list)) {
167{ 206 struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list,
168 BUG_ON(atomic_read(&tg->ref) <= 0); 207 struct throtl_grp,
169 if (!atomic_dec_and_test(&tg->ref)) 208 stats_alloc_node);
170 return; 209 swap(tg->stats_cpu, stats_cpu);
210 list_del_init(&tg->stats_alloc_node);
211 }
171 212
172 /* 213 empty = list_empty(&tg_stats_alloc_list);
173 * A group is freed in rcu manner. But having an rcu lock does not 214 spin_unlock_irq(&tg_stats_alloc_lock);
174 * mean that one can access all the fields of blkg and assume these 215 if (!empty)
175 * are valid. For example, don't try to follow throtl_data and 216 goto alloc_stats;
176 * request queue links.
177 *
178 * Having a reference to blkg under an rcu allows acess to only
179 * values local to groups like group stats and group rate limits
180 */
181 call_rcu(&tg->rcu_head, throtl_free_tg);
182} 217}
183 218
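tg_stats_alloc_fn() exists because the percpu allocator can block and therefore cannot run from the IO path: groups park themselves on tg_stats_alloc_list and a delayed work item fills in the stats later, retrying on failure. A rough userspace sketch of the same defer-to-worker pattern, with a plain singly linked list and a loop standing in for the workqueue (all names illustrative):

#include <stdio.h>
#include <stdlib.h>

struct group {
	struct group *next;	/* stats_alloc_node analogue */
	int *stats;		/* NULL until the worker fills it in */
};

static struct group *alloc_list;	/* groups waiting for stats */

/* "IO path": may not call a blocking allocator, so just queue the group. */
static void group_init(struct group *g)
{
	g->stats = NULL;
	g->next = alloc_list;
	alloc_list = g;
}

/* "Worker": allowed to allocate; keeps going until the list drains. */
static void stats_alloc_worker(void)
{
	while (alloc_list) {
		int *stats = calloc(4, sizeof(*stats));
		struct group *g;

		if (!stats)
			continue;	/* real code re-queues itself with a delay */
		g = alloc_list;
		alloc_list = g->next;
		g->stats = stats;
	}
}

int main(void)
{
	struct group a, b;

	group_init(&a);
	group_init(&b);
	stats_alloc_worker();
	printf("a.stats=%p b.stats=%p\n", (void *)a.stats, (void *)b.stats);
	return 0;
}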
184static void throtl_init_group(struct throtl_grp *tg) 219static void throtl_pd_init(struct blkcg_gq *blkg)
185{ 220{
186 INIT_HLIST_NODE(&tg->tg_node); 221 struct throtl_grp *tg = blkg_to_tg(blkg);
222 unsigned long flags;
223
187 RB_CLEAR_NODE(&tg->rb_node); 224 RB_CLEAR_NODE(&tg->rb_node);
188 bio_list_init(&tg->bio_lists[0]); 225 bio_list_init(&tg->bio_lists[0]);
189 bio_list_init(&tg->bio_lists[1]); 226 bio_list_init(&tg->bio_lists[1]);
190 tg->limits_changed = false; 227 tg->limits_changed = false;
191 228
192 /* Practically unlimited BW */ 229 tg->bps[READ] = -1;
193 tg->bps[0] = tg->bps[1] = -1; 230 tg->bps[WRITE] = -1;
194 tg->iops[0] = tg->iops[1] = -1; 231 tg->iops[READ] = -1;
232 tg->iops[WRITE] = -1;
195 233
196 /* 234 /*
197 * Take the initial reference that will be released on destroy 235 * Ugh... We need to perform per-cpu allocation for tg->stats_cpu
198 * This can be thought of a joint reference by cgroup and 236 * but percpu allocator can't be called from IO path. Queue tg on
199 * request queue which will be dropped by either request queue 237 * tg_stats_alloc_list and allocate from work item.
200 * exit or cgroup deletion path depending on who is exiting first.
201 */ 238 */
202 atomic_set(&tg->ref, 1); 239 spin_lock_irqsave(&tg_stats_alloc_lock, flags);
240 list_add(&tg->stats_alloc_node, &tg_stats_alloc_list);
241 queue_delayed_work(system_nrt_wq, &tg_stats_alloc_work, 0);
242 spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
203} 243}
204 244
205/* Should be called with rcu read lock held (needed for blkcg) */ 245static void throtl_pd_exit(struct blkcg_gq *blkg)
206static void
207throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg)
208{ 246{
209 hlist_add_head(&tg->tg_node, &td->tg_list); 247 struct throtl_grp *tg = blkg_to_tg(blkg);
210 td->nr_undestroyed_grps++; 248 unsigned long flags;
211}
212
213static void
214__throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
215{
216 struct backing_dev_info *bdi = &td->queue->backing_dev_info;
217 unsigned int major, minor;
218
219 if (!tg || tg->blkg.dev)
220 return;
221
222 /*
223 * Fill in device details for a group which might not have been
224 * filled at group creation time as queue was being instantiated
225 * and driver had not attached a device yet
226 */
227 if (bdi->dev && dev_name(bdi->dev)) {
228 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
229 tg->blkg.dev = MKDEV(major, minor);
230 }
231}
232
233/*
234 * Should be called with without queue lock held. Here queue lock will be
235 * taken rarely. It will be taken only once during life time of a group
236 * if need be
237 */
238static void
239throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
240{
241 if (!tg || tg->blkg.dev)
242 return;
243
244 spin_lock_irq(td->queue->queue_lock);
245 __throtl_tg_fill_dev_details(td, tg);
246 spin_unlock_irq(td->queue->queue_lock);
247}
248
249static void throtl_init_add_tg_lists(struct throtl_data *td,
250 struct throtl_grp *tg, struct blkio_cgroup *blkcg)
251{
252 __throtl_tg_fill_dev_details(td, tg);
253
254 /* Add group onto cgroup list */
255 blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td,
256 tg->blkg.dev, BLKIO_POLICY_THROTL);
257 249
258 tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev); 250 spin_lock_irqsave(&tg_stats_alloc_lock, flags);
259 tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev); 251 list_del_init(&tg->stats_alloc_node);
260 tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev); 252 spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
261 tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev);
262 253
263 throtl_add_group_to_td_list(td, tg); 254 free_percpu(tg->stats_cpu);
264} 255}
265 256
266/* Should be called without queue lock and outside of rcu period */ 257static void throtl_pd_reset_stats(struct blkcg_gq *blkg)
267static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td)
268{ 258{
269 struct throtl_grp *tg = NULL; 259 struct throtl_grp *tg = blkg_to_tg(blkg);
270 int ret; 260 int cpu;
271 261
272 tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node); 262 if (tg->stats_cpu == NULL)
273 if (!tg) 263 return;
274 return NULL;
275 264
276 ret = blkio_alloc_blkg_stats(&tg->blkg); 265 for_each_possible_cpu(cpu) {
266 struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
277 267
278 if (ret) { 268 blkg_rwstat_reset(&sc->service_bytes);
279 kfree(tg); 269 blkg_rwstat_reset(&sc->serviced);
280 return NULL;
281 } 270 }
282
283 throtl_init_group(tg);
284 return tg;
285} 271}
286 272
287static struct 273static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td,
288throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg) 274 struct blkcg *blkcg)
289{ 275{
290 struct throtl_grp *tg = NULL;
291 void *key = td;
292
293 /* 276 /*
294 * This is the common case when there are no blkio cgroups. 277 * This is the common case when there are no blkcgs. Avoid lookup
295 * Avoid lookup in this case 278 * in this case
296 */ 279 */
297 if (blkcg == &blkio_root_cgroup) 280 if (blkcg == &blkcg_root)
298 tg = td->root_tg; 281 return td_root_tg(td);
299 else
300 tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
301 282
302 __throtl_tg_fill_dev_details(td, tg); 283 return blkg_to_tg(blkg_lookup(blkcg, td->queue));
303 return tg;
304} 284}
305 285
306static struct throtl_grp * throtl_get_tg(struct throtl_data *td) 286static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
287 struct blkcg *blkcg)
307{ 288{
308 struct throtl_grp *tg = NULL, *__tg = NULL;
309 struct blkio_cgroup *blkcg;
310 struct request_queue *q = td->queue; 289 struct request_queue *q = td->queue;
311 290 struct throtl_grp *tg = NULL;
312 /* no throttling for dead queue */
313 if (unlikely(blk_queue_dead(q)))
314 return NULL;
315
316 rcu_read_lock();
317 blkcg = task_blkio_cgroup(current);
318 tg = throtl_find_tg(td, blkcg);
319 if (tg) {
320 rcu_read_unlock();
321 return tg;
322 }
323
324 /*
325 * Need to allocate a group. Allocation of group also needs allocation
326 * of per cpu stats which in-turn takes a mutex() and can block. Hence
327 * we need to drop rcu lock and queue_lock before we call alloc.
328 */
329 rcu_read_unlock();
330 spin_unlock_irq(q->queue_lock);
331
332 tg = throtl_alloc_tg(td);
333
334 /* Group allocated and queue is still alive. take the lock */
335 spin_lock_irq(q->queue_lock);
336
337 /* Make sure @q is still alive */
338 if (unlikely(blk_queue_dead(q))) {
339 kfree(tg);
340 return NULL;
341 }
342
343 /*
344 * Initialize the new group. After sleeping, read the blkcg again.
345 */
346 rcu_read_lock();
347 blkcg = task_blkio_cgroup(current);
348 291
349 /* 292 /*
350 * If some other thread already allocated the group while we were 293 * This is the common case when there are no blkcgs. Avoid lookup
351 * not holding queue lock, free up the group 294 * in this case
352 */ 295 */
353 __tg = throtl_find_tg(td, blkcg); 296 if (blkcg == &blkcg_root) {
354 297 tg = td_root_tg(td);
355 if (__tg) { 298 } else {
356 kfree(tg); 299 struct blkcg_gq *blkg;
357 rcu_read_unlock(); 300
358 return __tg; 301 blkg = blkg_lookup_create(blkcg, q);
359 } 302
360 303 /* if %NULL and @q is alive, fall back to root_tg */
361 /* Group allocation failed. Account the IO to root group */ 304 if (!IS_ERR(blkg))
362 if (!tg) { 305 tg = blkg_to_tg(blkg);
363 tg = td->root_tg; 306 else if (!blk_queue_dead(q))
364 return tg; 307 tg = td_root_tg(td);
365 } 308 }
366 309
367 throtl_init_add_tg_lists(td, tg, blkcg);
368 rcu_read_unlock();
369 return tg; 310 return tg;
370} 311}
371 312
@@ -734,16 +675,41 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
734 return 0; 675 return 0;
735} 676}
736 677
678static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes,
679 int rw)
680{
681 struct throtl_grp *tg = blkg_to_tg(blkg);
682 struct tg_stats_cpu *stats_cpu;
683 unsigned long flags;
684
685 /* If per cpu stats are not allocated yet, don't do any accounting. */
686 if (tg->stats_cpu == NULL)
687 return;
688
689 /*
690 * Disabling interrupts to provide mutual exclusion between two
691 * writes on same cpu. It probably is not needed for 64bit. Not
692 * optimizing that case yet.
693 */
694 local_irq_save(flags);
695
696 stats_cpu = this_cpu_ptr(tg->stats_cpu);
697
698 blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
699 blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);
700
701 local_irq_restore(flags);
702}
703
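throtl_update_dispatch_stats() bumps counters in a per-cpu slot with interrupts disabled, so two updates on the same CPU cannot interleave, and readers later fold all slots into one total (see tg_prfill_cpu_rwstat below). A loose userspace analogue gives each worker thread its own slot so updates need no lock, and sums the slots on the read side (pthread-based, illustrative only):

#include <pthread.h>
#include <stdio.h>

#define NR_SLOTS 4

struct slot {
	unsigned long serviced;
	unsigned long service_bytes;
};

static struct slot slots[NR_SLOTS];	/* one "cpu" slot per worker */

static void *worker(void *arg)
{
	struct slot *s = &slots[(long)arg];
	int i;

	for (i = 0; i < 1000; i++) {
		/* Only this thread writes this slot, so no lock is needed,
		 * mirroring the irq-off per-cpu update in the kernel code. */
		s->serviced += 1;
		s->service_bytes += 4096;
	}
	return NULL;
}

int main(void)
{
	pthread_t tids[NR_SLOTS];
	unsigned long ios = 0, bytes = 0;
	long i;

	for (i = 0; i < NR_SLOTS; i++)
		pthread_create(&tids[i], NULL, worker, (void *)i);
	for (i = 0; i < NR_SLOTS; i++)
		pthread_join(tids[i], NULL);

	/* Read side: fold all slots into a total, like tg_prfill_cpu_rwstat(). */
	for (i = 0; i < NR_SLOTS; i++) {
		ios += slots[i].serviced;
		bytes += slots[i].service_bytes;
	}
	printf("ios=%lu bytes=%lu\n", ios, bytes);
	return 0;
}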
737static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) 704static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
738{ 705{
739 bool rw = bio_data_dir(bio); 706 bool rw = bio_data_dir(bio);
740 bool sync = rw_is_sync(bio->bi_rw);
741 707
742 /* Charge the bio to the group */ 708 /* Charge the bio to the group */
743 tg->bytes_disp[rw] += bio->bi_size; 709 tg->bytes_disp[rw] += bio->bi_size;
744 tg->io_disp[rw]++; 710 tg->io_disp[rw]++;
745 711
746 blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync); 712 throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw);
747} 713}
748 714
749static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, 715static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
@@ -753,7 +719,7 @@ static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
753 719
754 bio_list_add(&tg->bio_lists[rw], bio); 720 bio_list_add(&tg->bio_lists[rw], bio);
755 /* Take a bio reference on tg */ 721 /* Take a bio reference on tg */
756 throtl_ref_get_tg(tg); 722 blkg_get(tg_to_blkg(tg));
757 tg->nr_queued[rw]++; 723 tg->nr_queued[rw]++;
758 td->nr_queued[rw]++; 724 td->nr_queued[rw]++;
759 throtl_enqueue_tg(td, tg); 725 throtl_enqueue_tg(td, tg);
@@ -786,8 +752,8 @@ static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
786 752
787 bio = bio_list_pop(&tg->bio_lists[rw]); 753 bio = bio_list_pop(&tg->bio_lists[rw]);
788 tg->nr_queued[rw]--; 754 tg->nr_queued[rw]--;
789 /* Drop bio reference on tg */ 755 /* Drop bio reference on blkg */
790 throtl_put_tg(tg); 756 blkg_put(tg_to_blkg(tg));
791 757
792 BUG_ON(td->nr_queued[rw] <= 0); 758 BUG_ON(td->nr_queued[rw] <= 0);
793 td->nr_queued[rw]--; 759 td->nr_queued[rw]--;
@@ -865,8 +831,8 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
865 831
866static void throtl_process_limit_change(struct throtl_data *td) 832static void throtl_process_limit_change(struct throtl_data *td)
867{ 833{
868 struct throtl_grp *tg; 834 struct request_queue *q = td->queue;
869 struct hlist_node *pos, *n; 835 struct blkcg_gq *blkg, *n;
870 836
871 if (!td->limits_changed) 837 if (!td->limits_changed)
872 return; 838 return;
@@ -875,7 +841,9 @@ static void throtl_process_limit_change(struct throtl_data *td)
875 841
876 throtl_log(td, "limits changed"); 842 throtl_log(td, "limits changed");
877 843
878 hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { 844 list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
845 struct throtl_grp *tg = blkg_to_tg(blkg);
846
879 if (!tg->limits_changed) 847 if (!tg->limits_changed)
880 continue; 848 continue;
881 849
@@ -973,120 +941,159 @@ throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
973 } 941 }
974} 942}
975 943
976static void 944static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
977throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg) 945 struct blkg_policy_data *pd, int off)
978{ 946{
979 /* Something wrong if we are trying to remove same group twice */ 947 struct throtl_grp *tg = pd_to_tg(pd);
980 BUG_ON(hlist_unhashed(&tg->tg_node)); 948 struct blkg_rwstat rwstat = { }, tmp;
949 int i, cpu;
981 950
982 hlist_del_init(&tg->tg_node); 951 for_each_possible_cpu(cpu) {
952 struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
983 953
984 /* 954 tmp = blkg_rwstat_read((void *)sc + off);
985 * Put the reference taken at the time of creation so that when all 955 for (i = 0; i < BLKG_RWSTAT_NR; i++)
986 * queues are gone, group can be destroyed. 956 rwstat.cnt[i] += tmp.cnt[i];
987 */ 957 }
988 throtl_put_tg(tg); 958
989 td->nr_undestroyed_grps--; 959 return __blkg_prfill_rwstat(sf, pd, &rwstat);
990} 960}
991 961
992static void throtl_release_tgs(struct throtl_data *td) 962static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
963 struct seq_file *sf)
993{ 964{
994 struct hlist_node *pos, *n; 965 struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
995 struct throtl_grp *tg;
996 966
997 hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { 967 blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl,
998 /* 968 cft->private, true);
999 * If cgroup removal path got to blk_group first and removed 969 return 0;
1000 * it from cgroup list, then it will take care of destroying
1001 * cfqg also.
1002 */
1003 if (!blkiocg_del_blkio_group(&tg->blkg))
1004 throtl_destroy_tg(td, tg);
1005 }
1006} 970}
1007 971
1008/* 972static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
1009 * Blk cgroup controller notification saying that blkio_group object is being 973 int off)
1010 * delinked as associated cgroup object is going away. That also means that
1011 * no new IO will come in this group. So get rid of this group as soon as
1012 * any pending IO in the group is finished.
1013 *
1014 * This function is called under rcu_read_lock(). key is the rcu protected
1015 * pointer. That means "key" is a valid throtl_data pointer as long as we are
1016 * rcu read lock.
1017 *
1018 * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
1019 * it should not be NULL as even if queue was going away, cgroup deltion
1020 * path got to it first.
1021 */
1022void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg)
1023{ 974{
1024 unsigned long flags; 975 struct throtl_grp *tg = pd_to_tg(pd);
1025 struct throtl_data *td = key; 976 u64 v = *(u64 *)((void *)tg + off);
1026 977
1027 spin_lock_irqsave(td->queue->queue_lock, flags); 978 if (v == -1)
1028 throtl_destroy_tg(td, tg_of_blkg(blkg)); 979 return 0;
1029 spin_unlock_irqrestore(td->queue->queue_lock, flags); 980 return __blkg_prfill_u64(sf, pd, v);
1030} 981}
1031 982
1032static void throtl_update_blkio_group_common(struct throtl_data *td, 983static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
1033 struct throtl_grp *tg) 984 int off)
1034{ 985{
1035 xchg(&tg->limits_changed, true); 986 struct throtl_grp *tg = pd_to_tg(pd);
1036 xchg(&td->limits_changed, true); 987 unsigned int v = *(unsigned int *)((void *)tg + off);
1037 /* Schedule a work now to process the limit change */ 988
1038 throtl_schedule_delayed_work(td, 0); 989 if (v == -1)
990 return 0;
991 return __blkg_prfill_u64(sf, pd, v);
1039} 992}
1040 993
1041/* 994static int tg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft,
1042 * For all update functions, key should be a valid pointer because these 995 struct seq_file *sf)
1043 * update functions are called under blkcg_lock, that means, blkg is
1044 * valid and in turn key is valid. queue exit path can not race because
1045 * of blkcg_lock
1046 *
1047 * Can not take queue lock in update functions as queue lock under blkcg_lock
1048 * is not allowed. Under other paths we take blkcg_lock under queue_lock.
1049 */
1050static void throtl_update_blkio_group_read_bps(void *key,
1051 struct blkio_group *blkg, u64 read_bps)
1052{ 996{
1053 struct throtl_data *td = key; 997 blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_u64,
1054 struct throtl_grp *tg = tg_of_blkg(blkg); 998 &blkcg_policy_throtl, cft->private, false);
1055 999 return 0;
1056 tg->bps[READ] = read_bps;
1057 throtl_update_blkio_group_common(td, tg);
1058} 1000}
1059 1001
1060static void throtl_update_blkio_group_write_bps(void *key, 1002static int tg_print_conf_uint(struct cgroup *cgrp, struct cftype *cft,
1061 struct blkio_group *blkg, u64 write_bps) 1003 struct seq_file *sf)
1062{ 1004{
1063 struct throtl_data *td = key; 1005 blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_uint,
1064 struct throtl_grp *tg = tg_of_blkg(blkg); 1006 &blkcg_policy_throtl, cft->private, false);
1065 1007 return 0;
1066 tg->bps[WRITE] = write_bps;
1067 throtl_update_blkio_group_common(td, tg);
1068} 1008}
1069 1009
1070static void throtl_update_blkio_group_read_iops(void *key, 1010static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
1071 struct blkio_group *blkg, unsigned int read_iops) 1011 bool is_u64)
1072{ 1012{
1073 struct throtl_data *td = key; 1013 struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1074 struct throtl_grp *tg = tg_of_blkg(blkg); 1014 struct blkg_conf_ctx ctx;
1015 struct throtl_grp *tg;
1016 struct throtl_data *td;
1017 int ret;
1018
1019 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
1020 if (ret)
1021 return ret;
1022
1023 tg = blkg_to_tg(ctx.blkg);
1024 td = ctx.blkg->q->td;
1025
1026 if (!ctx.v)
1027 ctx.v = -1;
1028
1029 if (is_u64)
1030 *(u64 *)((void *)tg + cft->private) = ctx.v;
1031 else
1032 *(unsigned int *)((void *)tg + cft->private) = ctx.v;
1033
1034 /* XXX: we don't need the following deferred processing */
1035 xchg(&tg->limits_changed, true);
1036 xchg(&td->limits_changed, true);
1037 throtl_schedule_delayed_work(td, 0);
1075 1038
1076 tg->iops[READ] = read_iops; 1039 blkg_conf_finish(&ctx);
1077 throtl_update_blkio_group_common(td, tg); 1040 return 0;
1078} 1041}
1079 1042
1080static void throtl_update_blkio_group_write_iops(void *key, 1043static int tg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft,
1081 struct blkio_group *blkg, unsigned int write_iops) 1044 const char *buf)
1082{ 1045{
1083 struct throtl_data *td = key; 1046 return tg_set_conf(cgrp, cft, buf, true);
1084 struct throtl_grp *tg = tg_of_blkg(blkg); 1047}
1085 1048
1086 tg->iops[WRITE] = write_iops; 1049static int tg_set_conf_uint(struct cgroup *cgrp, struct cftype *cft,
1087 throtl_update_blkio_group_common(td, tg); 1050 const char *buf)
1051{
1052 return tg_set_conf(cgrp, cft, buf, false);
1088} 1053}
1089 1054
1055static struct cftype throtl_files[] = {
1056 {
1057 .name = "throttle.read_bps_device",
1058 .private = offsetof(struct throtl_grp, bps[READ]),
1059 .read_seq_string = tg_print_conf_u64,
1060 .write_string = tg_set_conf_u64,
1061 .max_write_len = 256,
1062 },
1063 {
1064 .name = "throttle.write_bps_device",
1065 .private = offsetof(struct throtl_grp, bps[WRITE]),
1066 .read_seq_string = tg_print_conf_u64,
1067 .write_string = tg_set_conf_u64,
1068 .max_write_len = 256,
1069 },
1070 {
1071 .name = "throttle.read_iops_device",
1072 .private = offsetof(struct throtl_grp, iops[READ]),
1073 .read_seq_string = tg_print_conf_uint,
1074 .write_string = tg_set_conf_uint,
1075 .max_write_len = 256,
1076 },
1077 {
1078 .name = "throttle.write_iops_device",
1079 .private = offsetof(struct throtl_grp, iops[WRITE]),
1080 .read_seq_string = tg_print_conf_uint,
1081 .write_string = tg_set_conf_uint,
1082 .max_write_len = 256,
1083 },
1084 {
1085 .name = "throttle.io_service_bytes",
1086 .private = offsetof(struct tg_stats_cpu, service_bytes),
1087 .read_seq_string = tg_print_cpu_rwstat,
1088 },
1089 {
1090 .name = "throttle.io_serviced",
1091 .private = offsetof(struct tg_stats_cpu, serviced),
1092 .read_seq_string = tg_print_cpu_rwstat,
1093 },
1094 { } /* terminate */
1095};
1096
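The throtl_files table stores offsetof(struct throtl_grp, ...) in each .private field, and the shared print/set helpers add that offset to the group pointer to reach the right member, so one pair of functions serves all four limit files. A compact sketch of that generic-accessor trick (struct and field names here are made up for the example):

#include <stddef.h>
#include <stdio.h>

struct limits {
	unsigned long long bps[2];	/* [0]=READ, [1]=WRITE */
	unsigned int iops[2];
};

/* One getter/setter pair serves every u64 field, selected by offset. */
static unsigned long long get_u64(struct limits *l, size_t off)
{
	return *(unsigned long long *)((char *)l + off);
}

static void set_u64(struct limits *l, size_t off, unsigned long long v)
{
	*(unsigned long long *)((char *)l + off) = v;
}

int main(void)
{
	struct limits l = { .bps = { -1ULL, -1ULL } };

	set_u64(&l, offsetof(struct limits, bps[1]), 1048576);
	printf("read_bps=%llu write_bps=%llu\n",
	       get_u64(&l, offsetof(struct limits, bps[0])),
	       get_u64(&l, offsetof(struct limits, bps[1])));
	return 0;
}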
1090static void throtl_shutdown_wq(struct request_queue *q) 1097static void throtl_shutdown_wq(struct request_queue *q)
1091{ 1098{
1092 struct throtl_data *td = q->td; 1099 struct throtl_data *td = q->td;
@@ -1094,19 +1101,13 @@ static void throtl_shutdown_wq(struct request_queue *q)
1094 cancel_delayed_work_sync(&td->throtl_work); 1101 cancel_delayed_work_sync(&td->throtl_work);
1095} 1102}
1096 1103
1097static struct blkio_policy_type blkio_policy_throtl = { 1104static struct blkcg_policy blkcg_policy_throtl = {
1098 .ops = { 1105 .pd_size = sizeof(struct throtl_grp),
1099 .blkio_unlink_group_fn = throtl_unlink_blkio_group, 1106 .cftypes = throtl_files,
1100 .blkio_update_group_read_bps_fn = 1107
1101 throtl_update_blkio_group_read_bps, 1108 .pd_init_fn = throtl_pd_init,
1102 .blkio_update_group_write_bps_fn = 1109 .pd_exit_fn = throtl_pd_exit,
1103 throtl_update_blkio_group_write_bps, 1110 .pd_reset_stats_fn = throtl_pd_reset_stats,
1104 .blkio_update_group_read_iops_fn =
1105 throtl_update_blkio_group_read_iops,
1106 .blkio_update_group_write_iops_fn =
1107 throtl_update_blkio_group_write_iops,
1108 },
1109 .plid = BLKIO_POLICY_THROTL,
1110}; 1111};
1111 1112
1112bool blk_throtl_bio(struct request_queue *q, struct bio *bio) 1113bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
@@ -1114,7 +1115,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
1114 struct throtl_data *td = q->td; 1115 struct throtl_data *td = q->td;
1115 struct throtl_grp *tg; 1116 struct throtl_grp *tg;
1116 bool rw = bio_data_dir(bio), update_disptime = true; 1117 bool rw = bio_data_dir(bio), update_disptime = true;
1117 struct blkio_cgroup *blkcg; 1118 struct blkcg *blkcg;
1118 bool throttled = false; 1119 bool throttled = false;
1119 1120
1120 if (bio->bi_rw & REQ_THROTTLED) { 1121 if (bio->bi_rw & REQ_THROTTLED) {
@@ -1122,33 +1123,31 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
1122 goto out; 1123 goto out;
1123 } 1124 }
1124 1125
1126 /* bio_associate_current() needs ioc, try creating */
1127 create_io_context(GFP_ATOMIC, q->node);
1128
1125 /* 1129 /*
1126 * A throtl_grp pointer retrieved under rcu can be used to access 1130 * A throtl_grp pointer retrieved under rcu can be used to access
1127 * basic fields like stats and io rates. If a group has no rules, 1131 * basic fields like stats and io rates. If a group has no rules,
1128 * just update the dispatch stats in lockless manner and return. 1132 * just update the dispatch stats in lockless manner and return.
1129 */ 1133 */
1130
1131 rcu_read_lock(); 1134 rcu_read_lock();
1132 blkcg = task_blkio_cgroup(current); 1135 blkcg = bio_blkcg(bio);
1133 tg = throtl_find_tg(td, blkcg); 1136 tg = throtl_lookup_tg(td, blkcg);
1134 if (tg) { 1137 if (tg) {
1135 throtl_tg_fill_dev_details(td, tg);
1136
1137 if (tg_no_rule_group(tg, rw)) { 1138 if (tg_no_rule_group(tg, rw)) {
1138 blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, 1139 throtl_update_dispatch_stats(tg_to_blkg(tg),
1139 rw, rw_is_sync(bio->bi_rw)); 1140 bio->bi_size, bio->bi_rw);
1140 rcu_read_unlock(); 1141 goto out_unlock_rcu;
1141 goto out;
1142 } 1142 }
1143 } 1143 }
1144 rcu_read_unlock();
1145 1144
1146 /* 1145 /*
1147 * Either group has not been allocated yet or it is not an unlimited 1146 * Either group has not been allocated yet or it is not an unlimited
1148 * IO group 1147 * IO group
1149 */ 1148 */
1150 spin_lock_irq(q->queue_lock); 1149 spin_lock_irq(q->queue_lock);
1151 tg = throtl_get_tg(td); 1150 tg = throtl_lookup_create_tg(td, blkcg);
1152 if (unlikely(!tg)) 1151 if (unlikely(!tg))
1153 goto out_unlock; 1152 goto out_unlock;
1154 1153
@@ -1189,6 +1188,7 @@ queue_bio:
1189 tg->io_disp[rw], tg->iops[rw], 1188 tg->io_disp[rw], tg->iops[rw],
1190 tg->nr_queued[READ], tg->nr_queued[WRITE]); 1189 tg->nr_queued[READ], tg->nr_queued[WRITE]);
1191 1190
1191 bio_associate_current(bio);
1192 throtl_add_bio_tg(q->td, tg, bio); 1192 throtl_add_bio_tg(q->td, tg, bio);
1193 throttled = true; 1193 throttled = true;
1194 1194
@@ -1199,6 +1199,8 @@ queue_bio:
1199 1199
1200out_unlock: 1200out_unlock:
1201 spin_unlock_irq(q->queue_lock); 1201 spin_unlock_irq(q->queue_lock);
1202out_unlock_rcu:
1203 rcu_read_unlock();
1202out: 1204out:
1203 return throttled; 1205 return throttled;
1204} 1206}
@@ -1241,79 +1243,31 @@ void blk_throtl_drain(struct request_queue *q)
1241int blk_throtl_init(struct request_queue *q) 1243int blk_throtl_init(struct request_queue *q)
1242{ 1244{
1243 struct throtl_data *td; 1245 struct throtl_data *td;
1244 struct throtl_grp *tg; 1246 int ret;
1245 1247
1246 td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); 1248 td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
1247 if (!td) 1249 if (!td)
1248 return -ENOMEM; 1250 return -ENOMEM;
1249 1251
1250 INIT_HLIST_HEAD(&td->tg_list);
1251 td->tg_service_tree = THROTL_RB_ROOT; 1252 td->tg_service_tree = THROTL_RB_ROOT;
1252 td->limits_changed = false; 1253 td->limits_changed = false;
1253 INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); 1254 INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
1254 1255
1255 /* alloc and Init root group. */ 1256 q->td = td;
1256 td->queue = q; 1257 td->queue = q;
1257 tg = throtl_alloc_tg(td);
1258 1258
1259 if (!tg) { 1259 /* activate policy */
1260 ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
1261 if (ret)
1260 kfree(td); 1262 kfree(td);
1261 return -ENOMEM; 1263 return ret;
1262 }
1263
1264 td->root_tg = tg;
1265
1266 rcu_read_lock();
1267 throtl_init_add_tg_lists(td, tg, &blkio_root_cgroup);
1268 rcu_read_unlock();
1269
1270 /* Attach throtl data to request queue */
1271 q->td = td;
1272 return 0;
1273} 1264}
1274 1265
1275void blk_throtl_exit(struct request_queue *q) 1266void blk_throtl_exit(struct request_queue *q)
1276{ 1267{
1277 struct throtl_data *td = q->td; 1268 BUG_ON(!q->td);
1278 bool wait = false;
1279
1280 BUG_ON(!td);
1281
1282 throtl_shutdown_wq(q);
1283
1284 spin_lock_irq(q->queue_lock);
1285 throtl_release_tgs(td);
1286
1287 /* If there are other groups */
1288 if (td->nr_undestroyed_grps > 0)
1289 wait = true;
1290
1291 spin_unlock_irq(q->queue_lock);
1292
1293 /*
1294 * Wait for tg->blkg->key accessors to exit their grace periods.
1295 * Do this wait only if there are other undestroyed groups out
1296 * there (other than root group). This can happen if cgroup deletion
1297 * path claimed the responsibility of cleaning up a group before
1298 * queue cleanup code get to the group.
1299 *
1300 * Do not call synchronize_rcu() unconditionally as there are drivers
1301 * which create/delete request queue hundreds of times during scan/boot
1302 * and synchronize_rcu() can take significant time and slow down boot.
1303 */
1304 if (wait)
1305 synchronize_rcu();
1306
1307 /*
1308 * Just being safe to make sure after previous flush if some body did
1309 * update limits through cgroup and another work got queued, cancel
1310 * it.
1311 */
1312 throtl_shutdown_wq(q); 1269 throtl_shutdown_wq(q);
1313} 1270 blkcg_deactivate_policy(q, &blkcg_policy_throtl);
1314
1315void blk_throtl_release(struct request_queue *q)
1316{
1317 kfree(q->td); 1271 kfree(q->td);
1318} 1272}
1319 1273
@@ -1323,8 +1277,7 @@ static int __init throtl_init(void)
1323 if (!kthrotld_workqueue) 1277 if (!kthrotld_workqueue)
1324 panic("Failed to create kthrotld\n"); 1278 panic("Failed to create kthrotld\n");
1325 1279
1326 blkio_policy_register(&blkio_policy_throtl); 1280 return blkcg_policy_register(&blkcg_policy_throtl);
1327 return 0;
1328} 1281}
1329 1282
1330module_init(throtl_init); 1283module_init(throtl_init);
diff --git a/block/blk.h b/block/blk.h
index d45be871329e..85f6ae42f7d3 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -23,7 +23,8 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
23 struct bio *bio); 23 struct bio *bio);
24int blk_rq_append_bio(struct request_queue *q, struct request *rq, 24int blk_rq_append_bio(struct request_queue *q, struct request *rq,
25 struct bio *bio); 25 struct bio *bio);
26void blk_drain_queue(struct request_queue *q, bool drain_all); 26void blk_queue_bypass_start(struct request_queue *q);
27void blk_queue_bypass_end(struct request_queue *q);
27void blk_dequeue_request(struct request *rq); 28void blk_dequeue_request(struct request *rq);
28void __blk_queue_free_tags(struct request_queue *q); 29void __blk_queue_free_tags(struct request_queue *q);
29bool __blk_end_bidi_request(struct request *rq, int error, 30bool __blk_end_bidi_request(struct request *rq, int error,
@@ -144,9 +145,6 @@ void blk_queue_congestion_threshold(struct request_queue *q);
144 145
145int blk_dev_init(void); 146int blk_dev_init(void);
146 147
147void elv_quiesce_start(struct request_queue *q);
148void elv_quiesce_end(struct request_queue *q);
149
150 148
151/* 149/*
152 * Return the threshold (number of used requests) at which the queue is 150 * Return the threshold (number of used requests) at which the queue is
@@ -186,32 +184,30 @@ static inline int blk_do_io_stat(struct request *rq)
186 */ 184 */
187void get_io_context(struct io_context *ioc); 185void get_io_context(struct io_context *ioc);
188struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q); 186struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q);
189struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask); 187struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
188 gfp_t gfp_mask);
190void ioc_clear_queue(struct request_queue *q); 189void ioc_clear_queue(struct request_queue *q);
191 190
192void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_mask, 191int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node);
193 int node);
194 192
195/** 193/**
196 * create_io_context - try to create task->io_context 194 * create_io_context - try to create task->io_context
197 * @task: target task
198 * @gfp_mask: allocation mask 195 * @gfp_mask: allocation mask
199 * @node: allocation node 196 * @node: allocation node
200 * 197 *
201 * If @task->io_context is %NULL, allocate a new io_context and install it. 198 * If %current->io_context is %NULL, allocate a new io_context and install
202 * Returns the current @task->io_context which may be %NULL if allocation 199 * it. Returns the current %current->io_context which may be %NULL if
203 * failed. 200 * allocation failed.
204 * 201 *
205 * Note that this function can't be called with IRQ disabled because 202 * Note that this function can't be called with IRQ disabled because
206 * task_lock which protects @task->io_context is IRQ-unsafe. 203 * task_lock which protects %current->io_context is IRQ-unsafe.
207 */ 204 */
208static inline struct io_context *create_io_context(struct task_struct *task, 205static inline struct io_context *create_io_context(gfp_t gfp_mask, int node)
209 gfp_t gfp_mask, int node)
210{ 206{
211 WARN_ON_ONCE(irqs_disabled()); 207 WARN_ON_ONCE(irqs_disabled());
212 if (unlikely(!task->io_context)) 208 if (unlikely(!current->io_context))
213 create_io_context_slowpath(task, gfp_mask, node); 209 create_task_io_context(current, gfp_mask, node);
214 return task->io_context; 210 return current->io_context;
215} 211}
216 212
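create_io_context() is now a pure fast path: if %current->io_context already exists it is returned immediately, otherwise the call drops into create_task_io_context(), and callers must still tolerate NULL if allocation fails. A tiny sketch of that check-then-slowpath shape, with a thread-local pointer standing in for the task struct (illustrative, not the kernel API):

#include <stdio.h>
#include <stdlib.h>

struct io_context { int nr_batch_requests; };

static _Thread_local struct io_context *cur_ioc;	/* current->io_context */

/* Slow path: actually allocate and install the context. */
static int create_task_io_context(void)
{
	struct io_context *ioc = calloc(1, sizeof(*ioc));

	if (!ioc)
		return -1;		/* -ENOMEM analogue */
	cur_ioc = ioc;
	return 0;
}

/* Fast path: inline-sized check, slow path only when missing. */
static inline struct io_context *create_io_context(void)
{
	if (!cur_ioc)
		create_task_io_context();
	return cur_ioc;			/* may still be NULL on failure */
}

int main(void)
{
	struct io_context *ioc = create_io_context();

	printf("ioc %s\n", ioc ? "created" : "unavailable");
	free(ioc);
	return 0;
}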
217/* 213/*
@@ -222,7 +218,6 @@ extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio);
222extern void blk_throtl_drain(struct request_queue *q); 218extern void blk_throtl_drain(struct request_queue *q);
223extern int blk_throtl_init(struct request_queue *q); 219extern int blk_throtl_init(struct request_queue *q);
224extern void blk_throtl_exit(struct request_queue *q); 220extern void blk_throtl_exit(struct request_queue *q);
225extern void blk_throtl_release(struct request_queue *q);
226#else /* CONFIG_BLK_DEV_THROTTLING */ 221#else /* CONFIG_BLK_DEV_THROTTLING */
227static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio) 222static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
228{ 223{
@@ -231,7 +226,6 @@ static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
231static inline void blk_throtl_drain(struct request_queue *q) { } 226static inline void blk_throtl_drain(struct request_queue *q) { }
232static inline int blk_throtl_init(struct request_queue *q) { return 0; } 227static inline int blk_throtl_init(struct request_queue *q) { return 0; }
233static inline void blk_throtl_exit(struct request_queue *q) { } 228static inline void blk_throtl_exit(struct request_queue *q) { }
234static inline void blk_throtl_release(struct request_queue *q) { }
235#endif /* CONFIG_BLK_DEV_THROTTLING */ 229#endif /* CONFIG_BLK_DEV_THROTTLING */
236 230
237#endif /* BLK_INTERNAL_H */ 231#endif /* BLK_INTERNAL_H */
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 3c38536bd52c..673c977cc2bf 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -15,7 +15,9 @@
15#include <linux/ioprio.h> 15#include <linux/ioprio.h>
16#include <linux/blktrace_api.h> 16#include <linux/blktrace_api.h>
17#include "blk.h" 17#include "blk.h"
18#include "cfq.h" 18#include "blk-cgroup.h"
19
20static struct blkcg_policy blkcg_policy_cfq __maybe_unused;
19 21
20/* 22/*
21 * tunables 23 * tunables
@@ -171,8 +173,53 @@ enum wl_type_t {
171 SYNC_WORKLOAD = 2 173 SYNC_WORKLOAD = 2
172}; 174};
173 175
176struct cfqg_stats {
177#ifdef CONFIG_CFQ_GROUP_IOSCHED
178 /* total bytes transferred */
179 struct blkg_rwstat service_bytes;
180 /* total IOs serviced, post merge */
181 struct blkg_rwstat serviced;
182 /* number of ios merged */
183 struct blkg_rwstat merged;
184 /* total time spent on device in ns, may not be accurate w/ queueing */
185 struct blkg_rwstat service_time;
186 /* total time spent waiting in scheduler queue in ns */
187 struct blkg_rwstat wait_time;
188 /* number of IOs queued up */
189 struct blkg_rwstat queued;
190 /* total sectors transferred */
191 struct blkg_stat sectors;
192 /* total disk time and nr sectors dispatched by this group */
193 struct blkg_stat time;
194#ifdef CONFIG_DEBUG_BLK_CGROUP
195 /* time not charged to this cgroup */
196 struct blkg_stat unaccounted_time;
197 /* sum of number of ios queued across all samples */
198 struct blkg_stat avg_queue_size_sum;
199 /* count of samples taken for average */
200 struct blkg_stat avg_queue_size_samples;
201 /* how many times this group has been removed from service tree */
202 struct blkg_stat dequeue;
203 /* total time spent waiting for it to be assigned a timeslice. */
204 struct blkg_stat group_wait_time;
205 /* time spent idling for this blkcg_gq */
206 struct blkg_stat idle_time;
207 /* total time with empty current active q with other requests queued */
208 struct blkg_stat empty_time;
209 /* fields after this shouldn't be cleared on stat reset */
210 uint64_t start_group_wait_time;
211 uint64_t start_idle_time;
212 uint64_t start_empty_time;
213 uint16_t flags;
214#endif /* CONFIG_DEBUG_BLK_CGROUP */
215#endif /* CONFIG_CFQ_GROUP_IOSCHED */
216};
217
174/* This is per cgroup per device grouping structure */ 218/* This is per cgroup per device grouping structure */
175struct cfq_group { 219struct cfq_group {
220 /* must be the first member */
221 struct blkg_policy_data pd;
222
176 /* group service_tree member */ 223 /* group service_tree member */
177 struct rb_node rb_node; 224 struct rb_node rb_node;
178 225
@@ -180,7 +227,7 @@ struct cfq_group {
180 u64 vdisktime; 227 u64 vdisktime;
181 unsigned int weight; 228 unsigned int weight;
182 unsigned int new_weight; 229 unsigned int new_weight;
183 bool needs_update; 230 unsigned int dev_weight;
184 231
185 /* number of cfqq currently on this group */ 232 /* number of cfqq currently on this group */
186 int nr_cfqq; 233 int nr_cfqq;
@@ -206,20 +253,21 @@ struct cfq_group {
206 unsigned long saved_workload_slice; 253 unsigned long saved_workload_slice;
207 enum wl_type_t saved_workload; 254 enum wl_type_t saved_workload;
208 enum wl_prio_t saved_serving_prio; 255 enum wl_prio_t saved_serving_prio;
209 struct blkio_group blkg; 256
210#ifdef CONFIG_CFQ_GROUP_IOSCHED
211 struct hlist_node cfqd_node;
212 int ref;
213#endif
214 /* number of requests that are on the dispatch list or inside driver */ 257 /* number of requests that are on the dispatch list or inside driver */
215 int dispatched; 258 int dispatched;
216 struct cfq_ttime ttime; 259 struct cfq_ttime ttime;
260 struct cfqg_stats stats;
217}; 261};
218 262
219struct cfq_io_cq { 263struct cfq_io_cq {
220 struct io_cq icq; /* must be the first member */ 264 struct io_cq icq; /* must be the first member */
221 struct cfq_queue *cfqq[2]; 265 struct cfq_queue *cfqq[2];
222 struct cfq_ttime ttime; 266 struct cfq_ttime ttime;
267 int ioprio; /* the current ioprio */
268#ifdef CONFIG_CFQ_GROUP_IOSCHED
269 uint64_t blkcg_id; /* the current blkcg ID */
270#endif
223}; 271};
224 272
225/* 273/*
@@ -229,7 +277,7 @@ struct cfq_data {
229 struct request_queue *queue; 277 struct request_queue *queue;
230 /* Root service tree for cfq_groups */ 278 /* Root service tree for cfq_groups */
231 struct cfq_rb_root grp_service_tree; 279 struct cfq_rb_root grp_service_tree;
232 struct cfq_group root_group; 280 struct cfq_group *root_group;
233 281
234 /* 282 /*
235 * The priority currently being served 283 * The priority currently being served
@@ -303,12 +351,6 @@ struct cfq_data {
303 struct cfq_queue oom_cfqq; 351 struct cfq_queue oom_cfqq;
304 352
305 unsigned long last_delayed_sync; 353 unsigned long last_delayed_sync;
306
307 /* List of cfq groups being managed on this device*/
308 struct hlist_head cfqg_list;
309
310 /* Number of groups which are on blkcg->blkg_list */
311 unsigned int nr_blkcg_linked_grps;
312}; 354};
313 355
314static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); 356static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
@@ -371,21 +413,284 @@ CFQ_CFQQ_FNS(deep);
371CFQ_CFQQ_FNS(wait_busy); 413CFQ_CFQQ_FNS(wait_busy);
372#undef CFQ_CFQQ_FNS 414#undef CFQ_CFQQ_FNS
373 415
416static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
417{
418 return pd ? container_of(pd, struct cfq_group, pd) : NULL;
419}
420
421static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
422{
423 return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
424}
425
426static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
427{
428 return pd_to_blkg(&cfqg->pd);
429}
430
431#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
432
433/* cfqg stats flags */
434enum cfqg_stats_flags {
435 CFQG_stats_waiting = 0,
436 CFQG_stats_idling,
437 CFQG_stats_empty,
438};
439
440#define CFQG_FLAG_FNS(name) \
441static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats) \
442{ \
443 stats->flags |= (1 << CFQG_stats_##name); \
444} \
445static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats) \
446{ \
447 stats->flags &= ~(1 << CFQG_stats_##name); \
448} \
449static inline int cfqg_stats_##name(struct cfqg_stats *stats) \
450{ \
451 return (stats->flags & (1 << CFQG_stats_##name)) != 0; \
452} \
453
454CFQG_FLAG_FNS(waiting)
455CFQG_FLAG_FNS(idling)
456CFQG_FLAG_FNS(empty)
457#undef CFQG_FLAG_FNS
458
459/* This should be called with the queue_lock held. */
460static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats)
461{
462 unsigned long long now;
463
464 if (!cfqg_stats_waiting(stats))
465 return;
466
467 now = sched_clock();
468 if (time_after64(now, stats->start_group_wait_time))
469 blkg_stat_add(&stats->group_wait_time,
470 now - stats->start_group_wait_time);
471 cfqg_stats_clear_waiting(stats);
472}
473
474/* This should be called with the queue_lock held. */
475static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,
476 struct cfq_group *curr_cfqg)
477{
478 struct cfqg_stats *stats = &cfqg->stats;
479
480 if (cfqg_stats_waiting(stats))
481 return;
482 if (cfqg == curr_cfqg)
483 return;
484 stats->start_group_wait_time = sched_clock();
485 cfqg_stats_mark_waiting(stats);
486}
487
488/* This should be called with the queue_lock held. */
489static void cfqg_stats_end_empty_time(struct cfqg_stats *stats)
490{
491 unsigned long long now;
492
493 if (!cfqg_stats_empty(stats))
494 return;
495
496 now = sched_clock();
497 if (time_after64(now, stats->start_empty_time))
498 blkg_stat_add(&stats->empty_time,
499 now - stats->start_empty_time);
500 cfqg_stats_clear_empty(stats);
501}
502
503static void cfqg_stats_update_dequeue(struct cfq_group *cfqg)
504{
505 blkg_stat_add(&cfqg->stats.dequeue, 1);
506}
507
508static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
509{
510 struct cfqg_stats *stats = &cfqg->stats;
511
512 if (blkg_rwstat_sum(&stats->queued))
513 return;
514
515 /*
516 * group is already marked empty. This can happen if cfqq got new
517 * request in parent group and moved to this group while being added
518 * to service tree. Just ignore the event and move on.
519 */
520 if (cfqg_stats_empty(stats))
521 return;
522
523 stats->start_empty_time = sched_clock();
524 cfqg_stats_mark_empty(stats);
525}
526
527static void cfqg_stats_update_idle_time(struct cfq_group *cfqg)
528{
529 struct cfqg_stats *stats = &cfqg->stats;
530
531 if (cfqg_stats_idling(stats)) {
532 unsigned long long now = sched_clock();
533
534 if (time_after64(now, stats->start_idle_time))
535 blkg_stat_add(&stats->idle_time,
536 now - stats->start_idle_time);
537 cfqg_stats_clear_idling(stats);
538 }
539}
540
541static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg)
542{
543 struct cfqg_stats *stats = &cfqg->stats;
544
545 BUG_ON(cfqg_stats_idling(stats));
546
547 stats->start_idle_time = sched_clock();
548 cfqg_stats_mark_idling(stats);
549}
550
551static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)
552{
553 struct cfqg_stats *stats = &cfqg->stats;
554
555 blkg_stat_add(&stats->avg_queue_size_sum,
556 blkg_rwstat_sum(&stats->queued));
557 blkg_stat_add(&stats->avg_queue_size_samples, 1);
558 cfqg_stats_update_group_wait_time(stats);
559}
560
561#else /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
562
563static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, struct cfq_group *curr_cfqg) { }
564static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { }
565static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { }
566static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { }
567static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { }
568static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { }
569static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }
570
571#endif /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
572
374#ifdef CONFIG_CFQ_GROUP_IOSCHED 573#ifdef CONFIG_CFQ_GROUP_IOSCHED
375#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 574
575static inline void cfqg_get(struct cfq_group *cfqg)
576{
577 return blkg_get(cfqg_to_blkg(cfqg));
578}
579
580static inline void cfqg_put(struct cfq_group *cfqg)
581{
582 return blkg_put(cfqg_to_blkg(cfqg));
583}
584
585#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) do { \
586 char __pbuf[128]; \
587 \
588 blkg_path(cfqg_to_blkg((cfqq)->cfqg), __pbuf, sizeof(__pbuf)); \
376 blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ 589 blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
377 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ 590 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
378 blkg_path(&(cfqq)->cfqg->blkg), ##args) 591 __pbuf, ##args); \
592} while (0)
379 593
380#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \ 594#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do { \
381 blk_add_trace_msg((cfqd)->queue, "%s " fmt, \ 595 char __pbuf[128]; \
382 blkg_path(&(cfqg)->blkg), ##args) \ 596 \
597 blkg_path(cfqg_to_blkg(cfqg), __pbuf, sizeof(__pbuf)); \
598 blk_add_trace_msg((cfqd)->queue, "%s " fmt, __pbuf, ##args); \
599} while (0)
600
601static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
602 struct cfq_group *curr_cfqg, int rw)
603{
604 blkg_rwstat_add(&cfqg->stats.queued, rw, 1);
605 cfqg_stats_end_empty_time(&cfqg->stats);
606 cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg);
607}
608
609static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
610 unsigned long time, unsigned long unaccounted_time)
611{
612 blkg_stat_add(&cfqg->stats.time, time);
613#ifdef CONFIG_DEBUG_BLK_CGROUP
614 blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time);
615#endif
616}
617
618static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw)
619{
620 blkg_rwstat_add(&cfqg->stats.queued, rw, -1);
621}
622
623static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw)
624{
625 blkg_rwstat_add(&cfqg->stats.merged, rw, 1);
626}
627
628static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
629 uint64_t bytes, int rw)
630{
631 blkg_stat_add(&cfqg->stats.sectors, bytes >> 9);
632 blkg_rwstat_add(&cfqg->stats.serviced, rw, 1);
633 blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes);
634}
635
636static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
637 uint64_t start_time, uint64_t io_start_time, int rw)
638{
639 struct cfqg_stats *stats = &cfqg->stats;
640 unsigned long long now = sched_clock();
641
642 if (time_after64(now, io_start_time))
643 blkg_rwstat_add(&stats->service_time, rw, now - io_start_time);
644 if (time_after64(io_start_time, start_time))
645 blkg_rwstat_add(&stats->wait_time, rw,
646 io_start_time - start_time);
647}
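The completion hook splits a request's lifetime into the wait and service components tracked per direction. A small worked example with invented timestamps (nanoseconds):

	/*
	 * Illustrative values:
	 *   start_time    = 1,000,000   request queued
	 *   io_start_time = 1,400,000   dispatched to the driver
	 *   now           = 1,900,000   completion
	 * cfqg_stats_update_completion() adds 1,900,000 - 1,400,000 =
	 * 500,000 ns to stats->service_time and 1,400,000 - 1,000,000 =
	 * 400,000 ns to stats->wait_time, in the bucket selected by rw.
	 */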
648
649static void cfq_pd_reset_stats(struct blkcg_gq *blkg)
650{
651 struct cfq_group *cfqg = blkg_to_cfqg(blkg);
652 struct cfqg_stats *stats = &cfqg->stats;
653
654 /* queued stats shouldn't be cleared */
655 blkg_rwstat_reset(&stats->service_bytes);
656 blkg_rwstat_reset(&stats->serviced);
657 blkg_rwstat_reset(&stats->merged);
658 blkg_rwstat_reset(&stats->service_time);
659 blkg_rwstat_reset(&stats->wait_time);
660 blkg_stat_reset(&stats->time);
661#ifdef CONFIG_DEBUG_BLK_CGROUP
662 blkg_stat_reset(&stats->unaccounted_time);
663 blkg_stat_reset(&stats->avg_queue_size_sum);
664 blkg_stat_reset(&stats->avg_queue_size_samples);
665 blkg_stat_reset(&stats->dequeue);
666 blkg_stat_reset(&stats->group_wait_time);
667 blkg_stat_reset(&stats->idle_time);
668 blkg_stat_reset(&stats->empty_time);
669#endif
670}
671
672#else /* CONFIG_CFQ_GROUP_IOSCHED */
673
674static inline void cfqg_get(struct cfq_group *cfqg) { }
675static inline void cfqg_put(struct cfq_group *cfqg) { }
383 676
384#else
385#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 677#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
386 blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) 678 blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
387#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0) 679#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0)
388#endif 680
681static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
682 struct cfq_group *curr_cfqg, int rw) { }
683static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
684 unsigned long time, unsigned long unaccounted_time) { }
685static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { }
686static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { }
687static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
688 uint64_t bytes, int rw) { }
689static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
690 uint64_t start_time, uint64_t io_start_time, int rw) { }
691
692#endif /* CONFIG_CFQ_GROUP_IOSCHED */
693
389#define cfq_log(cfqd, fmt, args...) \ 694#define cfq_log(cfqd, fmt, args...) \
390 blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) 695 blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
391 696
@@ -466,8 +771,9 @@ static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
466} 771}
467 772
468static void cfq_dispatch_insert(struct request_queue *, struct request *); 773static void cfq_dispatch_insert(struct request_queue *, struct request *);
469static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool, 774static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync,
470 struct io_context *, gfp_t); 775 struct cfq_io_cq *cic, struct bio *bio,
776 gfp_t gfp_mask);
471 777
472static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq) 778static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
473{ 779{
@@ -545,7 +851,7 @@ static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
545{ 851{
546 u64 d = delta << CFQ_SERVICE_SHIFT; 852 u64 d = delta << CFQ_SERVICE_SHIFT;
547 853
548 d = d * BLKIO_WEIGHT_DEFAULT; 854 d = d * CFQ_WEIGHT_DEFAULT;
549 do_div(d, cfqg->weight); 855 do_div(d, cfqg->weight);
550 return d; 856 return d;
551} 857}
@@ -872,9 +1178,9 @@ static void
872cfq_update_group_weight(struct cfq_group *cfqg) 1178cfq_update_group_weight(struct cfq_group *cfqg)
873{ 1179{
874 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); 1180 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
875 if (cfqg->needs_update) { 1181 if (cfqg->new_weight) {
876 cfqg->weight = cfqg->new_weight; 1182 cfqg->weight = cfqg->new_weight;
877 cfqg->needs_update = false; 1183 cfqg->new_weight = 0;
878 } 1184 }
879} 1185}
880 1186
@@ -936,7 +1242,7 @@ cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
936 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); 1242 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
937 cfq_group_service_tree_del(st, cfqg); 1243 cfq_group_service_tree_del(st, cfqg);
938 cfqg->saved_workload_slice = 0; 1244 cfqg->saved_workload_slice = 0;
939 cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1); 1245 cfqg_stats_update_dequeue(cfqg);
940} 1246}
941 1247
942static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq, 1248static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
@@ -1008,178 +1314,59 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
1008 "sl_used=%u disp=%u charge=%u iops=%u sect=%lu", 1314 "sl_used=%u disp=%u charge=%u iops=%u sect=%lu",
1009 used_sl, cfqq->slice_dispatch, charge, 1315 used_sl, cfqq->slice_dispatch, charge,
1010 iops_mode(cfqd), cfqq->nr_sectors); 1316 iops_mode(cfqd), cfqq->nr_sectors);
1011 cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl, 1317 cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl);
1012 unaccounted_sl); 1318 cfqg_stats_set_start_empty_time(cfqg);
1013 cfq_blkiocg_set_start_empty_time(&cfqg->blkg);
1014} 1319}
1015 1320
1016#ifdef CONFIG_CFQ_GROUP_IOSCHED 1321/**
1017static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) 1322 * cfq_init_cfqg_base - initialize base part of a cfq_group
1018{ 1323 * @cfqg: cfq_group to initialize
1019 if (blkg) 1324 *
1020 return container_of(blkg, struct cfq_group, blkg); 1325 * Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED
1021 return NULL; 1326 * is enabled or not.
1022}
1023
1024static void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
1025 unsigned int weight)
1026{
1027 struct cfq_group *cfqg = cfqg_of_blkg(blkg);
1028 cfqg->new_weight = weight;
1029 cfqg->needs_update = true;
1030}
1031
1032static void cfq_init_add_cfqg_lists(struct cfq_data *cfqd,
1033 struct cfq_group *cfqg, struct blkio_cgroup *blkcg)
1034{
1035 struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
1036 unsigned int major, minor;
1037
1038 /*
1039 * Add group onto cgroup list. It might happen that bdi->dev is
1040 * not initialized yet. Initialize this new group without major
1041 * and minor info and this info will be filled in once a new thread
1042 * comes for IO.
1043 */
1044 if (bdi->dev) {
1045 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
1046 cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
1047 (void *)cfqd, MKDEV(major, minor));
1048 } else
1049 cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
1050 (void *)cfqd, 0);
1051
1052 cfqd->nr_blkcg_linked_grps++;
1053 cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
1054
1055 /* Add group on cfqd list */
1056 hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
1057}
1058
1059/*
1060 * Should be called from sleepable context. No request queue lock as per
1061 * cpu stats are allocated dynamically and alloc_percpu needs to be called
1062 * from sleepable context.
1063 */ 1327 */
1064static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd) 1328static void cfq_init_cfqg_base(struct cfq_group *cfqg)
1065{ 1329{
1066 struct cfq_group *cfqg = NULL;
1067 int i, j, ret;
1068 struct cfq_rb_root *st; 1330 struct cfq_rb_root *st;
1069 1331 int i, j;
1070 cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
1071 if (!cfqg)
1072 return NULL;
1073 1332
1074 for_each_cfqg_st(cfqg, i, j, st) 1333 for_each_cfqg_st(cfqg, i, j, st)
1075 *st = CFQ_RB_ROOT; 1334 *st = CFQ_RB_ROOT;
1076 RB_CLEAR_NODE(&cfqg->rb_node); 1335 RB_CLEAR_NODE(&cfqg->rb_node);
1077 1336
1078 cfqg->ttime.last_end_request = jiffies; 1337 cfqg->ttime.last_end_request = jiffies;
1079
1080 /*
1081 * Take the initial reference that will be released on destroy
1082 * This can be thought of a joint reference by cgroup and
1083 * elevator which will be dropped by either elevator exit
1084 * or cgroup deletion path depending on who is exiting first.
1085 */
1086 cfqg->ref = 1;
1087
1088 ret = blkio_alloc_blkg_stats(&cfqg->blkg);
1089 if (ret) {
1090 kfree(cfqg);
1091 return NULL;
1092 }
1093
1094 return cfqg;
1095} 1338}
1096 1339
1097static struct cfq_group * 1340#ifdef CONFIG_CFQ_GROUP_IOSCHED
1098cfq_find_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg) 1341static void cfq_pd_init(struct blkcg_gq *blkg)
1099{ 1342{
1100 struct cfq_group *cfqg = NULL; 1343 struct cfq_group *cfqg = blkg_to_cfqg(blkg);
1101 void *key = cfqd;
1102 struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
1103 unsigned int major, minor;
1104
1105 /*
1106 * This is the common case when there are no blkio cgroups.
1107 * Avoid lookup in this case
1108 */
1109 if (blkcg == &blkio_root_cgroup)
1110 cfqg = &cfqd->root_group;
1111 else
1112 cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
1113
1114 if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
1115 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
1116 cfqg->blkg.dev = MKDEV(major, minor);
1117 }
1118 1344
1119 return cfqg; 1345 cfq_init_cfqg_base(cfqg);
1346 cfqg->weight = blkg->blkcg->cfq_weight;
1120} 1347}
1121 1348
1122/* 1349/*
1123 * Search for the cfq group current task belongs to. request_queue lock must 1350 * Search for the cfq group current task belongs to. request_queue lock must
1124 * be held. 1351 * be held.
1125 */ 1352 */
1126static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd) 1353static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
1354 struct blkcg *blkcg)
1127{ 1355{
1128 struct blkio_cgroup *blkcg;
1129 struct cfq_group *cfqg = NULL, *__cfqg = NULL;
1130 struct request_queue *q = cfqd->queue; 1356 struct request_queue *q = cfqd->queue;
1357 struct cfq_group *cfqg = NULL;
1131 1358
1132 rcu_read_lock(); 1359 /* avoid lookup for the common case where there's no blkcg */
1133 blkcg = task_blkio_cgroup(current); 1360 if (blkcg == &blkcg_root) {
1134 cfqg = cfq_find_cfqg(cfqd, blkcg); 1361 cfqg = cfqd->root_group;
1135 if (cfqg) { 1362 } else {
1136 rcu_read_unlock(); 1363 struct blkcg_gq *blkg;
1137 return cfqg;
1138 }
1139
1140 /*
1141 * Need to allocate a group. Allocation of group also needs allocation
1142 * of per cpu stats which in-turn takes a mutex() and can block. Hence
1143 * we need to drop rcu lock and queue_lock before we call alloc.
1144 *
1145 * Not taking any queue reference here and assuming that queue is
1146 * around by the time we return. CFQ queue allocation code does
1147 * the same. It might be racy though.
1148 */
1149
1150 rcu_read_unlock();
1151 spin_unlock_irq(q->queue_lock);
1152
1153 cfqg = cfq_alloc_cfqg(cfqd);
1154
1155 spin_lock_irq(q->queue_lock);
1156
1157 rcu_read_lock();
1158 blkcg = task_blkio_cgroup(current);
1159
1160 /*
1161 * If some other thread already allocated the group while we were
1162 * not holding queue lock, free up the group
1163 */
1164 __cfqg = cfq_find_cfqg(cfqd, blkcg);
1165 1364
1166 if (__cfqg) { 1365 blkg = blkg_lookup_create(blkcg, q);
1167 kfree(cfqg); 1366 if (!IS_ERR(blkg))
1168 rcu_read_unlock(); 1367 cfqg = blkg_to_cfqg(blkg);
1169 return __cfqg;
1170 } 1368 }
1171 1369
1172 if (!cfqg)
1173 cfqg = &cfqd->root_group;
1174
1175 cfq_init_add_cfqg_lists(cfqd, cfqg, blkcg);
1176 rcu_read_unlock();
1177 return cfqg;
1178}
1179
1180static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
1181{
1182 cfqg->ref++;
1183 return cfqg; 1370 return cfqg;
1184} 1371}
1185 1372
@@ -1187,94 +1374,224 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
1187{ 1374{
1188 /* Currently, all async queues are mapped to root group */ 1375 /* Currently, all async queues are mapped to root group */
1189 if (!cfq_cfqq_sync(cfqq)) 1376 if (!cfq_cfqq_sync(cfqq))
1190 cfqg = &cfqq->cfqd->root_group; 1377 cfqg = cfqq->cfqd->root_group;
1191 1378
1192 cfqq->cfqg = cfqg; 1379 cfqq->cfqg = cfqg;
1193 /* cfqq reference on cfqg */ 1380 /* cfqq reference on cfqg */
1194 cfqq->cfqg->ref++; 1381 cfqg_get(cfqg);
1195} 1382}
1196 1383
1197static void cfq_put_cfqg(struct cfq_group *cfqg) 1384static u64 cfqg_prfill_weight_device(struct seq_file *sf,
1385 struct blkg_policy_data *pd, int off)
1198{ 1386{
1199 struct cfq_rb_root *st; 1387 struct cfq_group *cfqg = pd_to_cfqg(pd);
1200 int i, j;
1201 1388
1202 BUG_ON(cfqg->ref <= 0); 1389 if (!cfqg->dev_weight)
1203 cfqg->ref--; 1390 return 0;
1204 if (cfqg->ref) 1391 return __blkg_prfill_u64(sf, pd, cfqg->dev_weight);
1205 return;
1206 for_each_cfqg_st(cfqg, i, j, st)
1207 BUG_ON(!RB_EMPTY_ROOT(&st->rb));
1208 free_percpu(cfqg->blkg.stats_cpu);
1209 kfree(cfqg);
1210} 1392}
1211 1393
1212static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg) 1394static int cfqg_print_weight_device(struct cgroup *cgrp, struct cftype *cft,
1395 struct seq_file *sf)
1213{ 1396{
1214 /* Something wrong if we are trying to remove same group twice */ 1397 blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp),
1215 BUG_ON(hlist_unhashed(&cfqg->cfqd_node)); 1398 cfqg_prfill_weight_device, &blkcg_policy_cfq, 0,
1399 false);
1400 return 0;
1401}
1216 1402
1217 hlist_del_init(&cfqg->cfqd_node); 1403static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft,
1404 struct seq_file *sf)
1405{
1406 seq_printf(sf, "%u\n", cgroup_to_blkcg(cgrp)->cfq_weight);
1407 return 0;
1408}
1218 1409
1219 BUG_ON(cfqd->nr_blkcg_linked_grps <= 0); 1410static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
1220 cfqd->nr_blkcg_linked_grps--; 1411 const char *buf)
1412{
1413 struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1414 struct blkg_conf_ctx ctx;
1415 struct cfq_group *cfqg;
1416 int ret;
1221 1417
1222 /* 1418 ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx);
1223 * Put the reference taken at the time of creation so that when all 1419 if (ret)
1224 * queues are gone, group can be destroyed. 1420 return ret;
1225 */ 1421
1226 cfq_put_cfqg(cfqg); 1422 ret = -EINVAL;
1423 cfqg = blkg_to_cfqg(ctx.blkg);
1424 if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) {
1425 cfqg->dev_weight = ctx.v;
1426 cfqg->new_weight = cfqg->dev_weight ?: blkcg->cfq_weight;
1427 ret = 0;
1428 }
1429
1430 blkg_conf_finish(&ctx);
1431 return ret;
1227} 1432}
1228 1433
1229static void cfq_release_cfq_groups(struct cfq_data *cfqd) 1434static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
1230{ 1435{
1231 struct hlist_node *pos, *n; 1436 struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1232 struct cfq_group *cfqg; 1437 struct blkcg_gq *blkg;
1438 struct hlist_node *n;
1233 1439
1234 hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) { 1440 if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX)
1235 /* 1441 return -EINVAL;
1236 * If cgroup removal path got to blk_group first and removed 1442
1237 * it from cgroup list, then it will take care of destroying 1443 spin_lock_irq(&blkcg->lock);
1238 * cfqg also. 1444 blkcg->cfq_weight = (unsigned int)val;
1239 */ 1445
1240 if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg)) 1446 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
1241 cfq_destroy_cfqg(cfqd, cfqg); 1447 struct cfq_group *cfqg = blkg_to_cfqg(blkg);
1448
1449 if (cfqg && !cfqg->dev_weight)
1450 cfqg->new_weight = blkcg->cfq_weight;
1242 } 1451 }
1452
1453 spin_unlock_irq(&blkcg->lock);
1454 return 0;
1243} 1455}
1244 1456
1245/* 1457static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft,
1246 * Blk cgroup controller notification saying that blkio_group object is being 1458 struct seq_file *sf)
1247 * delinked as associated cgroup object is going away. That also means that
1248 * no new IO will come in this group. So get rid of this group as soon as
1249 * any pending IO in the group is finished.
1250 *
1251 * This function is called under rcu_read_lock(). key is the rcu protected
1252 * pointer. That means "key" is a valid cfq_data pointer as long as we are rcu
1253 * read lock.
1254 *
1255 * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
1256 * it should not be NULL as even if elevator was exiting, cgroup deltion
1257 * path got to it first.
1258 */
1259static void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg)
1260{ 1459{
1261 unsigned long flags; 1460 struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1262 struct cfq_data *cfqd = key;
1263 1461
1264 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 1462 blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkcg_policy_cfq,
1265 cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg)); 1463 cft->private, false);
1266 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 1464 return 0;
1267} 1465}
1268 1466
1269#else /* GROUP_IOSCHED */ 1467static int cfqg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
1270static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd) 1468 struct seq_file *sf)
1271{ 1469{
1272 return &cfqd->root_group; 1470 struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1471
1472 blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkcg_policy_cfq,
1473 cft->private, true);
1474 return 0;
1273} 1475}
1274 1476
1275static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) 1477#ifdef CONFIG_DEBUG_BLK_CGROUP
1478static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
1479 struct blkg_policy_data *pd, int off)
1276{ 1480{
1277 return cfqg; 1481 struct cfq_group *cfqg = pd_to_cfqg(pd);
1482 u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples);
1483 u64 v = 0;
1484
1485 if (samples) {
1486 v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum);
1487 do_div(v, samples);
1488 }
1489 __blkg_prfill_u64(sf, pd, v);
1490 return 0;
1491}
1492
1493/* print avg_queue_size */
1494static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft,
1495 struct seq_file *sf)
1496{
1497 struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1498
1499 blkcg_print_blkgs(sf, blkcg, cfqg_prfill_avg_queue_size,
1500 &blkcg_policy_cfq, 0, false);
1501 return 0;
1502}
1503#endif /* CONFIG_DEBUG_BLK_CGROUP */
1504
1505static struct cftype cfq_blkcg_files[] = {
1506 {
1507 .name = "weight_device",
1508 .read_seq_string = cfqg_print_weight_device,
1509 .write_string = cfqg_set_weight_device,
1510 .max_write_len = 256,
1511 },
1512 {
1513 .name = "weight",
1514 .read_seq_string = cfq_print_weight,
1515 .write_u64 = cfq_set_weight,
1516 },
1517 {
1518 .name = "time",
1519 .private = offsetof(struct cfq_group, stats.time),
1520 .read_seq_string = cfqg_print_stat,
1521 },
1522 {
1523 .name = "sectors",
1524 .private = offsetof(struct cfq_group, stats.sectors),
1525 .read_seq_string = cfqg_print_stat,
1526 },
1527 {
1528 .name = "io_service_bytes",
1529 .private = offsetof(struct cfq_group, stats.service_bytes),
1530 .read_seq_string = cfqg_print_rwstat,
1531 },
1532 {
1533 .name = "io_serviced",
1534 .private = offsetof(struct cfq_group, stats.serviced),
1535 .read_seq_string = cfqg_print_rwstat,
1536 },
1537 {
1538 .name = "io_service_time",
1539 .private = offsetof(struct cfq_group, stats.service_time),
1540 .read_seq_string = cfqg_print_rwstat,
1541 },
1542 {
1543 .name = "io_wait_time",
1544 .private = offsetof(struct cfq_group, stats.wait_time),
1545 .read_seq_string = cfqg_print_rwstat,
1546 },
1547 {
1548 .name = "io_merged",
1549 .private = offsetof(struct cfq_group, stats.merged),
1550 .read_seq_string = cfqg_print_rwstat,
1551 },
1552 {
1553 .name = "io_queued",
1554 .private = offsetof(struct cfq_group, stats.queued),
1555 .read_seq_string = cfqg_print_rwstat,
1556 },
1557#ifdef CONFIG_DEBUG_BLK_CGROUP
1558 {
1559 .name = "avg_queue_size",
1560 .read_seq_string = cfqg_print_avg_queue_size,
1561 },
1562 {
1563 .name = "group_wait_time",
1564 .private = offsetof(struct cfq_group, stats.group_wait_time),
1565 .read_seq_string = cfqg_print_stat,
1566 },
1567 {
1568 .name = "idle_time",
1569 .private = offsetof(struct cfq_group, stats.idle_time),
1570 .read_seq_string = cfqg_print_stat,
1571 },
1572 {
1573 .name = "empty_time",
1574 .private = offsetof(struct cfq_group, stats.empty_time),
1575 .read_seq_string = cfqg_print_stat,
1576 },
1577 {
1578 .name = "dequeue",
1579 .private = offsetof(struct cfq_group, stats.dequeue),
1580 .read_seq_string = cfqg_print_stat,
1581 },
1582 {
1583 .name = "unaccounted_time",
1584 .private = offsetof(struct cfq_group, stats.unaccounted_time),
1585 .read_seq_string = cfqg_print_stat,
1586 },
1587#endif /* CONFIG_DEBUG_BLK_CGROUP */
1588 { } /* terminate */
1589};
1590#else /* GROUP_IOSCHED */
1591static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
1592 struct blkcg *blkcg)
1593{
1594 return cfqd->root_group;
1278} 1595}
1279 1596
1280static inline void 1597static inline void
@@ -1282,9 +1599,6 @@ cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
1282 cfqq->cfqg = cfqg; 1599 cfqq->cfqg = cfqg;
1283} 1600}
1284 1601
1285static void cfq_release_cfq_groups(struct cfq_data *cfqd) {}
1286static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}
1287
1288#endif /* GROUP_IOSCHED */ 1602#endif /* GROUP_IOSCHED */
1289 1603
1290/* 1604/*
@@ -1551,12 +1865,10 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
1551{ 1865{
1552 elv_rb_del(&cfqq->sort_list, rq); 1866 elv_rb_del(&cfqq->sort_list, rq);
1553 cfqq->queued[rq_is_sync(rq)]--; 1867 cfqq->queued[rq_is_sync(rq)]--;
1554 cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, 1868 cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
1555 rq_data_dir(rq), rq_is_sync(rq));
1556 cfq_add_rq_rb(rq); 1869 cfq_add_rq_rb(rq);
1557 cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg, 1870 cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group,
1558 &cfqq->cfqd->serving_group->blkg, rq_data_dir(rq), 1871 rq->cmd_flags);
1559 rq_is_sync(rq));
1560} 1872}
1561 1873
1562static struct request * 1874static struct request *
@@ -1612,8 +1924,7 @@ static void cfq_remove_request(struct request *rq)
1612 cfq_del_rq_rb(rq); 1924 cfq_del_rq_rb(rq);
1613 1925
1614 cfqq->cfqd->rq_queued--; 1926 cfqq->cfqd->rq_queued--;
1615 cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, 1927 cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
1616 rq_data_dir(rq), rq_is_sync(rq));
1617 if (rq->cmd_flags & REQ_PRIO) { 1928 if (rq->cmd_flags & REQ_PRIO) {
1618 WARN_ON(!cfqq->prio_pending); 1929 WARN_ON(!cfqq->prio_pending);
1619 cfqq->prio_pending--; 1930 cfqq->prio_pending--;
@@ -1648,8 +1959,7 @@ static void cfq_merged_request(struct request_queue *q, struct request *req,
1648static void cfq_bio_merged(struct request_queue *q, struct request *req, 1959static void cfq_bio_merged(struct request_queue *q, struct request *req,
1649 struct bio *bio) 1960 struct bio *bio)
1650{ 1961{
1651 cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg, 1962 cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_rw);
1652 bio_data_dir(bio), cfq_bio_sync(bio));
1653} 1963}
1654 1964
1655static void 1965static void
@@ -1671,8 +1981,7 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
1671 if (cfqq->next_rq == next) 1981 if (cfqq->next_rq == next)
1672 cfqq->next_rq = rq; 1982 cfqq->next_rq = rq;
1673 cfq_remove_request(next); 1983 cfq_remove_request(next);
1674 cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg, 1984 cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags);
1675 rq_data_dir(next), rq_is_sync(next));
1676 1985
1677 cfqq = RQ_CFQQ(next); 1986 cfqq = RQ_CFQQ(next);
1678 /* 1987 /*
@@ -1713,7 +2022,7 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq,
1713static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2022static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1714{ 2023{
1715 del_timer(&cfqd->idle_slice_timer); 2024 del_timer(&cfqd->idle_slice_timer);
1716 cfq_blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg); 2025 cfqg_stats_update_idle_time(cfqq->cfqg);
1717} 2026}
1718 2027
1719static void __cfq_set_active_queue(struct cfq_data *cfqd, 2028static void __cfq_set_active_queue(struct cfq_data *cfqd,
@@ -1722,7 +2031,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
1722 if (cfqq) { 2031 if (cfqq) {
1723 cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d", 2032 cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
1724 cfqd->serving_prio, cfqd->serving_type); 2033 cfqd->serving_prio, cfqd->serving_type);
1725 cfq_blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg); 2034 cfqg_stats_update_avg_queue_size(cfqq->cfqg);
1726 cfqq->slice_start = 0; 2035 cfqq->slice_start = 0;
1727 cfqq->dispatch_start = jiffies; 2036 cfqq->dispatch_start = jiffies;
1728 cfqq->allocated_slice = 0; 2037 cfqq->allocated_slice = 0;
@@ -2043,7 +2352,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
2043 * task has exited, don't wait 2352 * task has exited, don't wait
2044 */ 2353 */
2045 cic = cfqd->active_cic; 2354 cic = cfqd->active_cic;
2046 if (!cic || !atomic_read(&cic->icq.ioc->nr_tasks)) 2355 if (!cic || !atomic_read(&cic->icq.ioc->active_ref))
2047 return; 2356 return;
2048 2357
2049 /* 2358 /*
@@ -2070,7 +2379,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
2070 sl = cfqd->cfq_slice_idle; 2379 sl = cfqd->cfq_slice_idle;
2071 2380
2072 mod_timer(&cfqd->idle_slice_timer, jiffies + sl); 2381 mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
2073 cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg); 2382 cfqg_stats_set_start_idle_time(cfqq->cfqg);
2074 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl, 2383 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl,
2075 group_idle ? 1 : 0); 2384 group_idle ? 1 : 0);
2076} 2385}
@@ -2093,8 +2402,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
2093 2402
2094 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; 2403 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
2095 cfqq->nr_sectors += blk_rq_sectors(rq); 2404 cfqq->nr_sectors += blk_rq_sectors(rq);
2096 cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq), 2405 cfqg_stats_update_dispatch(cfqq->cfqg, blk_rq_bytes(rq), rq->cmd_flags);
2097 rq_data_dir(rq), rq_is_sync(rq));
2098} 2406}
2099 2407
2100/* 2408/*
@@ -2677,7 +2985,7 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
2677 2985
2678 BUG_ON(cfq_cfqq_on_rr(cfqq)); 2986 BUG_ON(cfq_cfqq_on_rr(cfqq));
2679 kmem_cache_free(cfq_pool, cfqq); 2987 kmem_cache_free(cfq_pool, cfqq);
2680 cfq_put_cfqg(cfqg); 2988 cfqg_put(cfqg);
2681} 2989}
2682 2990
2683static void cfq_put_cooperator(struct cfq_queue *cfqq) 2991static void cfq_put_cooperator(struct cfq_queue *cfqq)
@@ -2736,7 +3044,7 @@ static void cfq_exit_icq(struct io_cq *icq)
2736 } 3044 }
2737} 3045}
2738 3046
2739static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) 3047static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic)
2740{ 3048{
2741 struct task_struct *tsk = current; 3049 struct task_struct *tsk = current;
2742 int ioprio_class; 3050 int ioprio_class;
@@ -2744,7 +3052,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
2744 if (!cfq_cfqq_prio_changed(cfqq)) 3052 if (!cfq_cfqq_prio_changed(cfqq))
2745 return; 3053 return;
2746 3054
2747 ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio); 3055 ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
2748 switch (ioprio_class) { 3056 switch (ioprio_class) {
2749 default: 3057 default:
2750 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); 3058 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
@@ -2756,11 +3064,11 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
2756 cfqq->ioprio_class = task_nice_ioclass(tsk); 3064 cfqq->ioprio_class = task_nice_ioclass(tsk);
2757 break; 3065 break;
2758 case IOPRIO_CLASS_RT: 3066 case IOPRIO_CLASS_RT:
2759 cfqq->ioprio = task_ioprio(ioc); 3067 cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
2760 cfqq->ioprio_class = IOPRIO_CLASS_RT; 3068 cfqq->ioprio_class = IOPRIO_CLASS_RT;
2761 break; 3069 break;
2762 case IOPRIO_CLASS_BE: 3070 case IOPRIO_CLASS_BE:
2763 cfqq->ioprio = task_ioprio(ioc); 3071 cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
2764 cfqq->ioprio_class = IOPRIO_CLASS_BE; 3072 cfqq->ioprio_class = IOPRIO_CLASS_BE;
2765 break; 3073 break;
2766 case IOPRIO_CLASS_IDLE: 3074 case IOPRIO_CLASS_IDLE:
@@ -2778,19 +3086,24 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
2778 cfq_clear_cfqq_prio_changed(cfqq); 3086 cfq_clear_cfqq_prio_changed(cfqq);
2779} 3087}
2780 3088
2781static void changed_ioprio(struct cfq_io_cq *cic) 3089static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio)
2782{ 3090{
3091 int ioprio = cic->icq.ioc->ioprio;
2783 struct cfq_data *cfqd = cic_to_cfqd(cic); 3092 struct cfq_data *cfqd = cic_to_cfqd(cic);
2784 struct cfq_queue *cfqq; 3093 struct cfq_queue *cfqq;
2785 3094
2786 if (unlikely(!cfqd)) 3095 /*
3096 * Check whether ioprio has changed. The condition may trigger
3097 * spuriously on a newly created cic but there's no harm.
3098 */
3099 if (unlikely(!cfqd) || likely(cic->ioprio == ioprio))
2787 return; 3100 return;
2788 3101
2789 cfqq = cic->cfqq[BLK_RW_ASYNC]; 3102 cfqq = cic->cfqq[BLK_RW_ASYNC];
2790 if (cfqq) { 3103 if (cfqq) {
2791 struct cfq_queue *new_cfqq; 3104 struct cfq_queue *new_cfqq;
2792 new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->icq.ioc, 3105 new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio,
2793 GFP_ATOMIC); 3106 GFP_ATOMIC);
2794 if (new_cfqq) { 3107 if (new_cfqq) {
2795 cic->cfqq[BLK_RW_ASYNC] = new_cfqq; 3108 cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
2796 cfq_put_queue(cfqq); 3109 cfq_put_queue(cfqq);
@@ -2800,6 +3113,8 @@ static void changed_ioprio(struct cfq_io_cq *cic)
2800 cfqq = cic->cfqq[BLK_RW_SYNC]; 3113 cfqq = cic->cfqq[BLK_RW_SYNC];
2801 if (cfqq) 3114 if (cfqq)
2802 cfq_mark_cfqq_prio_changed(cfqq); 3115 cfq_mark_cfqq_prio_changed(cfqq);
3116
3117 cic->ioprio = ioprio;
2803} 3118}
2804 3119
2805static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3120static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
@@ -2823,17 +3138,24 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2823} 3138}
2824 3139
2825#ifdef CONFIG_CFQ_GROUP_IOSCHED 3140#ifdef CONFIG_CFQ_GROUP_IOSCHED
2826static void changed_cgroup(struct cfq_io_cq *cic) 3141static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
2827{ 3142{
2828 struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1);
2829 struct cfq_data *cfqd = cic_to_cfqd(cic); 3143 struct cfq_data *cfqd = cic_to_cfqd(cic);
2830 struct request_queue *q; 3144 struct cfq_queue *sync_cfqq;
3145 uint64_t id;
2831 3146
2832 if (unlikely(!cfqd)) 3147 rcu_read_lock();
2833 return; 3148 id = bio_blkcg(bio)->id;
3149 rcu_read_unlock();
2834 3150
2835 q = cfqd->queue; 3151 /*
3152 * Check whether blkcg has changed. The condition may trigger
3153 * spuriously on a newly created cic but there's no harm.
3154 */
3155 if (unlikely(!cfqd) || likely(cic->blkcg_id == id))
3156 return;
2836 3157
3158 sync_cfqq = cic_to_cfqq(cic, 1);
2837 if (sync_cfqq) { 3159 if (sync_cfqq) {
2838 /* 3160 /*
2839 * Drop reference to sync queue. A new sync queue will be 3161 * Drop reference to sync queue. A new sync queue will be
@@ -2843,21 +3165,26 @@ static void changed_cgroup(struct cfq_io_cq *cic)
2843 cic_set_cfqq(cic, NULL, 1); 3165 cic_set_cfqq(cic, NULL, 1);
2844 cfq_put_queue(sync_cfqq); 3166 cfq_put_queue(sync_cfqq);
2845 } 3167 }
3168
3169 cic->blkcg_id = id;
2846} 3170}
3171#else
3172static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { }
2847#endif /* CONFIG_CFQ_GROUP_IOSCHED */ 3173#endif /* CONFIG_CFQ_GROUP_IOSCHED */
2848 3174
2849static struct cfq_queue * 3175static struct cfq_queue *
2850cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, 3176cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
2851 struct io_context *ioc, gfp_t gfp_mask) 3177 struct bio *bio, gfp_t gfp_mask)
2852{ 3178{
3179 struct blkcg *blkcg;
2853 struct cfq_queue *cfqq, *new_cfqq = NULL; 3180 struct cfq_queue *cfqq, *new_cfqq = NULL;
2854 struct cfq_io_cq *cic;
2855 struct cfq_group *cfqg; 3181 struct cfq_group *cfqg;
2856 3182
2857retry: 3183retry:
2858 cfqg = cfq_get_cfqg(cfqd); 3184 rcu_read_lock();
2859 cic = cfq_cic_lookup(cfqd, ioc); 3185
2860 /* cic always exists here */ 3186 blkcg = bio_blkcg(bio);
3187 cfqg = cfq_lookup_create_cfqg(cfqd, blkcg);
2861 cfqq = cic_to_cfqq(cic, is_sync); 3188 cfqq = cic_to_cfqq(cic, is_sync);
2862 3189
2863 /* 3190 /*
@@ -2870,6 +3197,7 @@ retry:
2870 cfqq = new_cfqq; 3197 cfqq = new_cfqq;
2871 new_cfqq = NULL; 3198 new_cfqq = NULL;
2872 } else if (gfp_mask & __GFP_WAIT) { 3199 } else if (gfp_mask & __GFP_WAIT) {
3200 rcu_read_unlock();
2873 spin_unlock_irq(cfqd->queue->queue_lock); 3201 spin_unlock_irq(cfqd->queue->queue_lock);
2874 new_cfqq = kmem_cache_alloc_node(cfq_pool, 3202 new_cfqq = kmem_cache_alloc_node(cfq_pool,
2875 gfp_mask | __GFP_ZERO, 3203 gfp_mask | __GFP_ZERO,
@@ -2885,7 +3213,7 @@ retry:
2885 3213
2886 if (cfqq) { 3214 if (cfqq) {
2887 cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); 3215 cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
2888 cfq_init_prio_data(cfqq, ioc); 3216 cfq_init_prio_data(cfqq, cic);
2889 cfq_link_cfqq_cfqg(cfqq, cfqg); 3217 cfq_link_cfqq_cfqg(cfqq, cfqg);
2890 cfq_log_cfqq(cfqd, cfqq, "alloced"); 3218 cfq_log_cfqq(cfqd, cfqq, "alloced");
2891 } else 3219 } else
@@ -2895,6 +3223,7 @@ retry:
2895 if (new_cfqq) 3223 if (new_cfqq)
2896 kmem_cache_free(cfq_pool, new_cfqq); 3224 kmem_cache_free(cfq_pool, new_cfqq);
2897 3225
3226 rcu_read_unlock();
2898 return cfqq; 3227 return cfqq;
2899} 3228}
2900 3229
@@ -2904,6 +3233,9 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
2904 switch (ioprio_class) { 3233 switch (ioprio_class) {
2905 case IOPRIO_CLASS_RT: 3234 case IOPRIO_CLASS_RT:
2906 return &cfqd->async_cfqq[0][ioprio]; 3235 return &cfqd->async_cfqq[0][ioprio];
3236 case IOPRIO_CLASS_NONE:
3237 ioprio = IOPRIO_NORM;
3238 /* fall through */
2907 case IOPRIO_CLASS_BE: 3239 case IOPRIO_CLASS_BE:
2908 return &cfqd->async_cfqq[1][ioprio]; 3240 return &cfqd->async_cfqq[1][ioprio];
2909 case IOPRIO_CLASS_IDLE: 3241 case IOPRIO_CLASS_IDLE:
@@ -2914,11 +3246,11 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
2914} 3246}
2915 3247
2916static struct cfq_queue * 3248static struct cfq_queue *
2917cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc, 3249cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
2918 gfp_t gfp_mask) 3250 struct bio *bio, gfp_t gfp_mask)
2919{ 3251{
2920 const int ioprio = task_ioprio(ioc); 3252 const int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
2921 const int ioprio_class = task_ioprio_class(ioc); 3253 const int ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
2922 struct cfq_queue **async_cfqq = NULL; 3254 struct cfq_queue **async_cfqq = NULL;
2923 struct cfq_queue *cfqq = NULL; 3255 struct cfq_queue *cfqq = NULL;
2924 3256
@@ -2928,7 +3260,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
2928 } 3260 }
2929 3261
2930 if (!cfqq) 3262 if (!cfqq)
2931 cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask); 3263 cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask);
2932 3264
2933 /* 3265 /*
2934 * pin the queue now that it's allocated, scheduler exit will prune it 3266 * pin the queue now that it's allocated, scheduler exit will prune it
@@ -3010,7 +3342,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3010 3342
3011 if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE)) 3343 if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))
3012 enable_idle = 0; 3344 enable_idle = 0;
3013 else if (!atomic_read(&cic->icq.ioc->nr_tasks) || 3345 else if (!atomic_read(&cic->icq.ioc->active_ref) ||
3014 !cfqd->cfq_slice_idle || 3346 !cfqd->cfq_slice_idle ||
3015 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) 3347 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
3016 enable_idle = 0; 3348 enable_idle = 0;
@@ -3174,8 +3506,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3174 cfq_clear_cfqq_wait_request(cfqq); 3506 cfq_clear_cfqq_wait_request(cfqq);
3175 __blk_run_queue(cfqd->queue); 3507 __blk_run_queue(cfqd->queue);
3176 } else { 3508 } else {
3177 cfq_blkiocg_update_idle_time_stats( 3509 cfqg_stats_update_idle_time(cfqq->cfqg);
3178 &cfqq->cfqg->blkg);
3179 cfq_mark_cfqq_must_dispatch(cfqq); 3510 cfq_mark_cfqq_must_dispatch(cfqq);
3180 } 3511 }
3181 } 3512 }
@@ -3197,14 +3528,13 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
3197 struct cfq_queue *cfqq = RQ_CFQQ(rq); 3528 struct cfq_queue *cfqq = RQ_CFQQ(rq);
3198 3529
3199 cfq_log_cfqq(cfqd, cfqq, "insert_request"); 3530 cfq_log_cfqq(cfqd, cfqq, "insert_request");
3200 cfq_init_prio_data(cfqq, RQ_CIC(rq)->icq.ioc); 3531 cfq_init_prio_data(cfqq, RQ_CIC(rq));
3201 3532
3202 rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); 3533 rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
3203 list_add_tail(&rq->queuelist, &cfqq->fifo); 3534 list_add_tail(&rq->queuelist, &cfqq->fifo);
3204 cfq_add_rq_rb(rq); 3535 cfq_add_rq_rb(rq);
3205 cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg, 3536 cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group,
3206 &cfqd->serving_group->blkg, rq_data_dir(rq), 3537 rq->cmd_flags);
3207 rq_is_sync(rq));
3208 cfq_rq_enqueued(cfqd, cfqq, rq); 3538 cfq_rq_enqueued(cfqd, cfqq, rq);
3209} 3539}
3210 3540
@@ -3300,9 +3630,8 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
3300 cfqd->rq_in_driver--; 3630 cfqd->rq_in_driver--;
3301 cfqq->dispatched--; 3631 cfqq->dispatched--;
3302 (RQ_CFQG(rq))->dispatched--; 3632 (RQ_CFQG(rq))->dispatched--;
3303 cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg, 3633 cfqg_stats_update_completion(cfqq->cfqg, rq_start_time_ns(rq),
3304 rq_start_time_ns(rq), rq_io_start_time_ns(rq), 3634 rq_io_start_time_ns(rq), rq->cmd_flags);
3305 rq_data_dir(rq), rq_is_sync(rq));
3306 3635
3307 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; 3636 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
3308 3637
@@ -3399,7 +3728,7 @@ static int cfq_may_queue(struct request_queue *q, int rw)
3399 3728
3400 cfqq = cic_to_cfqq(cic, rw_is_sync(rw)); 3729 cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
3401 if (cfqq) { 3730 if (cfqq) {
3402 cfq_init_prio_data(cfqq, cic->icq.ioc); 3731 cfq_init_prio_data(cfqq, cic);
3403 3732
3404 return __cfq_may_queue(cfqq); 3733 return __cfq_may_queue(cfqq);
3405 } 3734 }
@@ -3421,7 +3750,7 @@ static void cfq_put_request(struct request *rq)
3421 cfqq->allocated[rw]--; 3750 cfqq->allocated[rw]--;
3422 3751
3423 /* Put down rq reference on cfqg */ 3752 /* Put down rq reference on cfqg */
3424 cfq_put_cfqg(RQ_CFQG(rq)); 3753 cfqg_put(RQ_CFQG(rq));
3425 rq->elv.priv[0] = NULL; 3754 rq->elv.priv[0] = NULL;
3426 rq->elv.priv[1] = NULL; 3755 rq->elv.priv[1] = NULL;
3427 3756
@@ -3465,32 +3794,25 @@ split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq)
3465 * Allocate cfq data structures associated with this request. 3794 * Allocate cfq data structures associated with this request.
3466 */ 3795 */
3467static int 3796static int
3468cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) 3797cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
3798 gfp_t gfp_mask)
3469{ 3799{
3470 struct cfq_data *cfqd = q->elevator->elevator_data; 3800 struct cfq_data *cfqd = q->elevator->elevator_data;
3471 struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq); 3801 struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq);
3472 const int rw = rq_data_dir(rq); 3802 const int rw = rq_data_dir(rq);
3473 const bool is_sync = rq_is_sync(rq); 3803 const bool is_sync = rq_is_sync(rq);
3474 struct cfq_queue *cfqq; 3804 struct cfq_queue *cfqq;
3475 unsigned int changed;
3476 3805
3477 might_sleep_if(gfp_mask & __GFP_WAIT); 3806 might_sleep_if(gfp_mask & __GFP_WAIT);
3478 3807
3479 spin_lock_irq(q->queue_lock); 3808 spin_lock_irq(q->queue_lock);
3480 3809
3481 /* handle changed notifications */ 3810 check_ioprio_changed(cic, bio);
3482 changed = icq_get_changed(&cic->icq); 3811 check_blkcg_changed(cic, bio);
3483 if (unlikely(changed & ICQ_IOPRIO_CHANGED))
3484 changed_ioprio(cic);
3485#ifdef CONFIG_CFQ_GROUP_IOSCHED
3486 if (unlikely(changed & ICQ_CGROUP_CHANGED))
3487 changed_cgroup(cic);
3488#endif
3489
3490new_queue: 3812new_queue:
3491 cfqq = cic_to_cfqq(cic, is_sync); 3813 cfqq = cic_to_cfqq(cic, is_sync);
3492 if (!cfqq || cfqq == &cfqd->oom_cfqq) { 3814 if (!cfqq || cfqq == &cfqd->oom_cfqq) {
3493 cfqq = cfq_get_queue(cfqd, is_sync, cic->icq.ioc, gfp_mask); 3815 cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask);
3494 cic_set_cfqq(cic, cfqq, is_sync); 3816 cic_set_cfqq(cic, cfqq, is_sync);
3495 } else { 3817 } else {
3496 /* 3818 /*
@@ -3516,8 +3838,9 @@ new_queue:
3516 cfqq->allocated[rw]++; 3838 cfqq->allocated[rw]++;
3517 3839
3518 cfqq->ref++; 3840 cfqq->ref++;
3841 cfqg_get(cfqq->cfqg);
3519 rq->elv.priv[0] = cfqq; 3842 rq->elv.priv[0] = cfqq;
3520 rq->elv.priv[1] = cfq_ref_get_cfqg(cfqq->cfqg); 3843 rq->elv.priv[1] = cfqq->cfqg;
3521 spin_unlock_irq(q->queue_lock); 3844 spin_unlock_irq(q->queue_lock);
3522 return 0; 3845 return 0;
3523} 3846}
@@ -3614,7 +3937,6 @@ static void cfq_exit_queue(struct elevator_queue *e)
3614{ 3937{
3615 struct cfq_data *cfqd = e->elevator_data; 3938 struct cfq_data *cfqd = e->elevator_data;
3616 struct request_queue *q = cfqd->queue; 3939 struct request_queue *q = cfqd->queue;
3617 bool wait = false;
3618 3940
3619 cfq_shutdown_timer_wq(cfqd); 3941 cfq_shutdown_timer_wq(cfqd);
3620 3942
@@ -3624,89 +3946,52 @@ static void cfq_exit_queue(struct elevator_queue *e)
3624 __cfq_slice_expired(cfqd, cfqd->active_queue, 0); 3946 __cfq_slice_expired(cfqd, cfqd->active_queue, 0);
3625 3947
3626 cfq_put_async_queues(cfqd); 3948 cfq_put_async_queues(cfqd);
3627 cfq_release_cfq_groups(cfqd);
3628
3629 /*
3630 * If there are groups which we could not unlink from blkcg list,
3631 * wait for a rcu period for them to be freed.
3632 */
3633 if (cfqd->nr_blkcg_linked_grps)
3634 wait = true;
3635 3949
3636 spin_unlock_irq(q->queue_lock); 3950 spin_unlock_irq(q->queue_lock);
3637 3951
3638 cfq_shutdown_timer_wq(cfqd); 3952 cfq_shutdown_timer_wq(cfqd);
3639 3953
3640 /* 3954#ifndef CONFIG_CFQ_GROUP_IOSCHED
3641 * Wait for cfqg->blkg->key accessors to exit their grace periods. 3955 kfree(cfqd->root_group);
3642 * Do this wait only if there are other unlinked groups out
3643 * there. This can happen if cgroup deletion path claimed the
3644 * responsibility of cleaning up a group before queue cleanup code
3645 * gets to the group.
3646 *
3647 * Do not call synchronize_rcu() unconditionally as there are drivers
3648 * which create/delete request queue hundreds of times during scan/boot
3649 * and synchronize_rcu() can take significant time and slow down boot.
3650 */
3651 if (wait)
3652 synchronize_rcu();
3653
3654#ifdef CONFIG_CFQ_GROUP_IOSCHED
3655 /* Free up per cpu stats for root group */
3656 free_percpu(cfqd->root_group.blkg.stats_cpu);
3657#endif 3956#endif
3957 blkcg_deactivate_policy(q, &blkcg_policy_cfq);
3658 kfree(cfqd); 3958 kfree(cfqd);
3659} 3959}
3660 3960
3661static void *cfq_init_queue(struct request_queue *q) 3961static int cfq_init_queue(struct request_queue *q)
3662{ 3962{
3663 struct cfq_data *cfqd; 3963 struct cfq_data *cfqd;
3664 int i, j; 3964 struct blkcg_gq *blkg __maybe_unused;
3665 struct cfq_group *cfqg; 3965 int i, ret;
3666 struct cfq_rb_root *st;
3667 3966
3668 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); 3967 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
3669 if (!cfqd) 3968 if (!cfqd)
3670 return NULL; 3969 return -ENOMEM;
3970
3971 cfqd->queue = q;
3972 q->elevator->elevator_data = cfqd;
3671 3973
3672 /* Init root service tree */ 3974 /* Init root service tree */
3673 cfqd->grp_service_tree = CFQ_RB_ROOT; 3975 cfqd->grp_service_tree = CFQ_RB_ROOT;
3674 3976
3675 /* Init root group */ 3977 /* Init root group and prefer root group over other groups by default */
3676 cfqg = &cfqd->root_group;
3677 for_each_cfqg_st(cfqg, i, j, st)
3678 *st = CFQ_RB_ROOT;
3679 RB_CLEAR_NODE(&cfqg->rb_node);
3680
3681 /* Give preference to root group over other groups */
3682 cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT;
3683
3684#ifdef CONFIG_CFQ_GROUP_IOSCHED 3978#ifdef CONFIG_CFQ_GROUP_IOSCHED
3685 /* 3979 ret = blkcg_activate_policy(q, &blkcg_policy_cfq);
3686 * Set root group reference to 2. One reference will be dropped when 3980 if (ret)
3687 * all groups on cfqd->cfqg_list are being deleted during queue exit. 3981 goto out_free;
3688 * Other reference will remain there as we don't want to delete this
3689 * group as it is statically allocated and gets destroyed when
3690 * throtl_data goes away.
3691 */
3692 cfqg->ref = 2;
3693
3694 if (blkio_alloc_blkg_stats(&cfqg->blkg)) {
3695 kfree(cfqg);
3696 kfree(cfqd);
3697 return NULL;
3698 }
3699
3700 rcu_read_lock();
3701 3982
3702 cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, 3983 cfqd->root_group = blkg_to_cfqg(q->root_blkg);
3703 (void *)cfqd, 0); 3984#else
3704 rcu_read_unlock(); 3985 ret = -ENOMEM;
3705 cfqd->nr_blkcg_linked_grps++; 3986 cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group),
3987 GFP_KERNEL, cfqd->queue->node);
3988 if (!cfqd->root_group)
3989 goto out_free;
3706 3990
3707 /* Add group on cfqd->cfqg_list */ 3991 cfq_init_cfqg_base(cfqd->root_group);
3708 hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
3709#endif 3992#endif
3993 cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT;
3994
3710 /* 3995 /*
3711 * Not strictly needed (since RB_ROOT just clears the node and we 3996 * Not strictly needed (since RB_ROOT just clears the node and we
3712 * zeroed cfqd on alloc), but better be safe in case someone decides 3997 * zeroed cfqd on alloc), but better be safe in case someone decides
@@ -3718,13 +4003,17 @@ static void *cfq_init_queue(struct request_queue *q)
3718 /* 4003 /*
3719 * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues. 4004 * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues.
3720 * Grab a permanent reference to it, so that the normal code flow 4005 * Grab a permanent reference to it, so that the normal code flow
3721 * will not attempt to free it. 4006 * will not attempt to free it. oom_cfqq is linked to root_group
4007 * but shouldn't hold a reference as it'll never be unlinked. Lose
4008 * the reference from linking right away.
3722 */ 4009 */
3723 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); 4010 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
3724 cfqd->oom_cfqq.ref++; 4011 cfqd->oom_cfqq.ref++;
3725 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
3726 4012
3727 cfqd->queue = q; 4013 spin_lock_irq(q->queue_lock);
4014 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group);
4015 cfqg_put(cfqd->root_group);
4016 spin_unlock_irq(q->queue_lock);
3728 4017
3729 init_timer(&cfqd->idle_slice_timer); 4018 init_timer(&cfqd->idle_slice_timer);
3730 cfqd->idle_slice_timer.function = cfq_idle_slice_timer; 4019 cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
@@ -3750,7 +4039,11 @@ static void *cfq_init_queue(struct request_queue *q)
3750 * second, in order to have larger depth for async operations. 4039 * second, in order to have larger depth for async operations.
3751 */ 4040 */
3752 cfqd->last_delayed_sync = jiffies - HZ; 4041 cfqd->last_delayed_sync = jiffies - HZ;
3753 return cfqd; 4042 return 0;
4043
4044out_free:
4045 kfree(cfqd);
4046 return ret;
3754} 4047}
3755 4048
3756/* 4049/*
@@ -3877,15 +4170,13 @@ static struct elevator_type iosched_cfq = {
3877}; 4170};
3878 4171
3879#ifdef CONFIG_CFQ_GROUP_IOSCHED 4172#ifdef CONFIG_CFQ_GROUP_IOSCHED
3880static struct blkio_policy_type blkio_policy_cfq = { 4173static struct blkcg_policy blkcg_policy_cfq = {
3881 .ops = { 4174 .pd_size = sizeof(struct cfq_group),
3882 .blkio_unlink_group_fn = cfq_unlink_blkio_group, 4175 .cftypes = cfq_blkcg_files,
3883 .blkio_update_group_weight_fn = cfq_update_blkio_group_weight, 4176
3884 }, 4177 .pd_init_fn = cfq_pd_init,
3885 .plid = BLKIO_POLICY_PROP, 4178 .pd_reset_stats_fn = cfq_pd_reset_stats,
3886}; 4179};
3887#else
3888static struct blkio_policy_type blkio_policy_cfq;
3889#endif 4180#endif
3890 4181
3891static int __init cfq_init(void) 4182static int __init cfq_init(void)
@@ -3906,24 +4197,31 @@ static int __init cfq_init(void)
3906#else 4197#else
3907 cfq_group_idle = 0; 4198 cfq_group_idle = 0;
3908#endif 4199#endif
4200
4201 ret = blkcg_policy_register(&blkcg_policy_cfq);
4202 if (ret)
4203 return ret;
4204
3909 cfq_pool = KMEM_CACHE(cfq_queue, 0); 4205 cfq_pool = KMEM_CACHE(cfq_queue, 0);
3910 if (!cfq_pool) 4206 if (!cfq_pool)
3911 return -ENOMEM; 4207 goto err_pol_unreg;
3912 4208
3913 ret = elv_register(&iosched_cfq); 4209 ret = elv_register(&iosched_cfq);
3914 if (ret) { 4210 if (ret)
3915 kmem_cache_destroy(cfq_pool); 4211 goto err_free_pool;
3916 return ret;
3917 }
3918
3919 blkio_policy_register(&blkio_policy_cfq);
3920 4212
3921 return 0; 4213 return 0;
4214
4215err_free_pool:
4216 kmem_cache_destroy(cfq_pool);
4217err_pol_unreg:
4218 blkcg_policy_unregister(&blkcg_policy_cfq);
4219 return ret;
3922} 4220}
3923 4221
3924static void __exit cfq_exit(void) 4222static void __exit cfq_exit(void)
3925{ 4223{
3926 blkio_policy_unregister(&blkio_policy_cfq); 4224 blkcg_policy_unregister(&blkcg_policy_cfq);
3927 elv_unregister(&iosched_cfq); 4225 elv_unregister(&iosched_cfq);
3928 kmem_cache_destroy(cfq_pool); 4226 kmem_cache_destroy(cfq_pool);
3929} 4227}
diff --git a/block/cfq.h b/block/cfq.h
deleted file mode 100644
index 2a155927e37c..000000000000
--- a/block/cfq.h
+++ /dev/null
@@ -1,115 +0,0 @@
1#ifndef _CFQ_H
2#define _CFQ_H
3#include "blk-cgroup.h"
4
5#ifdef CONFIG_CFQ_GROUP_IOSCHED
6static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg,
7 struct blkio_group *curr_blkg, bool direction, bool sync)
8{
9 blkiocg_update_io_add_stats(blkg, curr_blkg, direction, sync);
10}
11
12static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg,
13 unsigned long dequeue)
14{
15 blkiocg_update_dequeue_stats(blkg, dequeue);
16}
17
18static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg,
19 unsigned long time, unsigned long unaccounted_time)
20{
21 blkiocg_update_timeslice_used(blkg, time, unaccounted_time);
22}
23
24static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg)
25{
26 blkiocg_set_start_empty_time(blkg);
27}
28
29static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg,
30 bool direction, bool sync)
31{
32 blkiocg_update_io_remove_stats(blkg, direction, sync);
33}
34
35static inline void cfq_blkiocg_update_io_merged_stats(struct blkio_group *blkg,
36 bool direction, bool sync)
37{
38 blkiocg_update_io_merged_stats(blkg, direction, sync);
39}
40
41static inline void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg)
42{
43 blkiocg_update_idle_time_stats(blkg);
44}
45
46static inline void
47cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
48{
49 blkiocg_update_avg_queue_size_stats(blkg);
50}
51
52static inline void
53cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
54{
55 blkiocg_update_set_idle_time_stats(blkg);
56}
57
58static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg,
59 uint64_t bytes, bool direction, bool sync)
60{
61 blkiocg_update_dispatch_stats(blkg, bytes, direction, sync);
62}
63
64static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
65{
66 blkiocg_update_completion_stats(blkg, start_time, io_start_time,
67 direction, sync);
68}
69
70static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
71 struct blkio_group *blkg, void *key, dev_t dev) {
72 blkiocg_add_blkio_group(blkcg, blkg, key, dev, BLKIO_POLICY_PROP);
73}
74
75static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg)
76{
77 return blkiocg_del_blkio_group(blkg);
78}
79
80#else /* CFQ_GROUP_IOSCHED */
81static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg,
82 struct blkio_group *curr_blkg, bool direction, bool sync) {}
83
84static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg,
85 unsigned long dequeue) {}
86
87static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg,
88 unsigned long time, unsigned long unaccounted_time) {}
89static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) {}
90static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg,
91 bool direction, bool sync) {}
92static inline void cfq_blkiocg_update_io_merged_stats(struct blkio_group *blkg,
93 bool direction, bool sync) {}
94static inline void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg)
95{
96}
97static inline void
98cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) {}
99
100static inline void
101cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) {}
102
103static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg,
104 uint64_t bytes, bool direction, bool sync) {}
105static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) {}
106
107static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
108 struct blkio_group *blkg, void *key, dev_t dev) {}
109static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg)
110{
111 return 0;
112}
113
114#endif /* CFQ_GROUP_IOSCHED */
115#endif
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index 7bf12d793fcd..599b12e5380f 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -337,13 +337,13 @@ static void deadline_exit_queue(struct elevator_queue *e)
337/* 337/*
338 * initialize elevator private data (deadline_data). 338 * initialize elevator private data (deadline_data).
339 */ 339 */
340static void *deadline_init_queue(struct request_queue *q) 340static int deadline_init_queue(struct request_queue *q)
341{ 341{
342 struct deadline_data *dd; 342 struct deadline_data *dd;
343 343
344 dd = kmalloc_node(sizeof(*dd), GFP_KERNEL | __GFP_ZERO, q->node); 344 dd = kmalloc_node(sizeof(*dd), GFP_KERNEL | __GFP_ZERO, q->node);
345 if (!dd) 345 if (!dd)
346 return NULL; 346 return -ENOMEM;
347 347
348 INIT_LIST_HEAD(&dd->fifo_list[READ]); 348 INIT_LIST_HEAD(&dd->fifo_list[READ]);
349 INIT_LIST_HEAD(&dd->fifo_list[WRITE]); 349 INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
@@ -354,7 +354,9 @@ static void *deadline_init_queue(struct request_queue *q)
354 dd->writes_starved = writes_starved; 354 dd->writes_starved = writes_starved;
355 dd->front_merges = 1; 355 dd->front_merges = 1;
356 dd->fifo_batch = fifo_batch; 356 dd->fifo_batch = fifo_batch;
357 return dd; 357
358 q->elevator->elevator_data = dd;
359 return 0;
358} 360}
359 361
360/* 362/*
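
With this series, an elevator_init_fn returns an errno and attaches its private data to the already-allocated q->elevator instead of returning a pointer (the noop conversion further down follows the same shape). A minimal sketch of the new contract, with hypothetical names:

/* Hypothetical scheduler data; mirrors the deadline/noop conversion above. */
struct mysched_data {
	struct list_head queue;
};

static int mysched_init_queue(struct request_queue *q)
{
	struct mysched_data *md;

	md = kmalloc_node(sizeof(*md), GFP_KERNEL, q->node);
	if (!md)
		return -ENOMEM;			/* errno, not NULL */

	INIT_LIST_HEAD(&md->queue);
	q->elevator->elevator_data = md;	/* attach to the pre-allocated elevator_queue */
	return 0;
}
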
diff --git a/block/elevator.c b/block/elevator.c
index f016855a46b0..6a55d418896f 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -38,6 +38,7 @@
38#include <trace/events/block.h> 38#include <trace/events/block.h>
39 39
40#include "blk.h" 40#include "blk.h"
41#include "blk-cgroup.h"
41 42
42static DEFINE_SPINLOCK(elv_list_lock); 43static DEFINE_SPINLOCK(elv_list_lock);
43static LIST_HEAD(elv_list); 44static LIST_HEAD(elv_list);
@@ -121,15 +122,6 @@ static struct elevator_type *elevator_get(const char *name)
121 return e; 122 return e;
122} 123}
123 124
124static int elevator_init_queue(struct request_queue *q,
125 struct elevator_queue *eq)
126{
127 eq->elevator_data = eq->type->ops.elevator_init_fn(q);
128 if (eq->elevator_data)
129 return 0;
130 return -ENOMEM;
131}
132
133static char chosen_elevator[ELV_NAME_MAX]; 125static char chosen_elevator[ELV_NAME_MAX];
134 126
135static int __init elevator_setup(char *str) 127static int __init elevator_setup(char *str)
@@ -188,7 +180,6 @@ static void elevator_release(struct kobject *kobj)
188int elevator_init(struct request_queue *q, char *name) 180int elevator_init(struct request_queue *q, char *name)
189{ 181{
190 struct elevator_type *e = NULL; 182 struct elevator_type *e = NULL;
191 struct elevator_queue *eq;
192 int err; 183 int err;
193 184
194 if (unlikely(q->elevator)) 185 if (unlikely(q->elevator))
@@ -222,17 +213,16 @@ int elevator_init(struct request_queue *q, char *name)
222 } 213 }
223 } 214 }
224 215
225 eq = elevator_alloc(q, e); 216 q->elevator = elevator_alloc(q, e);
226 if (!eq) 217 if (!q->elevator)
227 return -ENOMEM; 218 return -ENOMEM;
228 219
229 err = elevator_init_queue(q, eq); 220 err = e->ops.elevator_init_fn(q);
230 if (err) { 221 if (err) {
231 kobject_put(&eq->kobj); 222 kobject_put(&q->elevator->kobj);
232 return err; 223 return err;
233 } 224 }
234 225
235 q->elevator = eq;
236 return 0; 226 return 0;
237} 227}
238EXPORT_SYMBOL(elevator_init); 228EXPORT_SYMBOL(elevator_init);
@@ -564,25 +554,6 @@ void elv_drain_elevator(struct request_queue *q)
564 } 554 }
565} 555}
566 556
567void elv_quiesce_start(struct request_queue *q)
568{
569 if (!q->elevator)
570 return;
571
572 spin_lock_irq(q->queue_lock);
573 queue_flag_set(QUEUE_FLAG_ELVSWITCH, q);
574 spin_unlock_irq(q->queue_lock);
575
576 blk_drain_queue(q, false);
577}
578
579void elv_quiesce_end(struct request_queue *q)
580{
581 spin_lock_irq(q->queue_lock);
582 queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q);
583 spin_unlock_irq(q->queue_lock);
584}
585
586void __elv_add_request(struct request_queue *q, struct request *rq, int where) 557void __elv_add_request(struct request_queue *q, struct request *rq, int where)
587{ 558{
588 trace_block_rq_insert(q, rq); 559 trace_block_rq_insert(q, rq);
@@ -692,12 +663,13 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq)
692 return NULL; 663 return NULL;
693} 664}
694 665
695int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) 666int elv_set_request(struct request_queue *q, struct request *rq,
667 struct bio *bio, gfp_t gfp_mask)
696{ 668{
697 struct elevator_queue *e = q->elevator; 669 struct elevator_queue *e = q->elevator;
698 670
699 if (e->type->ops.elevator_set_req_fn) 671 if (e->type->ops.elevator_set_req_fn)
700 return e->type->ops.elevator_set_req_fn(q, rq, gfp_mask); 672 return e->type->ops.elevator_set_req_fn(q, rq, bio, gfp_mask);
701 return 0; 673 return 0;
702} 674}
703 675
@@ -801,8 +773,9 @@ static struct kobj_type elv_ktype = {
801 .release = elevator_release, 773 .release = elevator_release,
802}; 774};
803 775
804int __elv_register_queue(struct request_queue *q, struct elevator_queue *e) 776int elv_register_queue(struct request_queue *q)
805{ 777{
778 struct elevator_queue *e = q->elevator;
806 int error; 779 int error;
807 780
808 error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched"); 781 error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
@@ -820,11 +793,6 @@ int __elv_register_queue(struct request_queue *q, struct elevator_queue *e)
820 } 793 }
821 return error; 794 return error;
822} 795}
823
824int elv_register_queue(struct request_queue *q)
825{
826 return __elv_register_queue(q, q->elevator);
827}
828EXPORT_SYMBOL(elv_register_queue); 796EXPORT_SYMBOL(elv_register_queue);
829 797
830void elv_unregister_queue(struct request_queue *q) 798void elv_unregister_queue(struct request_queue *q)
@@ -907,53 +875,60 @@ EXPORT_SYMBOL_GPL(elv_unregister);
907 */ 875 */
908static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) 876static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
909{ 877{
910 struct elevator_queue *old_elevator, *e; 878 struct elevator_queue *old = q->elevator;
879 bool registered = old->registered;
911 int err; 880 int err;
912 881
913 /* allocate new elevator */ 882 /*
914 e = elevator_alloc(q, new_e); 883 * Turn on BYPASS and drain all requests w/ elevator private data.
915 if (!e) 884 * Block layer doesn't call into a quiesced elevator - all requests
916 return -ENOMEM; 885 * are directly put on the dispatch list without elevator data
886 * using INSERT_BACK. All requests have SOFTBARRIER set and no
887 * merge happens either.
888 */
889 blk_queue_bypass_start(q);
890
891 /* unregister and clear all auxiliary data of the old elevator */
892 if (registered)
893 elv_unregister_queue(q);
894
895 spin_lock_irq(q->queue_lock);
896 ioc_clear_queue(q);
897 spin_unlock_irq(q->queue_lock);
917 898
918 err = elevator_init_queue(q, e); 899 /* allocate, init and register new elevator */
900 err = -ENOMEM;
901 q->elevator = elevator_alloc(q, new_e);
902 if (!q->elevator)
903 goto fail_init;
904
905 err = new_e->ops.elevator_init_fn(q);
919 if (err) { 906 if (err) {
920 kobject_put(&e->kobj); 907 kobject_put(&q->elevator->kobj);
921 return err; 908 goto fail_init;
922 } 909 }
923 910
924 /* turn on BYPASS and drain all requests w/ elevator private data */ 911 if (registered) {
925 elv_quiesce_start(q); 912 err = elv_register_queue(q);
926
927 /* unregister old queue, register new one and kill old elevator */
928 if (q->elevator->registered) {
929 elv_unregister_queue(q);
930 err = __elv_register_queue(q, e);
931 if (err) 913 if (err)
932 goto fail_register; 914 goto fail_register;
933 } 915 }
934 916
935 /* done, clear io_cq's, switch elevators and turn off BYPASS */ 917 /* done, kill the old one and finish */
936 spin_lock_irq(q->queue_lock); 918 elevator_exit(old);
937 ioc_clear_queue(q); 919 blk_queue_bypass_end(q);
938 old_elevator = q->elevator;
939 q->elevator = e;
940 spin_unlock_irq(q->queue_lock);
941
942 elevator_exit(old_elevator);
943 elv_quiesce_end(q);
944 920
945 blk_add_trace_msg(q, "elv switch: %s", e->type->elevator_name); 921 blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
946 922
947 return 0; 923 return 0;
948 924
949fail_register: 925fail_register:
950 /* 926 elevator_exit(q->elevator);
951 * switch failed, exit the new io scheduler and reattach the old 927fail_init:
952 * one again (along with re-adding the sysfs dir) 928 /* switch failed, restore and re-register old elevator */
953 */ 929 q->elevator = old;
954 elevator_exit(e);
955 elv_register_queue(q); 930 elv_register_queue(q);
956 elv_quiesce_end(q); 931 blk_queue_bypass_end(q);
957 932
958 return err; 933 return err;
959} 934}
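
The bypass bracket around the switch above is a reusable quiesce/mutate/unquiesce shape: while bypass is in effect no request carries elevator private data, so the elevator (and its auxiliary state) can be swapped safely. A minimal sketch using only blk_queue_bypass_start()/blk_queue_bypass_end() as introduced by this series; the wrapper itself is hypothetical:

/*
 * Sketch of the bypass bracket used by elevator_switch() above:
 * drain and bypass the queue, perform the mutation, then restore
 * normal elevator dispatch.
 */
static int mutate_queue_under_bypass(struct request_queue *q,
				     int (*mutate)(struct request_queue *q))
{
	int ret;

	blk_queue_bypass_start(q);	/* queue now acts as a dumb FIFO */
	ret = mutate(q);
	blk_queue_bypass_end(q);	/* resume normal dispatch */
	return ret;
}
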
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
index 413a0b1d788c..5d1bf70e33d5 100644
--- a/block/noop-iosched.c
+++ b/block/noop-iosched.c
@@ -59,15 +59,17 @@ noop_latter_request(struct request_queue *q, struct request *rq)
59 return list_entry(rq->queuelist.next, struct request, queuelist); 59 return list_entry(rq->queuelist.next, struct request, queuelist);
60} 60}
61 61
62static void *noop_init_queue(struct request_queue *q) 62static int noop_init_queue(struct request_queue *q)
63{ 63{
64 struct noop_data *nd; 64 struct noop_data *nd;
65 65
66 nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node); 66 nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node);
67 if (!nd) 67 if (!nd)
68 return NULL; 68 return -ENOMEM;
69
69 INIT_LIST_HEAD(&nd->queue); 70 INIT_LIST_HEAD(&nd->queue);
70 return nd; 71 q->elevator->elevator_data = nd;
72 return 0;
71} 73}
72 74
73static void noop_exit_queue(struct elevator_queue *e) 75static void noop_exit_queue(struct elevator_queue *e)
diff --git a/fs/bio.c b/fs/bio.c
index 84da88539046..73922abba832 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -19,12 +19,14 @@
19#include <linux/swap.h> 19#include <linux/swap.h>
20#include <linux/bio.h> 20#include <linux/bio.h>
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/iocontext.h>
22#include <linux/slab.h> 23#include <linux/slab.h>
23#include <linux/init.h> 24#include <linux/init.h>
24#include <linux/kernel.h> 25#include <linux/kernel.h>
25#include <linux/export.h> 26#include <linux/export.h>
26#include <linux/mempool.h> 27#include <linux/mempool.h>
27#include <linux/workqueue.h> 28#include <linux/workqueue.h>
29#include <linux/cgroup.h>
28#include <scsi/sg.h> /* for struct sg_iovec */ 30#include <scsi/sg.h> /* for struct sg_iovec */
29 31
30#include <trace/events/block.h> 32#include <trace/events/block.h>
@@ -418,6 +420,7 @@ void bio_put(struct bio *bio)
418 * last put frees it 420 * last put frees it
419 */ 421 */
420 if (atomic_dec_and_test(&bio->bi_cnt)) { 422 if (atomic_dec_and_test(&bio->bi_cnt)) {
423 bio_disassociate_task(bio);
421 bio->bi_next = NULL; 424 bio->bi_next = NULL;
422 bio->bi_destructor(bio); 425 bio->bi_destructor(bio);
423 } 426 }
@@ -1646,6 +1649,64 @@ bad:
1646} 1649}
1647EXPORT_SYMBOL(bioset_create); 1650EXPORT_SYMBOL(bioset_create);
1648 1651
1652#ifdef CONFIG_BLK_CGROUP
1653/**
1654 * bio_associate_current - associate a bio with %current
1655 * @bio: target bio
1656 *
1657 * Associate @bio with %current if it hasn't been associated yet. Block
1658 * layer will treat @bio as if it were issued by %current no matter which
1659 * task actually issues it.
1660 *
1661 * This function takes an extra reference of @task's io_context and blkcg
1662 * which will be put when @bio is released. The caller must own @bio,
1663 * ensure %current->io_context exists, and is responsible for synchronizing
1664 * calls to this function.
1665 */
1666int bio_associate_current(struct bio *bio)
1667{
1668 struct io_context *ioc;
1669 struct cgroup_subsys_state *css;
1670
1671 if (bio->bi_ioc)
1672 return -EBUSY;
1673
1674 ioc = current->io_context;
1675 if (!ioc)
1676 return -ENOENT;
1677
1678 /* acquire active ref on @ioc and associate */
1679 get_io_context_active(ioc);
1680 bio->bi_ioc = ioc;
1681
1682 /* associate blkcg if exists */
1683 rcu_read_lock();
1684 css = task_subsys_state(current, blkio_subsys_id);
1685 if (css && css_tryget(css))
1686 bio->bi_css = css;
1687 rcu_read_unlock();
1688
1689 return 0;
1690}
1691
1692/**
1693 * bio_disassociate_task - undo bio_associate_current()
1694 * @bio: target bio
1695 */
1696void bio_disassociate_task(struct bio *bio)
1697{
1698 if (bio->bi_ioc) {
1699 put_io_context(bio->bi_ioc);
1700 bio->bi_ioc = NULL;
1701 }
1702 if (bio->bi_css) {
1703 css_put(bio->bi_css);
1704 bio->bi_css = NULL;
1705 }
1706}
1707
1708#endif /* CONFIG_BLK_CGROUP */
1709
1649static void __init biovec_init_slabs(void) 1710static void __init biovec_init_slabs(void)
1650{ 1711{
1651 int i; 1712 int i;
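
The kernel-doc above spells out the caller contract for bio_associate_current(): own the bio, make sure current->io_context exists, and serialize calls. A minimal sketch of a hypothetical driver that defers a bio to a worker while keeping it attributed to the submitting task; struct my_dev and its fields are assumptions, and get_task_io_context() (declared in linux/iocontext.h) is used only to guarantee the io_context exists:

/*
 * Hypothetical submission helper: record the submitting task on the bio
 * before handing it to a worker, so the block layer charges the I/O to
 * the original issuer rather than the worker thread.
 */
static void myblk_queue_bio_deferred(struct my_dev *dev, struct bio *bio)
{
	struct io_context *ioc;

	/* ensure current->io_context exists, as the contract requires */
	ioc = get_task_io_context(current, GFP_NOIO, NUMA_NO_NODE);
	if (ioc) {
		bio_associate_current(bio);	/* grabs refs on ioc and blkcg css */
		put_io_context(ioc);		/* drop the lookup reference */
	}

	bio_list_add(&dev->deferred_bios, bio);
	queue_work(dev->wq, &dev->work);
	/* bio_disassociate_task() runs from bio_put() once the bio completes */
}
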
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 5e6dbe8958fc..e50170ca7c33 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -50,7 +50,7 @@ int set_task_ioprio(struct task_struct *task, int ioprio)
50 50
51 ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); 51 ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
52 if (ioc) { 52 if (ioc) {
53 ioc_ioprio_changed(ioc, ioprio); 53 ioc->ioprio = ioprio;
54 put_io_context(ioc); 54 put_io_context(ioc);
55 } 55 }
56 56
diff --git a/fs/splice.c b/fs/splice.c
index f8476841eb04..406ef2b792c2 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1388,7 +1388,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
1388 */ 1388 */
1389static int get_iovec_page_array(const struct iovec __user *iov, 1389static int get_iovec_page_array(const struct iovec __user *iov,
1390 unsigned int nr_vecs, struct page **pages, 1390 unsigned int nr_vecs, struct page **pages,
1391 struct partial_page *partial, int aligned, 1391 struct partial_page *partial, bool aligned,
1392 unsigned int pipe_buffers) 1392 unsigned int pipe_buffers)
1393{ 1393{
1394 int buffers = 0, error = 0; 1394 int buffers = 0, error = 0;
@@ -1626,7 +1626,7 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
1626 return -ENOMEM; 1626 return -ENOMEM;
1627 1627
1628 spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages, 1628 spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages,
1629 spd.partial, flags & SPLICE_F_GIFT, 1629 spd.partial, false,
1630 pipe->buffers); 1630 pipe->buffers);
1631 if (spd.nr_pages <= 0) 1631 if (spd.nr_pages <= 0)
1632 ret = spd.nr_pages; 1632 ret = spd.nr_pages;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 4d94eb8bcbcc..26435890dc87 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -269,6 +269,14 @@ extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set
269extern void bvec_free_bs(struct bio_set *, struct bio_vec *, unsigned int); 269extern void bvec_free_bs(struct bio_set *, struct bio_vec *, unsigned int);
270extern unsigned int bvec_nr_vecs(unsigned short idx); 270extern unsigned int bvec_nr_vecs(unsigned short idx);
271 271
272#ifdef CONFIG_BLK_CGROUP
273int bio_associate_current(struct bio *bio);
274void bio_disassociate_task(struct bio *bio);
275#else /* CONFIG_BLK_CGROUP */
276static inline int bio_associate_current(struct bio *bio) { return -ENOENT; }
277static inline void bio_disassociate_task(struct bio *bio) { }
278#endif /* CONFIG_BLK_CGROUP */
279
272/* 280/*
273 * bio_set is used to allow other portions of the IO system to 281 * bio_set is used to allow other portions of the IO system to
274 * allocate their own private memory pools for bio and iovec structures. 282 * allocate their own private memory pools for bio and iovec structures.
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 4053cbd4490e..0edb65dd8edd 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -14,6 +14,8 @@ struct bio;
14struct bio_integrity_payload; 14struct bio_integrity_payload;
15struct page; 15struct page;
16struct block_device; 16struct block_device;
17struct io_context;
18struct cgroup_subsys_state;
17typedef void (bio_end_io_t) (struct bio *, int); 19typedef void (bio_end_io_t) (struct bio *, int);
18typedef void (bio_destructor_t) (struct bio *); 20typedef void (bio_destructor_t) (struct bio *);
19 21
@@ -66,6 +68,14 @@ struct bio {
66 bio_end_io_t *bi_end_io; 68 bio_end_io_t *bi_end_io;
67 69
68 void *bi_private; 70 void *bi_private;
71#ifdef CONFIG_BLK_CGROUP
72 /*
73 * Optional ioc and css associated with this bio. Put on bio
74 * release. Read comment on top of bio_associate_current().
75 */
76 struct io_context *bi_ioc;
77 struct cgroup_subsys_state *bi_css;
78#endif
69#if defined(CONFIG_BLK_DEV_INTEGRITY) 79#if defined(CONFIG_BLK_DEV_INTEGRITY)
70 struct bio_integrity_payload *bi_integrity; /* data integrity */ 80 struct bio_integrity_payload *bi_integrity; /* data integrity */
71#endif 81#endif
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4d4ac24a263e..ba43f408baa3 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -32,10 +32,17 @@ struct blk_trace;
32struct request; 32struct request;
33struct sg_io_hdr; 33struct sg_io_hdr;
34struct bsg_job; 34struct bsg_job;
35struct blkcg_gq;
35 36
36#define BLKDEV_MIN_RQ 4 37#define BLKDEV_MIN_RQ 4
37#define BLKDEV_MAX_RQ 128 /* Default maximum */ 38#define BLKDEV_MAX_RQ 128 /* Default maximum */
38 39
40/*
41 * Maximum number of blkcg policies allowed to be registered concurrently.
42 * Defined here to simplify include dependency.
43 */
44#define BLKCG_MAX_POLS 2
45
39struct request; 46struct request;
40typedef void (rq_end_io_fn)(struct request *, int); 47typedef void (rq_end_io_fn)(struct request *, int);
41 48
@@ -363,6 +370,11 @@ struct request_queue {
363 struct list_head timeout_list; 370 struct list_head timeout_list;
364 371
365 struct list_head icq_list; 372 struct list_head icq_list;
373#ifdef CONFIG_BLK_CGROUP
374 DECLARE_BITMAP (blkcg_pols, BLKCG_MAX_POLS);
375 struct blkcg_gq *root_blkg;
376 struct list_head blkg_list;
377#endif
366 378
367 struct queue_limits limits; 379 struct queue_limits limits;
368 380
@@ -390,12 +402,17 @@ struct request_queue {
390 402
391 struct mutex sysfs_lock; 403 struct mutex sysfs_lock;
392 404
405 int bypass_depth;
406
393#if defined(CONFIG_BLK_DEV_BSG) 407#if defined(CONFIG_BLK_DEV_BSG)
394 bsg_job_fn *bsg_job_fn; 408 bsg_job_fn *bsg_job_fn;
395 int bsg_job_size; 409 int bsg_job_size;
396 struct bsg_class_device bsg_dev; 410 struct bsg_class_device bsg_dev;
397#endif 411#endif
398 412
413#ifdef CONFIG_BLK_CGROUP
414 struct list_head all_q_node;
415#endif
399#ifdef CONFIG_BLK_DEV_THROTTLING 416#ifdef CONFIG_BLK_DEV_THROTTLING
400 /* Throttle data */ 417 /* Throttle data */
401 struct throtl_data *td; 418 struct throtl_data *td;
@@ -407,7 +424,7 @@ struct request_queue {
407#define QUEUE_FLAG_SYNCFULL 3 /* read queue has been filled */ 424#define QUEUE_FLAG_SYNCFULL 3 /* read queue has been filled */
408#define QUEUE_FLAG_ASYNCFULL 4 /* write queue has been filled */ 425#define QUEUE_FLAG_ASYNCFULL 4 /* write queue has been filled */
409#define QUEUE_FLAG_DEAD 5 /* queue being torn down */ 426#define QUEUE_FLAG_DEAD 5 /* queue being torn down */
410#define QUEUE_FLAG_ELVSWITCH 6 /* don't use elevator, just do FIFO */ 427#define QUEUE_FLAG_BYPASS 6 /* act as dumb FIFO queue */
411#define QUEUE_FLAG_BIDI 7 /* queue supports bidi requests */ 428#define QUEUE_FLAG_BIDI 7 /* queue supports bidi requests */
412#define QUEUE_FLAG_NOMERGES 8 /* disable merge attempts */ 429#define QUEUE_FLAG_NOMERGES 8 /* disable merge attempts */
413#define QUEUE_FLAG_SAME_COMP 9 /* complete on same CPU-group */ 430#define QUEUE_FLAG_SAME_COMP 9 /* complete on same CPU-group */
@@ -491,6 +508,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
491#define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags) 508#define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
492#define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) 509#define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
493#define blk_queue_dead(q) test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags) 510#define blk_queue_dead(q) test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags)
511#define blk_queue_bypass(q) test_bit(QUEUE_FLAG_BYPASS, &(q)->queue_flags)
494#define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) 512#define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
495#define blk_queue_noxmerges(q) \ 513#define blk_queue_noxmerges(q) \
496 test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) 514 test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags)
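
Besides the flag macro, the new per-queue state gives each blkcg policy an enable bit in q->blkcg_pols. A minimal sketch of a combined check built on those fields; struct blkcg_policy with a ->plid index is an assumption for illustration and not part of this hunk, and the sketch presumes CONFIG_BLK_CGROUP=y:

/*
 * Illustrative check: a policy is "live" on a queue only if its bit is
 * set in q->blkcg_pols and the queue is not in bypass mode.
 */
static bool my_policy_live(struct request_queue *q,
			   const struct blkcg_policy *pol)
{
	if (blk_queue_bypass(q))		/* queue acting as a dumb FIFO */
		return false;
	return test_bit(pol->plid, q->blkcg_pols);
}
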
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 7d4e0356f329..c03af7687bb4 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -28,12 +28,13 @@ typedef int (elevator_may_queue_fn) (struct request_queue *, int);
28 28
29typedef void (elevator_init_icq_fn) (struct io_cq *); 29typedef void (elevator_init_icq_fn) (struct io_cq *);
30typedef void (elevator_exit_icq_fn) (struct io_cq *); 30typedef void (elevator_exit_icq_fn) (struct io_cq *);
31typedef int (elevator_set_req_fn) (struct request_queue *, struct request *, gfp_t); 31typedef int (elevator_set_req_fn) (struct request_queue *, struct request *,
32 struct bio *, gfp_t);
32typedef void (elevator_put_req_fn) (struct request *); 33typedef void (elevator_put_req_fn) (struct request *);
33typedef void (elevator_activate_req_fn) (struct request_queue *, struct request *); 34typedef void (elevator_activate_req_fn) (struct request_queue *, struct request *);
34typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct request *); 35typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct request *);
35 36
36typedef void *(elevator_init_fn) (struct request_queue *); 37typedef int (elevator_init_fn) (struct request_queue *);
37typedef void (elevator_exit_fn) (struct elevator_queue *); 38typedef void (elevator_exit_fn) (struct elevator_queue *);
38 39
39struct elevator_ops 40struct elevator_ops
@@ -129,7 +130,8 @@ extern void elv_unregister_queue(struct request_queue *q);
129extern int elv_may_queue(struct request_queue *, int); 130extern int elv_may_queue(struct request_queue *, int);
130extern void elv_abort_queue(struct request_queue *); 131extern void elv_abort_queue(struct request_queue *);
131extern void elv_completed_request(struct request_queue *, struct request *); 132extern void elv_completed_request(struct request_queue *, struct request *);
132extern int elv_set_request(struct request_queue *, struct request *, gfp_t); 133extern int elv_set_request(struct request_queue *q, struct request *rq,
134 struct bio *bio, gfp_t gfp_mask);
133extern void elv_put_request(struct request_queue *, struct request *); 135extern void elv_put_request(struct request_queue *, struct request *);
134extern void elv_drain_elevator(struct request_queue *); 136extern void elv_drain_elevator(struct request_queue *);
135 137
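
elevator_set_req_fn now receives the bio that triggered the request allocation, so a scheduler can honor the bio's recorded cgroup association instead of assuming the request was issued by %current. A minimal, non-authoritative sketch of a conforming hook; the body only marks where bio->bi_css (added in blk_types.h above) would be consulted:

/* Illustrative elevator_set_req_fn matching the widened prototype. */
static int mysched_set_request(struct request_queue *q, struct request *rq,
			       struct bio *bio, gfp_t gfp_mask)
{
	struct cgroup_subsys_state *css = NULL;

#ifdef CONFIG_BLK_CGROUP
	/* prefer the css recorded by bio_associate_current(), if any */
	if (bio)
		css = bio->bi_css;
#endif
	/* ... allocate per-request scheduler state keyed on css here ... */
	(void)css;
	(void)gfp_mask;
	return 0;
}
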
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index 1a3018063034..df38db2ef45b 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -6,11 +6,7 @@
6#include <linux/workqueue.h> 6#include <linux/workqueue.h>
7 7
8enum { 8enum {
9 ICQ_IOPRIO_CHANGED = 1 << 0,
10 ICQ_CGROUP_CHANGED = 1 << 1,
11 ICQ_EXITED = 1 << 2, 9 ICQ_EXITED = 1 << 2,
12
13 ICQ_CHANGED_MASK = ICQ_IOPRIO_CHANGED | ICQ_CGROUP_CHANGED,
14}; 10};
15 11
16/* 12/*
@@ -100,6 +96,7 @@ struct io_cq {
100 */ 96 */
101struct io_context { 97struct io_context {
102 atomic_long_t refcount; 98 atomic_long_t refcount;
99 atomic_t active_ref;
103 atomic_t nr_tasks; 100 atomic_t nr_tasks;
104 101
105 /* all the fields below are protected by this lock */ 102 /* all the fields below are protected by this lock */
@@ -120,29 +117,37 @@ struct io_context {
120 struct work_struct release_work; 117 struct work_struct release_work;
121}; 118};
122 119
123static inline struct io_context *ioc_task_link(struct io_context *ioc) 120/**
121 * get_io_context_active - get active reference on ioc
122 * @ioc: ioc of interest
123 *
124 * Only iocs with active reference can issue new IOs. This function
125 * acquires an active reference on @ioc. The caller must already have an
126 * active reference on @ioc.
127 */
128static inline void get_io_context_active(struct io_context *ioc)
124{ 129{
125 /* 130 WARN_ON_ONCE(atomic_long_read(&ioc->refcount) <= 0);
126 * if ref count is zero, don't allow sharing (ioc is going away, it's 131 WARN_ON_ONCE(atomic_read(&ioc->active_ref) <= 0);
127 * a race). 132 atomic_long_inc(&ioc->refcount);
128 */ 133 atomic_inc(&ioc->active_ref);
129 if (ioc && atomic_long_inc_not_zero(&ioc->refcount)) { 134}
130 atomic_inc(&ioc->nr_tasks); 135
131 return ioc; 136static inline void ioc_task_link(struct io_context *ioc)
132 } 137{
138 get_io_context_active(ioc);
133 139
134 return NULL; 140 WARN_ON_ONCE(atomic_read(&ioc->nr_tasks) <= 0);
141 atomic_inc(&ioc->nr_tasks);
135} 142}
136 143
137struct task_struct; 144struct task_struct;
138#ifdef CONFIG_BLOCK 145#ifdef CONFIG_BLOCK
139void put_io_context(struct io_context *ioc); 146void put_io_context(struct io_context *ioc);
147void put_io_context_active(struct io_context *ioc);
140void exit_io_context(struct task_struct *task); 148void exit_io_context(struct task_struct *task);
141struct io_context *get_task_io_context(struct task_struct *task, 149struct io_context *get_task_io_context(struct task_struct *task,
142 gfp_t gfp_flags, int node); 150 gfp_t gfp_flags, int node);
143void ioc_ioprio_changed(struct io_context *ioc, int ioprio);
144void ioc_cgroup_changed(struct io_context *ioc);
145unsigned int icq_get_changed(struct io_cq *icq);
146#else 151#else
147struct io_context; 152struct io_context;
148static inline void put_io_context(struct io_context *ioc) { } 153static inline void put_io_context(struct io_context *ioc) { }
diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
index 76dad4808847..beb9ce1c2c23 100644
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -42,26 +42,14 @@ enum {
42}; 42};
43 43
44/* 44/*
45 * if process has set io priority explicitly, use that. if not, convert 45 * Fallback BE priority
46 * the cpu scheduler nice value to an io priority
47 */ 46 */
48#define IOPRIO_NORM (4) 47#define IOPRIO_NORM (4)
49static inline int task_ioprio(struct io_context *ioc)
50{
51 if (ioprio_valid(ioc->ioprio))
52 return IOPRIO_PRIO_DATA(ioc->ioprio);
53
54 return IOPRIO_NORM;
55}
56
57static inline int task_ioprio_class(struct io_context *ioc)
58{
59 if (ioprio_valid(ioc->ioprio))
60 return IOPRIO_PRIO_CLASS(ioc->ioprio);
61
62 return IOPRIO_CLASS_BE;
63}
64 48
49/*
50 * if process has set io priority explicitly, use that. if not, convert
51 * the cpu scheduler nice value to an io priority
52 */
65static inline int task_nice_ioprio(struct task_struct *task) 53static inline int task_nice_ioprio(struct task_struct *task)
66{ 54{
67 return (task_nice(task) + 20) / 5; 55 return (task_nice(task) + 20) / 5;
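
For reference, the remaining helper maps the nice range onto the eight best-effort levels by integer division:

/*
 * task_nice_ioprio() = (nice + 20) / 5, with integer division:
 *
 *   nice -20 -> 0/5  = 0   (highest best-effort level)
 *   nice   0 -> 20/5 = 4   (IOPRIO_NORM)
 *   nice  19 -> 39/5 = 7   (lowest best-effort level)
 */
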
diff --git a/init/Kconfig b/init/Kconfig
index 81816b82860b..1e004d057468 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -803,7 +803,7 @@ config RT_GROUP_SCHED
803endif #CGROUP_SCHED 803endif #CGROUP_SCHED
804 804
805config BLK_CGROUP 805config BLK_CGROUP
806 tristate "Block IO controller" 806 bool "Block IO controller"
807 depends on BLOCK 807 depends on BLOCK
808 default n 808 default n
809 ---help--- 809 ---help---
diff --git a/kernel/fork.c b/kernel/fork.c
index 017fb23d5983..31a32c7dd169 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -976,9 +976,8 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
976 * Share io context with parent, if CLONE_IO is set 976 * Share io context with parent, if CLONE_IO is set
977 */ 977 */
978 if (clone_flags & CLONE_IO) { 978 if (clone_flags & CLONE_IO) {
979 tsk->io_context = ioc_task_link(ioc); 979 ioc_task_link(ioc);
980 if (unlikely(!tsk->io_context)) 980 tsk->io_context = ioc;
981 return -ENOMEM;
982 } else if (ioprio_valid(ioc->ioprio)) { 981 } else if (ioprio_valid(ioc->ioprio)) {
983 new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); 982 new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
984 if (unlikely(!new_ioc)) 983 if (unlikely(!new_ioc))