aboutsummaryrefslogtreecommitdiffstats
path: root/block
diff options
context:
space:
mode:
Diffstat (limited to 'block')
-rw-r--r--block/Kconfig.iosched4
-rw-r--r--block/blk-cgroup.c2109
-rw-r--r--block/blk-cgroup.h647
-rw-r--r--block/blk-core.c281
-rw-r--r--block/blk-ioc.c126
-rw-r--r--block/blk-sysfs.c6
-rw-r--r--block/blk-throttle.c695
-rw-r--r--block/blk.h32
-rw-r--r--block/cfq-iosched.c1072
-rw-r--r--block/cfq.h115
-rw-r--r--block/deadline-iosched.c8
-rw-r--r--block/elevator.c121
-rw-r--r--block/noop-iosched.c8
13 files changed, 2307 insertions, 2917 deletions
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 3199b76f795d..421bef9c4c48 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -23,8 +23,6 @@ config IOSCHED_DEADLINE
23 23
24config IOSCHED_CFQ 24config IOSCHED_CFQ
25 tristate "CFQ I/O scheduler" 25 tristate "CFQ I/O scheduler"
26 # If BLK_CGROUP is a module, CFQ has to be built as module.
27 depends on (BLK_CGROUP=m && m) || !BLK_CGROUP || BLK_CGROUP=y
28 default y 26 default y
29 ---help--- 27 ---help---
30 The CFQ I/O scheduler tries to distribute bandwidth equally 28 The CFQ I/O scheduler tries to distribute bandwidth equally
@@ -34,8 +32,6 @@ config IOSCHED_CFQ
34 32
35 This is the default I/O scheduler. 33 This is the default I/O scheduler.
36 34
37 Note: If BLK_CGROUP=m, then CFQ can be built only as module.
38
39config CFQ_GROUP_IOSCHED 35config CFQ_GROUP_IOSCHED
40 bool "CFQ Group Scheduling support" 36 bool "CFQ Group Scheduling support"
41 depends on IOSCHED_CFQ && BLK_CGROUP 37 depends on IOSCHED_CFQ && BLK_CGROUP
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index ea84a23d5e68..02cf6335e9bd 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -11,1679 +11,906 @@
11 * Nauman Rafique <nauman@google.com> 11 * Nauman Rafique <nauman@google.com>
12 */ 12 */
13#include <linux/ioprio.h> 13#include <linux/ioprio.h>
14#include <linux/seq_file.h>
15#include <linux/kdev_t.h> 14#include <linux/kdev_t.h>
16#include <linux/module.h> 15#include <linux/module.h>
17#include <linux/err.h> 16#include <linux/err.h>
18#include <linux/blkdev.h> 17#include <linux/blkdev.h>
19#include <linux/slab.h> 18#include <linux/slab.h>
20#include "blk-cgroup.h"
21#include <linux/genhd.h> 19#include <linux/genhd.h>
20#include <linux/delay.h>
21#include <linux/atomic.h>
22#include "blk-cgroup.h"
23#include "blk.h"
22 24
23#define MAX_KEY_LEN 100 25#define MAX_KEY_LEN 100
24 26
25static DEFINE_SPINLOCK(blkio_list_lock); 27static DEFINE_MUTEX(blkcg_pol_mutex);
26static LIST_HEAD(blkio_list);
27 28
28struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT }; 29struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT };
29EXPORT_SYMBOL_GPL(blkio_root_cgroup); 30EXPORT_SYMBOL_GPL(blkcg_root);
30 31
31static struct cgroup_subsys_state *blkiocg_create(struct cgroup *); 32static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
32static int blkiocg_can_attach(struct cgroup *, struct cgroup_taskset *);
33static void blkiocg_attach(struct cgroup *, struct cgroup_taskset *);
34static void blkiocg_destroy(struct cgroup *);
35static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
36 33
37/* for encoding cft->private value on file */ 34struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup)
38#define BLKIOFILE_PRIVATE(x, val) (((x) << 16) | (val))
39/* What policy owns the file, proportional or throttle */
40#define BLKIOFILE_POLICY(val) (((val) >> 16) & 0xffff)
41#define BLKIOFILE_ATTR(val) ((val) & 0xffff)
42
43struct cgroup_subsys blkio_subsys = {
44 .name = "blkio",
45 .create = blkiocg_create,
46 .can_attach = blkiocg_can_attach,
47 .attach = blkiocg_attach,
48 .destroy = blkiocg_destroy,
49 .populate = blkiocg_populate,
50#ifdef CONFIG_BLK_CGROUP
51 /* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */
52 .subsys_id = blkio_subsys_id,
53#endif
54 .use_id = 1,
55 .module = THIS_MODULE,
56};
57EXPORT_SYMBOL_GPL(blkio_subsys);
58
59static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
60 struct blkio_policy_node *pn)
61{ 35{
62 list_add(&pn->node, &blkcg->policy_list); 36 return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
37 struct blkcg, css);
63} 38}
39EXPORT_SYMBOL_GPL(cgroup_to_blkcg);
64 40
65static inline bool cftype_blkg_same_policy(struct cftype *cft, 41static struct blkcg *task_blkcg(struct task_struct *tsk)
66 struct blkio_group *blkg)
67{ 42{
68 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); 43 return container_of(task_subsys_state(tsk, blkio_subsys_id),
69 44 struct blkcg, css);
70 if (blkg->plid == plid)
71 return 1;
72
73 return 0;
74} 45}
75 46
76/* Determines if policy node matches cgroup file being accessed */ 47struct blkcg *bio_blkcg(struct bio *bio)
77static inline bool pn_matches_cftype(struct cftype *cft,
78 struct blkio_policy_node *pn)
79{ 48{
80 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); 49 if (bio && bio->bi_css)
81 int fileid = BLKIOFILE_ATTR(cft->private); 50 return container_of(bio->bi_css, struct blkcg, css);
82 51 return task_blkcg(current);
83 return (plid == pn->plid && fileid == pn->fileid);
84} 52}
53EXPORT_SYMBOL_GPL(bio_blkcg);
85 54
86/* Must be called with blkcg->lock held */ 55static bool blkcg_policy_enabled(struct request_queue *q,
87static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) 56 const struct blkcg_policy *pol)
88{ 57{
89 list_del(&pn->node); 58 return pol && test_bit(pol->plid, q->blkcg_pols);
90} 59}
91 60
92/* Must be called with blkcg->lock held */ 61/**
93static struct blkio_policy_node * 62 * blkg_free - free a blkg
94blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev, 63 * @blkg: blkg to free
95 enum blkio_policy_id plid, int fileid) 64 *
65 * Free @blkg which may be partially allocated.
66 */
67static void blkg_free(struct blkcg_gq *blkg)
96{ 68{
97 struct blkio_policy_node *pn; 69 int i;
98
99 list_for_each_entry(pn, &blkcg->policy_list, node) {
100 if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid)
101 return pn;
102 }
103 70
104 return NULL; 71 if (!blkg)
105} 72 return;
106 73
107struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) 74 for (i = 0; i < BLKCG_MAX_POLS; i++) {
108{ 75 struct blkcg_policy *pol = blkcg_policy[i];
109 return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), 76 struct blkg_policy_data *pd = blkg->pd[i];
110 struct blkio_cgroup, css);
111}
112EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
113 77
114struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk) 78 if (!pd)
115{ 79 continue;
116 return container_of(task_subsys_state(tsk, blkio_subsys_id),
117 struct blkio_cgroup, css);
118}
119EXPORT_SYMBOL_GPL(task_blkio_cgroup);
120 80
121static inline void 81 if (pol && pol->pd_exit_fn)
122blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight) 82 pol->pd_exit_fn(blkg);
123{
124 struct blkio_policy_type *blkiop;
125 83
126 list_for_each_entry(blkiop, &blkio_list, list) { 84 kfree(pd);
127 /* If this policy does not own the blkg, do not send updates */
128 if (blkiop->plid != blkg->plid)
129 continue;
130 if (blkiop->ops.blkio_update_group_weight_fn)
131 blkiop->ops.blkio_update_group_weight_fn(blkg->key,
132 blkg, weight);
133 } 85 }
86
87 kfree(blkg);
134} 88}
135 89
136static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps, 90/**
137 int fileid) 91 * blkg_alloc - allocate a blkg
92 * @blkcg: block cgroup the new blkg is associated with
93 * @q: request_queue the new blkg is associated with
94 *
95 * Allocate a new blkg assocating @blkcg and @q.
96 */
97static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q)
138{ 98{
139 struct blkio_policy_type *blkiop; 99 struct blkcg_gq *blkg;
140 100 int i;
141 list_for_each_entry(blkiop, &blkio_list, list) {
142
143 /* If this policy does not own the blkg, do not send updates */
144 if (blkiop->plid != blkg->plid)
145 continue;
146 101
147 if (fileid == BLKIO_THROTL_read_bps_device 102 /* alloc and init base part */
148 && blkiop->ops.blkio_update_group_read_bps_fn) 103 blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node);
149 blkiop->ops.blkio_update_group_read_bps_fn(blkg->key, 104 if (!blkg)
150 blkg, bps); 105 return NULL;
151 106
152 if (fileid == BLKIO_THROTL_write_bps_device 107 blkg->q = q;
153 && blkiop->ops.blkio_update_group_write_bps_fn) 108 INIT_LIST_HEAD(&blkg->q_node);
154 blkiop->ops.blkio_update_group_write_bps_fn(blkg->key, 109 blkg->blkcg = blkcg;
155 blkg, bps); 110 blkg->refcnt = 1;
156 }
157}
158
159static inline void blkio_update_group_iops(struct blkio_group *blkg,
160 unsigned int iops, int fileid)
161{
162 struct blkio_policy_type *blkiop;
163 111
164 list_for_each_entry(blkiop, &blkio_list, list) { 112 for (i = 0; i < BLKCG_MAX_POLS; i++) {
113 struct blkcg_policy *pol = blkcg_policy[i];
114 struct blkg_policy_data *pd;
165 115
166 /* If this policy does not own the blkg, do not send updates */ 116 if (!blkcg_policy_enabled(q, pol))
167 if (blkiop->plid != blkg->plid)
168 continue; 117 continue;
169 118
170 if (fileid == BLKIO_THROTL_read_iops_device 119 /* alloc per-policy data and attach it to blkg */
171 && blkiop->ops.blkio_update_group_read_iops_fn) 120 pd = kzalloc_node(pol->pd_size, GFP_ATOMIC, q->node);
172 blkiop->ops.blkio_update_group_read_iops_fn(blkg->key, 121 if (!pd) {
173 blkg, iops); 122 blkg_free(blkg);
123 return NULL;
124 }
174 125
175 if (fileid == BLKIO_THROTL_write_iops_device 126 blkg->pd[i] = pd;
176 && blkiop->ops.blkio_update_group_write_iops_fn) 127 pd->blkg = blkg;
177 blkiop->ops.blkio_update_group_write_iops_fn(blkg->key,
178 blkg,iops);
179 } 128 }
180}
181 129
182/* 130 /* invoke per-policy init */
183 * Add to the appropriate stat variable depending on the request type. 131 for (i = 0; i < BLKCG_MAX_POLS; i++) {
184 * This should be called with the blkg->stats_lock held. 132 struct blkcg_policy *pol = blkcg_policy[i];
185 */
186static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
187 bool sync)
188{
189 if (direction)
190 stat[BLKIO_STAT_WRITE] += add;
191 else
192 stat[BLKIO_STAT_READ] += add;
193 if (sync)
194 stat[BLKIO_STAT_SYNC] += add;
195 else
196 stat[BLKIO_STAT_ASYNC] += add;
197}
198 133
199/* 134 if (blkcg_policy_enabled(blkg->q, pol))
200 * Decrements the appropriate stat variable if non-zero depending on the 135 pol->pd_init_fn(blkg);
201 * request type. Panics on value being zero.
202 * This should be called with the blkg->stats_lock held.
203 */
204static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
205{
206 if (direction) {
207 BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
208 stat[BLKIO_STAT_WRITE]--;
209 } else {
210 BUG_ON(stat[BLKIO_STAT_READ] == 0);
211 stat[BLKIO_STAT_READ]--;
212 } 136 }
213 if (sync) {
214 BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
215 stat[BLKIO_STAT_SYNC]--;
216 } else {
217 BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
218 stat[BLKIO_STAT_ASYNC]--;
219 }
220}
221 137
222#ifdef CONFIG_DEBUG_BLK_CGROUP 138 return blkg;
223/* This should be called with the blkg->stats_lock held. */
224static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
225 struct blkio_group *curr_blkg)
226{
227 if (blkio_blkg_waiting(&blkg->stats))
228 return;
229 if (blkg == curr_blkg)
230 return;
231 blkg->stats.start_group_wait_time = sched_clock();
232 blkio_mark_blkg_waiting(&blkg->stats);
233} 139}
234 140
235/* This should be called with the blkg->stats_lock held. */ 141static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
236static void blkio_update_group_wait_time(struct blkio_group_stats *stats) 142 struct request_queue *q)
237{ 143{
238 unsigned long long now; 144 struct blkcg_gq *blkg;
239 145
240 if (!blkio_blkg_waiting(stats)) 146 blkg = rcu_dereference(blkcg->blkg_hint);
241 return; 147 if (blkg && blkg->q == q)
148 return blkg;
149
150 /*
151 * Hint didn't match. Look up from the radix tree. Note that we
152 * may not be holding queue_lock and thus are not sure whether
153 * @blkg from blkg_tree has already been removed or not, so we
154 * can't update hint to the lookup result. Leave it to the caller.
155 */
156 blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
157 if (blkg && blkg->q == q)
158 return blkg;
242 159
243 now = sched_clock(); 160 return NULL;
244 if (time_after64(now, stats->start_group_wait_time))
245 stats->group_wait_time += now - stats->start_group_wait_time;
246 blkio_clear_blkg_waiting(stats);
247} 161}
248 162
249/* This should be called with the blkg->stats_lock held. */ 163/**
250static void blkio_end_empty_time(struct blkio_group_stats *stats) 164 * blkg_lookup - lookup blkg for the specified blkcg - q pair
165 * @blkcg: blkcg of interest
166 * @q: request_queue of interest
167 *
168 * Lookup blkg for the @blkcg - @q pair. This function should be called
169 * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
170 * - see blk_queue_bypass_start() for details.
171 */
172struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q)
251{ 173{
252 unsigned long long now; 174 WARN_ON_ONCE(!rcu_read_lock_held());
253
254 if (!blkio_blkg_empty(stats))
255 return;
256 175
257 now = sched_clock(); 176 if (unlikely(blk_queue_bypass(q)))
258 if (time_after64(now, stats->start_empty_time)) 177 return NULL;
259 stats->empty_time += now - stats->start_empty_time; 178 return __blkg_lookup(blkcg, q);
260 blkio_clear_blkg_empty(stats);
261} 179}
180EXPORT_SYMBOL_GPL(blkg_lookup);
262 181
263void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) 182static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
183 struct request_queue *q)
184 __releases(q->queue_lock) __acquires(q->queue_lock)
264{ 185{
265 unsigned long flags; 186 struct blkcg_gq *blkg;
187 int ret;
266 188
267 spin_lock_irqsave(&blkg->stats_lock, flags); 189 WARN_ON_ONCE(!rcu_read_lock_held());
268 BUG_ON(blkio_blkg_idling(&blkg->stats)); 190 lockdep_assert_held(q->queue_lock);
269 blkg->stats.start_idle_time = sched_clock();
270 blkio_mark_blkg_idling(&blkg->stats);
271 spin_unlock_irqrestore(&blkg->stats_lock, flags);
272}
273EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);
274 191
275void blkiocg_update_idle_time_stats(struct blkio_group *blkg) 192 /* lookup and update hint on success, see __blkg_lookup() for details */
276{ 193 blkg = __blkg_lookup(blkcg, q);
277 unsigned long flags; 194 if (blkg) {
278 unsigned long long now; 195 rcu_assign_pointer(blkcg->blkg_hint, blkg);
279 struct blkio_group_stats *stats; 196 return blkg;
280
281 spin_lock_irqsave(&blkg->stats_lock, flags);
282 stats = &blkg->stats;
283 if (blkio_blkg_idling(stats)) {
284 now = sched_clock();
285 if (time_after64(now, stats->start_idle_time))
286 stats->idle_time += now - stats->start_idle_time;
287 blkio_clear_blkg_idling(stats);
288 } 197 }
289 spin_unlock_irqrestore(&blkg->stats_lock, flags);
290}
291EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);
292 198
293void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) 199 /* blkg holds a reference to blkcg */
294{ 200 if (!css_tryget(&blkcg->css))
295 unsigned long flags; 201 return ERR_PTR(-EINVAL);
296 struct blkio_group_stats *stats;
297
298 spin_lock_irqsave(&blkg->stats_lock, flags);
299 stats = &blkg->stats;
300 stats->avg_queue_size_sum +=
301 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
302 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
303 stats->avg_queue_size_samples++;
304 blkio_update_group_wait_time(stats);
305 spin_unlock_irqrestore(&blkg->stats_lock, flags);
306}
307EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);
308 202
309void blkiocg_set_start_empty_time(struct blkio_group *blkg) 203 /* allocate */
310{ 204 ret = -ENOMEM;
311 unsigned long flags; 205 blkg = blkg_alloc(blkcg, q);
312 struct blkio_group_stats *stats; 206 if (unlikely(!blkg))
207 goto err_put;
313 208
314 spin_lock_irqsave(&blkg->stats_lock, flags); 209 /* insert */
315 stats = &blkg->stats; 210 ret = radix_tree_preload(GFP_ATOMIC);
211 if (ret)
212 goto err_free;
316 213
317 if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] || 214 spin_lock(&blkcg->lock);
318 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) { 215 ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
319 spin_unlock_irqrestore(&blkg->stats_lock, flags); 216 if (likely(!ret)) {
320 return; 217 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
218 list_add(&blkg->q_node, &q->blkg_list);
321 } 219 }
220 spin_unlock(&blkcg->lock);
322 221
323 /* 222 radix_tree_preload_end();
324 * group is already marked empty. This can happen if cfqq got new
325 * request in parent group and moved to this group while being added
326 * to service tree. Just ignore the event and move on.
327 */
328 if(blkio_blkg_empty(stats)) {
329 spin_unlock_irqrestore(&blkg->stats_lock, flags);
330 return;
331 }
332 223
333 stats->start_empty_time = sched_clock(); 224 if (!ret)
334 blkio_mark_blkg_empty(stats); 225 return blkg;
335 spin_unlock_irqrestore(&blkg->stats_lock, flags); 226err_free:
227 blkg_free(blkg);
228err_put:
229 css_put(&blkcg->css);
230 return ERR_PTR(ret);
336} 231}
337EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);
338 232
339void blkiocg_update_dequeue_stats(struct blkio_group *blkg, 233struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
340 unsigned long dequeue) 234 struct request_queue *q)
341{
342 blkg->stats.dequeue += dequeue;
343}
344EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
345#else
346static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
347 struct blkio_group *curr_blkg) {}
348static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {}
349#endif
350
351void blkiocg_update_io_add_stats(struct blkio_group *blkg,
352 struct blkio_group *curr_blkg, bool direction,
353 bool sync)
354{ 235{
355 unsigned long flags; 236 /*
356 237 * This could be the first entry point of blkcg implementation and
357 spin_lock_irqsave(&blkg->stats_lock, flags); 238 * we shouldn't allow anything to go through for a bypassing queue.
358 blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction, 239 */
359 sync); 240 if (unlikely(blk_queue_bypass(q)))
360 blkio_end_empty_time(&blkg->stats); 241 return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY);
361 blkio_set_start_group_wait_time(blkg, curr_blkg); 242 return __blkg_lookup_create(blkcg, q);
362 spin_unlock_irqrestore(&blkg->stats_lock, flags);
363} 243}
364EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats); 244EXPORT_SYMBOL_GPL(blkg_lookup_create);
365 245
366void blkiocg_update_io_remove_stats(struct blkio_group *blkg, 246static void blkg_destroy(struct blkcg_gq *blkg)
367 bool direction, bool sync)
368{ 247{
369 unsigned long flags; 248 struct request_queue *q = blkg->q;
249 struct blkcg *blkcg = blkg->blkcg;
370 250
371 spin_lock_irqsave(&blkg->stats_lock, flags); 251 lockdep_assert_held(q->queue_lock);
372 blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 252 lockdep_assert_held(&blkcg->lock);
373 direction, sync);
374 spin_unlock_irqrestore(&blkg->stats_lock, flags);
375}
376EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
377 253
378void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time, 254 /* Something wrong if we are trying to remove same group twice */
379 unsigned long unaccounted_time) 255 WARN_ON_ONCE(list_empty(&blkg->q_node));
380{ 256 WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
381 unsigned long flags;
382
383 spin_lock_irqsave(&blkg->stats_lock, flags);
384 blkg->stats.time += time;
385#ifdef CONFIG_DEBUG_BLK_CGROUP
386 blkg->stats.unaccounted_time += unaccounted_time;
387#endif
388 spin_unlock_irqrestore(&blkg->stats_lock, flags);
389}
390EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
391 257
392/* 258 radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
393 * should be called under rcu read lock or queue lock to make sure blkg pointer 259 list_del_init(&blkg->q_node);
394 * is valid. 260 hlist_del_init_rcu(&blkg->blkcg_node);
395 */
396void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
397 uint64_t bytes, bool direction, bool sync)
398{
399 struct blkio_group_stats_cpu *stats_cpu;
400 unsigned long flags;
401 261
402 /* 262 /*
403 * Disabling interrupts to provide mutual exclusion between two 263 * Both setting lookup hint to and clearing it from @blkg are done
404 * writes on same cpu. It probably is not needed for 64bit. Not 264 * under queue_lock. If it's not pointing to @blkg now, it never
405 * optimizing that case yet. 265 * will. Hint assignment itself can race safely.
406 */ 266 */
407 local_irq_save(flags); 267 if (rcu_dereference_raw(blkcg->blkg_hint) == blkg)
408 268 rcu_assign_pointer(blkcg->blkg_hint, NULL);
409 stats_cpu = this_cpu_ptr(blkg->stats_cpu);
410
411 u64_stats_update_begin(&stats_cpu->syncp);
412 stats_cpu->sectors += bytes >> 9;
413 blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED],
414 1, direction, sync);
415 blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES],
416 bytes, direction, sync);
417 u64_stats_update_end(&stats_cpu->syncp);
418 local_irq_restore(flags);
419}
420EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
421
422void blkiocg_update_completion_stats(struct blkio_group *blkg,
423 uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
424{
425 struct blkio_group_stats *stats;
426 unsigned long flags;
427 unsigned long long now = sched_clock();
428
429 spin_lock_irqsave(&blkg->stats_lock, flags);
430 stats = &blkg->stats;
431 if (time_after64(now, io_start_time))
432 blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
433 now - io_start_time, direction, sync);
434 if (time_after64(io_start_time, start_time))
435 blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
436 io_start_time - start_time, direction, sync);
437 spin_unlock_irqrestore(&blkg->stats_lock, flags);
438}
439EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
440
441/* Merged stats are per cpu. */
442void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
443 bool sync)
444{
445 struct blkio_group_stats_cpu *stats_cpu;
446 unsigned long flags;
447 269
448 /* 270 /*
449 * Disabling interrupts to provide mutual exclusion between two 271 * Put the reference taken at the time of creation so that when all
450 * writes on same cpu. It probably is not needed for 64bit. Not 272 * queues are gone, group can be destroyed.
451 * optimizing that case yet.
452 */ 273 */
453 local_irq_save(flags); 274 blkg_put(blkg);
454
455 stats_cpu = this_cpu_ptr(blkg->stats_cpu);
456
457 u64_stats_update_begin(&stats_cpu->syncp);
458 blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1,
459 direction, sync);
460 u64_stats_update_end(&stats_cpu->syncp);
461 local_irq_restore(flags);
462} 275}
463EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
464 276
465/* 277/**
466 * This function allocates the per cpu stats for blkio_group. Should be called 278 * blkg_destroy_all - destroy all blkgs associated with a request_queue
467 * from sleepable context as alloc_per_cpu() requires that. 279 * @q: request_queue of interest
280 *
281 * Destroy all blkgs associated with @q.
468 */ 282 */
469int blkio_alloc_blkg_stats(struct blkio_group *blkg) 283static void blkg_destroy_all(struct request_queue *q)
470{ 284{
471 /* Allocate memory for per cpu stats */ 285 struct blkcg_gq *blkg, *n;
472 blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
473 if (!blkg->stats_cpu)
474 return -ENOMEM;
475 return 0;
476}
477EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats);
478 286
479void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 287 lockdep_assert_held(q->queue_lock);
480 struct blkio_group *blkg, void *key, dev_t dev,
481 enum blkio_policy_id plid)
482{
483 unsigned long flags;
484
485 spin_lock_irqsave(&blkcg->lock, flags);
486 spin_lock_init(&blkg->stats_lock);
487 rcu_assign_pointer(blkg->key, key);
488 blkg->blkcg_id = css_id(&blkcg->css);
489 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
490 blkg->plid = plid;
491 spin_unlock_irqrestore(&blkcg->lock, flags);
492 /* Need to take css reference ? */
493 cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
494 blkg->dev = dev;
495}
496EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);
497
498static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
499{
500 hlist_del_init_rcu(&blkg->blkcg_node);
501 blkg->blkcg_id = 0;
502}
503 288
504/* 289 list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
505 * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1 290 struct blkcg *blkcg = blkg->blkcg;
506 * indicating that blk_group was unhashed by the time we got to it.
507 */
508int blkiocg_del_blkio_group(struct blkio_group *blkg)
509{
510 struct blkio_cgroup *blkcg;
511 unsigned long flags;
512 struct cgroup_subsys_state *css;
513 int ret = 1;
514 291
515 rcu_read_lock(); 292 spin_lock(&blkcg->lock);
516 css = css_lookup(&blkio_subsys, blkg->blkcg_id); 293 blkg_destroy(blkg);
517 if (css) { 294 spin_unlock(&blkcg->lock);
518 blkcg = container_of(css, struct blkio_cgroup, css);
519 spin_lock_irqsave(&blkcg->lock, flags);
520 if (!hlist_unhashed(&blkg->blkcg_node)) {
521 __blkiocg_del_blkio_group(blkg);
522 ret = 0;
523 }
524 spin_unlock_irqrestore(&blkcg->lock, flags);
525 } 295 }
526
527 rcu_read_unlock();
528 return ret;
529} 296}
530EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);
531 297
532/* called under rcu_read_lock(). */ 298static void blkg_rcu_free(struct rcu_head *rcu_head)
533struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
534{ 299{
535 struct blkio_group *blkg; 300 blkg_free(container_of(rcu_head, struct blkcg_gq, rcu_head));
536 struct hlist_node *n;
537 void *__key;
538
539 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
540 __key = blkg->key;
541 if (__key == key)
542 return blkg;
543 }
544
545 return NULL;
546} 301}
547EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
548 302
549static void blkio_reset_stats_cpu(struct blkio_group *blkg) 303void __blkg_release(struct blkcg_gq *blkg)
550{ 304{
551 struct blkio_group_stats_cpu *stats_cpu; 305 /* release the extra blkcg reference this blkg has been holding */
552 int i, j, k; 306 css_put(&blkg->blkcg->css);
307
553 /* 308 /*
554 * Note: On 64 bit arch this should not be an issue. This has the 309 * A group is freed in rcu manner. But having an rcu lock does not
555 * possibility of returning some inconsistent value on 32bit arch 310 * mean that one can access all the fields of blkg and assume these
556 * as 64bit update on 32bit is non atomic. Taking care of this 311 * are valid. For example, don't try to follow throtl_data and
557 * corner case makes code very complicated, like sending IPIs to 312 * request queue links.
558 * cpus, taking care of stats of offline cpus etc.
559 * 313 *
560 * reset stats is anyway more of a debug feature and this sounds a 314 * Having a reference to blkg under an rcu allows acess to only
561 * corner case. So I am not complicating the code yet until and 315 * values local to groups like group stats and group rate limits
562 * unless this becomes a real issue.
563 */ 316 */
564 for_each_possible_cpu(i) { 317 call_rcu(&blkg->rcu_head, blkg_rcu_free);
565 stats_cpu = per_cpu_ptr(blkg->stats_cpu, i);
566 stats_cpu->sectors = 0;
567 for(j = 0; j < BLKIO_STAT_CPU_NR; j++)
568 for (k = 0; k < BLKIO_STAT_TOTAL; k++)
569 stats_cpu->stat_arr_cpu[j][k] = 0;
570 }
571} 318}
319EXPORT_SYMBOL_GPL(__blkg_release);
572 320
573static int 321static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype,
574blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) 322 u64 val)
575{ 323{
576 struct blkio_cgroup *blkcg; 324 struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
577 struct blkio_group *blkg; 325 struct blkcg_gq *blkg;
578 struct blkio_group_stats *stats;
579 struct hlist_node *n; 326 struct hlist_node *n;
580 uint64_t queued[BLKIO_STAT_TOTAL];
581 int i; 327 int i;
582#ifdef CONFIG_DEBUG_BLK_CGROUP
583 bool idling, waiting, empty;
584 unsigned long long now = sched_clock();
585#endif
586 328
587 blkcg = cgroup_to_blkio_cgroup(cgroup); 329 mutex_lock(&blkcg_pol_mutex);
588 spin_lock_irq(&blkcg->lock); 330 spin_lock_irq(&blkcg->lock);
331
332 /*
333 * Note that stat reset is racy - it doesn't synchronize against
334 * stat updates. This is a debug feature which shouldn't exist
335 * anyway. If you get hit by a race, retry.
336 */
589 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { 337 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
590 spin_lock(&blkg->stats_lock); 338 for (i = 0; i < BLKCG_MAX_POLS; i++) {
591 stats = &blkg->stats; 339 struct blkcg_policy *pol = blkcg_policy[i];
592#ifdef CONFIG_DEBUG_BLK_CGROUP
593 idling = blkio_blkg_idling(stats);
594 waiting = blkio_blkg_waiting(stats);
595 empty = blkio_blkg_empty(stats);
596#endif
597 for (i = 0; i < BLKIO_STAT_TOTAL; i++)
598 queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
599 memset(stats, 0, sizeof(struct blkio_group_stats));
600 for (i = 0; i < BLKIO_STAT_TOTAL; i++)
601 stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
602#ifdef CONFIG_DEBUG_BLK_CGROUP
603 if (idling) {
604 blkio_mark_blkg_idling(stats);
605 stats->start_idle_time = now;
606 }
607 if (waiting) {
608 blkio_mark_blkg_waiting(stats);
609 stats->start_group_wait_time = now;
610 }
611 if (empty) {
612 blkio_mark_blkg_empty(stats);
613 stats->start_empty_time = now;
614 }
615#endif
616 spin_unlock(&blkg->stats_lock);
617 340
618 /* Reset Per cpu stats which don't take blkg->stats_lock */ 341 if (blkcg_policy_enabled(blkg->q, pol) &&
619 blkio_reset_stats_cpu(blkg); 342 pol->pd_reset_stats_fn)
343 pol->pd_reset_stats_fn(blkg);
344 }
620 } 345 }
621 346
622 spin_unlock_irq(&blkcg->lock); 347 spin_unlock_irq(&blkcg->lock);
348 mutex_unlock(&blkcg_pol_mutex);
623 return 0; 349 return 0;
624} 350}
625 351
626static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str, 352static const char *blkg_dev_name(struct blkcg_gq *blkg)
627 int chars_left, bool diskname_only)
628{ 353{
629 snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev)); 354 /* some drivers (floppy) instantiate a queue w/o disk registered */
630 chars_left -= strlen(str); 355 if (blkg->q->backing_dev_info.dev)
631 if (chars_left <= 0) { 356 return dev_name(blkg->q->backing_dev_info.dev);
632 printk(KERN_WARNING 357 return NULL;
633 "Possibly incorrect cgroup stat display format");
634 return;
635 }
636 if (diskname_only)
637 return;
638 switch (type) {
639 case BLKIO_STAT_READ:
640 strlcat(str, " Read", chars_left);
641 break;
642 case BLKIO_STAT_WRITE:
643 strlcat(str, " Write", chars_left);
644 break;
645 case BLKIO_STAT_SYNC:
646 strlcat(str, " Sync", chars_left);
647 break;
648 case BLKIO_STAT_ASYNC:
649 strlcat(str, " Async", chars_left);
650 break;
651 case BLKIO_STAT_TOTAL:
652 strlcat(str, " Total", chars_left);
653 break;
654 default:
655 strlcat(str, " Invalid", chars_left);
656 }
657} 358}
658 359
659static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val, 360/**
660 struct cgroup_map_cb *cb, dev_t dev) 361 * blkcg_print_blkgs - helper for printing per-blkg data
362 * @sf: seq_file to print to
363 * @blkcg: blkcg of interest
364 * @prfill: fill function to print out a blkg
365 * @pol: policy in question
366 * @data: data to be passed to @prfill
367 * @show_total: to print out sum of prfill return values or not
368 *
369 * This function invokes @prfill on each blkg of @blkcg if pd for the
370 * policy specified by @pol exists. @prfill is invoked with @sf, the
371 * policy data and @data. If @show_total is %true, the sum of the return
372 * values from @prfill is printed with "Total" label at the end.
373 *
374 * This is to be used to construct print functions for
375 * cftype->read_seq_string method.
376 */
377void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
378 u64 (*prfill)(struct seq_file *,
379 struct blkg_policy_data *, int),
380 const struct blkcg_policy *pol, int data,
381 bool show_total)
661{ 382{
662 blkio_get_key_name(0, dev, str, chars_left, true); 383 struct blkcg_gq *blkg;
663 cb->fill(cb, str, val); 384 struct hlist_node *n;
664 return val; 385 u64 total = 0;
665}
666
667 386
668static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg, 387 spin_lock_irq(&blkcg->lock);
669 enum stat_type_cpu type, enum stat_sub_type sub_type) 388 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
670{ 389 if (blkcg_policy_enabled(blkg->q, pol))
671 int cpu; 390 total += prfill(sf, blkg->pd[pol->plid], data);
672 struct blkio_group_stats_cpu *stats_cpu; 391 spin_unlock_irq(&blkcg->lock);
673 u64 val = 0, tval;
674
675 for_each_possible_cpu(cpu) {
676 unsigned int start;
677 stats_cpu = per_cpu_ptr(blkg->stats_cpu, cpu);
678
679 do {
680 start = u64_stats_fetch_begin(&stats_cpu->syncp);
681 if (type == BLKIO_STAT_CPU_SECTORS)
682 tval = stats_cpu->sectors;
683 else
684 tval = stats_cpu->stat_arr_cpu[type][sub_type];
685 } while(u64_stats_fetch_retry(&stats_cpu->syncp, start));
686
687 val += tval;
688 }
689 392
690 return val; 393 if (show_total)
394 seq_printf(sf, "Total %llu\n", (unsigned long long)total);
691} 395}
396EXPORT_SYMBOL_GPL(blkcg_print_blkgs);
692 397
693static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg, 398/**
694 struct cgroup_map_cb *cb, dev_t dev, enum stat_type_cpu type) 399 * __blkg_prfill_u64 - prfill helper for a single u64 value
400 * @sf: seq_file to print to
401 * @pd: policy private data of interest
402 * @v: value to print
403 *
404 * Print @v to @sf for the device assocaited with @pd.
405 */
406u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
695{ 407{
696 uint64_t disk_total, val; 408 const char *dname = blkg_dev_name(pd->blkg);
697 char key_str[MAX_KEY_LEN];
698 enum stat_sub_type sub_type;
699
700 if (type == BLKIO_STAT_CPU_SECTORS) {
701 val = blkio_read_stat_cpu(blkg, type, 0);
702 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb, dev);
703 }
704 409
705 for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL; 410 if (!dname)
706 sub_type++) { 411 return 0;
707 blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
708 val = blkio_read_stat_cpu(blkg, type, sub_type);
709 cb->fill(cb, key_str, val);
710 }
711 412
712 disk_total = blkio_read_stat_cpu(blkg, type, BLKIO_STAT_READ) + 413 seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
713 blkio_read_stat_cpu(blkg, type, BLKIO_STAT_WRITE); 414 return v;
714
715 blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
716 cb->fill(cb, key_str, disk_total);
717 return disk_total;
718} 415}
416EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
719 417
720/* This should be called with blkg->stats_lock held */ 418/**
721static uint64_t blkio_get_stat(struct blkio_group *blkg, 419 * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
722 struct cgroup_map_cb *cb, dev_t dev, enum stat_type type) 420 * @sf: seq_file to print to
723{ 421 * @pd: policy private data of interest
724 uint64_t disk_total; 422 * @rwstat: rwstat to print
725 char key_str[MAX_KEY_LEN]; 423 *
726 enum stat_sub_type sub_type; 424 * Print @rwstat to @sf for the device associated with @pd.
727 425 */
728 if (type == BLKIO_STAT_TIME) 426u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
729 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, 427 const struct blkg_rwstat *rwstat)
730 blkg->stats.time, cb, dev); 428{
731#ifdef CONFIG_DEBUG_BLK_CGROUP 429 static const char *rwstr[] = {
732 if (type == BLKIO_STAT_UNACCOUNTED_TIME) 430 [BLKG_RWSTAT_READ] = "Read",
733 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, 431 [BLKG_RWSTAT_WRITE] = "Write",
734 blkg->stats.unaccounted_time, cb, dev); 432 [BLKG_RWSTAT_SYNC] = "Sync",
735 if (type == BLKIO_STAT_AVG_QUEUE_SIZE) { 433 [BLKG_RWSTAT_ASYNC] = "Async",
736 uint64_t sum = blkg->stats.avg_queue_size_sum; 434 };
737 uint64_t samples = blkg->stats.avg_queue_size_samples; 435 const char *dname = blkg_dev_name(pd->blkg);
738 if (samples) 436 u64 v;
739 do_div(sum, samples); 437 int i;
740 else
741 sum = 0;
742 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev);
743 }
744 if (type == BLKIO_STAT_GROUP_WAIT_TIME)
745 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
746 blkg->stats.group_wait_time, cb, dev);
747 if (type == BLKIO_STAT_IDLE_TIME)
748 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
749 blkg->stats.idle_time, cb, dev);
750 if (type == BLKIO_STAT_EMPTY_TIME)
751 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
752 blkg->stats.empty_time, cb, dev);
753 if (type == BLKIO_STAT_DEQUEUE)
754 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
755 blkg->stats.dequeue, cb, dev);
756#endif
757
758 for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
759 sub_type++) {
760 blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
761 cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
762 }
763 disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
764 blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
765 blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
766 cb->fill(cb, key_str, disk_total);
767 return disk_total;
768}
769
770static int blkio_policy_parse_and_set(char *buf,
771 struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid)
772{
773 struct gendisk *disk = NULL;
774 char *s[4], *p, *major_s = NULL, *minor_s = NULL;
775 unsigned long major, minor;
776 int i = 0, ret = -EINVAL;
777 int part;
778 dev_t dev;
779 u64 temp;
780
781 memset(s, 0, sizeof(s));
782
783 while ((p = strsep(&buf, " ")) != NULL) {
784 if (!*p)
785 continue;
786
787 s[i++] = p;
788
789 /* Prevent from inputing too many things */
790 if (i == 3)
791 break;
792 }
793
794 if (i != 2)
795 goto out;
796
797 p = strsep(&s[0], ":");
798 if (p != NULL)
799 major_s = p;
800 else
801 goto out;
802
803 minor_s = s[0];
804 if (!minor_s)
805 goto out;
806
807 if (strict_strtoul(major_s, 10, &major))
808 goto out;
809
810 if (strict_strtoul(minor_s, 10, &minor))
811 goto out;
812
813 dev = MKDEV(major, minor);
814 438
815 if (strict_strtoull(s[1], 10, &temp)) 439 if (!dname)
816 goto out; 440 return 0;
817 441
818 /* For rule removal, do not check for device presence. */ 442 for (i = 0; i < BLKG_RWSTAT_NR; i++)
819 if (temp) { 443 seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
820 disk = get_gendisk(dev, &part); 444 (unsigned long long)rwstat->cnt[i]);
821 if (!disk || part) {
822 ret = -ENODEV;
823 goto out;
824 }
825 }
826 445
827 newpn->dev = dev; 446 v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE];
828 447 seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
829 switch (plid) { 448 return v;
830 case BLKIO_POLICY_PROP:
831 if ((temp < BLKIO_WEIGHT_MIN && temp > 0) ||
832 temp > BLKIO_WEIGHT_MAX)
833 goto out;
834
835 newpn->plid = plid;
836 newpn->fileid = fileid;
837 newpn->val.weight = temp;
838 break;
839 case BLKIO_POLICY_THROTL:
840 switch(fileid) {
841 case BLKIO_THROTL_read_bps_device:
842 case BLKIO_THROTL_write_bps_device:
843 newpn->plid = plid;
844 newpn->fileid = fileid;
845 newpn->val.bps = temp;
846 break;
847 case BLKIO_THROTL_read_iops_device:
848 case BLKIO_THROTL_write_iops_device:
849 if (temp > THROTL_IOPS_MAX)
850 goto out;
851
852 newpn->plid = plid;
853 newpn->fileid = fileid;
854 newpn->val.iops = (unsigned int)temp;
855 break;
856 }
857 break;
858 default:
859 BUG();
860 }
861 ret = 0;
862out:
863 put_disk(disk);
864 return ret;
865} 449}
866 450
867unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, 451/**
868 dev_t dev) 452 * blkg_prfill_stat - prfill callback for blkg_stat
453 * @sf: seq_file to print to
454 * @pd: policy private data of interest
455 * @off: offset to the blkg_stat in @pd
456 *
457 * prfill callback for printing a blkg_stat.
458 */
459u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off)
869{ 460{
870 struct blkio_policy_node *pn; 461 return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off));
871 unsigned long flags;
872 unsigned int weight;
873
874 spin_lock_irqsave(&blkcg->lock, flags);
875
876 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP,
877 BLKIO_PROP_weight_device);
878 if (pn)
879 weight = pn->val.weight;
880 else
881 weight = blkcg->weight;
882
883 spin_unlock_irqrestore(&blkcg->lock, flags);
884
885 return weight;
886} 462}
887EXPORT_SYMBOL_GPL(blkcg_get_weight); 463EXPORT_SYMBOL_GPL(blkg_prfill_stat);
888 464
889uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev) 465/**
466 * blkg_prfill_rwstat - prfill callback for blkg_rwstat
467 * @sf: seq_file to print to
468 * @pd: policy private data of interest
469 * @off: offset to the blkg_rwstat in @pd
470 *
471 * prfill callback for printing a blkg_rwstat.
472 */
473u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
474 int off)
890{ 475{
891 struct blkio_policy_node *pn; 476 struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off);
892 unsigned long flags;
893 uint64_t bps = -1;
894
895 spin_lock_irqsave(&blkcg->lock, flags);
896 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
897 BLKIO_THROTL_read_bps_device);
898 if (pn)
899 bps = pn->val.bps;
900 spin_unlock_irqrestore(&blkcg->lock, flags);
901
902 return bps;
903}
904 477
905uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev) 478 return __blkg_prfill_rwstat(sf, pd, &rwstat);
906{
907 struct blkio_policy_node *pn;
908 unsigned long flags;
909 uint64_t bps = -1;
910
911 spin_lock_irqsave(&blkcg->lock, flags);
912 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
913 BLKIO_THROTL_write_bps_device);
914 if (pn)
915 bps = pn->val.bps;
916 spin_unlock_irqrestore(&blkcg->lock, flags);
917
918 return bps;
919} 479}
480EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
920 481
921unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev) 482/**
483 * blkg_conf_prep - parse and prepare for per-blkg config update
484 * @blkcg: target block cgroup
485 * @pol: target policy
486 * @input: input string
487 * @ctx: blkg_conf_ctx to be filled
488 *
489 * Parse per-blkg config update from @input and initialize @ctx with the
490 * result. @ctx->blkg points to the blkg to be updated and @ctx->v the new
491 * value. This function returns with RCU read lock and queue lock held and
492 * must be paired with blkg_conf_finish().
493 */
494int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
495 const char *input, struct blkg_conf_ctx *ctx)
496 __acquires(rcu) __acquires(disk->queue->queue_lock)
922{ 497{
923 struct blkio_policy_node *pn; 498 struct gendisk *disk;
924 unsigned long flags; 499 struct blkcg_gq *blkg;
925 unsigned int iops = -1; 500 unsigned int major, minor;
926 501 unsigned long long v;
927 spin_lock_irqsave(&blkcg->lock, flags); 502 int part, ret;
928 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
929 BLKIO_THROTL_read_iops_device);
930 if (pn)
931 iops = pn->val.iops;
932 spin_unlock_irqrestore(&blkcg->lock, flags);
933
934 return iops;
935}
936 503
937unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev) 504 if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3)
938{ 505 return -EINVAL;
939 struct blkio_policy_node *pn;
940 unsigned long flags;
941 unsigned int iops = -1;
942
943 spin_lock_irqsave(&blkcg->lock, flags);
944 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
945 BLKIO_THROTL_write_iops_device);
946 if (pn)
947 iops = pn->val.iops;
948 spin_unlock_irqrestore(&blkcg->lock, flags);
949
950 return iops;
951}
952 506
953/* Checks whether user asked for deleting a policy rule */ 507 disk = get_gendisk(MKDEV(major, minor), &part);
954static bool blkio_delete_rule_command(struct blkio_policy_node *pn) 508 if (!disk || part)
955{ 509 return -EINVAL;
956 switch(pn->plid) {
957 case BLKIO_POLICY_PROP:
958 if (pn->val.weight == 0)
959 return 1;
960 break;
961 case BLKIO_POLICY_THROTL:
962 switch(pn->fileid) {
963 case BLKIO_THROTL_read_bps_device:
964 case BLKIO_THROTL_write_bps_device:
965 if (pn->val.bps == 0)
966 return 1;
967 break;
968 case BLKIO_THROTL_read_iops_device:
969 case BLKIO_THROTL_write_iops_device:
970 if (pn->val.iops == 0)
971 return 1;
972 }
973 break;
974 default:
975 BUG();
976 }
977 510
978 return 0; 511 rcu_read_lock();
979} 512 spin_lock_irq(disk->queue->queue_lock);
980 513
981static void blkio_update_policy_rule(struct blkio_policy_node *oldpn, 514 if (blkcg_policy_enabled(disk->queue, pol))
982 struct blkio_policy_node *newpn) 515 blkg = blkg_lookup_create(blkcg, disk->queue);
983{ 516 else
984 switch(oldpn->plid) { 517 blkg = ERR_PTR(-EINVAL);
985 case BLKIO_POLICY_PROP: 518
986 oldpn->val.weight = newpn->val.weight; 519 if (IS_ERR(blkg)) {
987 break; 520 ret = PTR_ERR(blkg);
988 case BLKIO_POLICY_THROTL: 521 rcu_read_unlock();
989 switch(newpn->fileid) { 522 spin_unlock_irq(disk->queue->queue_lock);
990 case BLKIO_THROTL_read_bps_device: 523 put_disk(disk);
991 case BLKIO_THROTL_write_bps_device: 524 /*
992 oldpn->val.bps = newpn->val.bps; 525 * If queue was bypassing, we should retry. Do so after a
993 break; 526 * short msleep(). It isn't strictly necessary but queue
994 case BLKIO_THROTL_read_iops_device: 527 * can be bypassing for some time and it's always nice to
995 case BLKIO_THROTL_write_iops_device: 528 * avoid busy looping.
996 oldpn->val.iops = newpn->val.iops; 529 */
530 if (ret == -EBUSY) {
531 msleep(10);
532 ret = restart_syscall();
997 } 533 }
998 break; 534 return ret;
999 default:
1000 BUG();
1001 } 535 }
536
537 ctx->disk = disk;
538 ctx->blkg = blkg;
539 ctx->v = v;
540 return 0;
1002} 541}
542EXPORT_SYMBOL_GPL(blkg_conf_prep);
1003 543
1004/* 544/**
1005 * Some rules/values in blkg have changed. Propagate those to respective 545 * blkg_conf_finish - finish up per-blkg config update
1006 * policies. 546 * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
547 *
548 * Finish up after per-blkg config update. This function must be paired
549 * with blkg_conf_prep().
1007 */ 550 */
1008static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg, 551void blkg_conf_finish(struct blkg_conf_ctx *ctx)
1009 struct blkio_group *blkg, struct blkio_policy_node *pn) 552 __releases(ctx->disk->queue->queue_lock) __releases(rcu)
1010{ 553{
1011 unsigned int weight, iops; 554 spin_unlock_irq(ctx->disk->queue->queue_lock);
1012 u64 bps; 555 rcu_read_unlock();
1013 556 put_disk(ctx->disk);
1014 switch(pn->plid) {
1015 case BLKIO_POLICY_PROP:
1016 weight = pn->val.weight ? pn->val.weight :
1017 blkcg->weight;
1018 blkio_update_group_weight(blkg, weight);
1019 break;
1020 case BLKIO_POLICY_THROTL:
1021 switch(pn->fileid) {
1022 case BLKIO_THROTL_read_bps_device:
1023 case BLKIO_THROTL_write_bps_device:
1024 bps = pn->val.bps ? pn->val.bps : (-1);
1025 blkio_update_group_bps(blkg, bps, pn->fileid);
1026 break;
1027 case BLKIO_THROTL_read_iops_device:
1028 case BLKIO_THROTL_write_iops_device:
1029 iops = pn->val.iops ? pn->val.iops : (-1);
1030 blkio_update_group_iops(blkg, iops, pn->fileid);
1031 break;
1032 }
1033 break;
1034 default:
1035 BUG();
1036 }
1037} 557}
558EXPORT_SYMBOL_GPL(blkg_conf_finish);
1038 559
1039/* 560struct cftype blkcg_files[] = {
1040 * A policy node rule has been updated. Propagate this update to all the 561 {
1041 * block groups which might be affected by this update. 562 .name = "reset_stats",
563 .write_u64 = blkcg_reset_stats,
564 },
565 { } /* terminate */
566};
567
568/**
569 * blkcg_pre_destroy - cgroup pre_destroy callback
570 * @cgroup: cgroup of interest
571 *
572 * This function is called when @cgroup is about to go away and responsible
573 * for shooting down all blkgs associated with @cgroup. blkgs should be
574 * removed while holding both q and blkcg locks. As blkcg lock is nested
575 * inside q lock, this function performs reverse double lock dancing.
576 *
577 * This is the blkcg counterpart of ioc_release_fn().
1042 */ 578 */
1043static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg, 579static int blkcg_pre_destroy(struct cgroup *cgroup)
1044 struct blkio_policy_node *pn)
1045{ 580{
1046 struct blkio_group *blkg; 581 struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
1047 struct hlist_node *n;
1048 582
1049 spin_lock(&blkio_list_lock);
1050 spin_lock_irq(&blkcg->lock); 583 spin_lock_irq(&blkcg->lock);
1051 584
1052 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { 585 while (!hlist_empty(&blkcg->blkg_list)) {
1053 if (pn->dev != blkg->dev || pn->plid != blkg->plid) 586 struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
1054 continue; 587 struct blkcg_gq, blkcg_node);
1055 blkio_update_blkg_policy(blkcg, blkg, pn); 588 struct request_queue *q = blkg->q;
589
590 if (spin_trylock(q->queue_lock)) {
591 blkg_destroy(blkg);
592 spin_unlock(q->queue_lock);
593 } else {
594 spin_unlock_irq(&blkcg->lock);
595 cpu_relax();
596 spin_lock_irq(&blkcg->lock);
597 }
1056 } 598 }
1057 599
1058 spin_unlock_irq(&blkcg->lock); 600 spin_unlock_irq(&blkcg->lock);
1059 spin_unlock(&blkio_list_lock); 601 return 0;
1060} 602}
1061 603
1062static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft, 604static void blkcg_destroy(struct cgroup *cgroup)
1063 const char *buffer)
1064{ 605{
1065 int ret = 0; 606 struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
1066 char *buf;
1067 struct blkio_policy_node *newpn, *pn;
1068 struct blkio_cgroup *blkcg;
1069 int keep_newpn = 0;
1070 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1071 int fileid = BLKIOFILE_ATTR(cft->private);
1072
1073 buf = kstrdup(buffer, GFP_KERNEL);
1074 if (!buf)
1075 return -ENOMEM;
1076
1077 newpn = kzalloc(sizeof(*newpn), GFP_KERNEL);
1078 if (!newpn) {
1079 ret = -ENOMEM;
1080 goto free_buf;
1081 }
1082 607
1083 ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid); 608 if (blkcg != &blkcg_root)
1084 if (ret) 609 kfree(blkcg);
1085 goto free_newpn; 610}
1086
1087 blkcg = cgroup_to_blkio_cgroup(cgrp);
1088
1089 spin_lock_irq(&blkcg->lock);
1090 611
1091 pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid); 612static struct cgroup_subsys_state *blkcg_create(struct cgroup *cgroup)
1092 if (!pn) { 613{
1093 if (!blkio_delete_rule_command(newpn)) { 614 static atomic64_t id_seq = ATOMIC64_INIT(0);
1094 blkio_policy_insert_node(blkcg, newpn); 615 struct blkcg *blkcg;
1095 keep_newpn = 1; 616 struct cgroup *parent = cgroup->parent;
1096 }
1097 spin_unlock_irq(&blkcg->lock);
1098 goto update_io_group;
1099 }
1100 617
1101 if (blkio_delete_rule_command(newpn)) { 618 if (!parent) {
1102 blkio_policy_delete_node(pn); 619 blkcg = &blkcg_root;
1103 kfree(pn); 620 goto done;
1104 spin_unlock_irq(&blkcg->lock);
1105 goto update_io_group;
1106 } 621 }
1107 spin_unlock_irq(&blkcg->lock);
1108 622
1109 blkio_update_policy_rule(pn, newpn); 623 blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
624 if (!blkcg)
625 return ERR_PTR(-ENOMEM);
1110 626
1111update_io_group: 627 blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT;
1112 blkio_update_policy_node_blkg(blkcg, newpn); 628 blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
629done:
630 spin_lock_init(&blkcg->lock);
631 INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
632 INIT_HLIST_HEAD(&blkcg->blkg_list);
1113 633
1114free_newpn: 634 return &blkcg->css;
1115 if (!keep_newpn)
1116 kfree(newpn);
1117free_buf:
1118 kfree(buf);
1119 return ret;
1120} 635}
1121 636
1122static void 637/**
1123blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn) 638 * blkcg_init_queue - initialize blkcg part of request queue
639 * @q: request_queue to initialize
640 *
641 * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
642 * part of new request_queue @q.
643 *
644 * RETURNS:
645 * 0 on success, -errno on failure.
646 */
647int blkcg_init_queue(struct request_queue *q)
1124{ 648{
1125 switch(pn->plid) { 649 might_sleep();
1126 case BLKIO_POLICY_PROP:
1127 if (pn->fileid == BLKIO_PROP_weight_device)
1128 seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
1129 MINOR(pn->dev), pn->val.weight);
1130 break;
1131 case BLKIO_POLICY_THROTL:
1132 switch(pn->fileid) {
1133 case BLKIO_THROTL_read_bps_device:
1134 case BLKIO_THROTL_write_bps_device:
1135 seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev),
1136 MINOR(pn->dev), pn->val.bps);
1137 break;
1138 case BLKIO_THROTL_read_iops_device:
1139 case BLKIO_THROTL_write_iops_device:
1140 seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
1141 MINOR(pn->dev), pn->val.iops);
1142 break;
1143 }
1144 break;
1145 default:
1146 BUG();
1147 }
1148}
1149 650
1150/* cgroup files which read their data from policy nodes end up here */ 651 return blk_throtl_init(q);
1151static void blkio_read_policy_node_files(struct cftype *cft,
1152 struct blkio_cgroup *blkcg, struct seq_file *m)
1153{
1154 struct blkio_policy_node *pn;
1155
1156 if (!list_empty(&blkcg->policy_list)) {
1157 spin_lock_irq(&blkcg->lock);
1158 list_for_each_entry(pn, &blkcg->policy_list, node) {
1159 if (!pn_matches_cftype(cft, pn))
1160 continue;
1161 blkio_print_policy_node(m, pn);
1162 }
1163 spin_unlock_irq(&blkcg->lock);
1164 }
1165} 652}
1166 653
1167static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft, 654/**
1168 struct seq_file *m) 655 * blkcg_drain_queue - drain blkcg part of request_queue
656 * @q: request_queue to drain
657 *
658 * Called from blk_drain_queue(). Responsible for draining blkcg part.
659 */
660void blkcg_drain_queue(struct request_queue *q)
1169{ 661{
1170 struct blkio_cgroup *blkcg; 662 lockdep_assert_held(q->queue_lock);
1171 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1172 int name = BLKIOFILE_ATTR(cft->private);
1173
1174 blkcg = cgroup_to_blkio_cgroup(cgrp);
1175
1176 switch(plid) {
1177 case BLKIO_POLICY_PROP:
1178 switch(name) {
1179 case BLKIO_PROP_weight_device:
1180 blkio_read_policy_node_files(cft, blkcg, m);
1181 return 0;
1182 default:
1183 BUG();
1184 }
1185 break;
1186 case BLKIO_POLICY_THROTL:
1187 switch(name){
1188 case BLKIO_THROTL_read_bps_device:
1189 case BLKIO_THROTL_write_bps_device:
1190 case BLKIO_THROTL_read_iops_device:
1191 case BLKIO_THROTL_write_iops_device:
1192 blkio_read_policy_node_files(cft, blkcg, m);
1193 return 0;
1194 default:
1195 BUG();
1196 }
1197 break;
1198 default:
1199 BUG();
1200 }
1201 663
1202 return 0; 664 blk_throtl_drain(q);
1203} 665}
1204 666
1205static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg, 667/**
1206 struct cftype *cft, struct cgroup_map_cb *cb, 668 * blkcg_exit_queue - exit and release blkcg part of request_queue
1207 enum stat_type type, bool show_total, bool pcpu) 669 * @q: request_queue being released
670 *
671 * Called from blk_release_queue(). Responsible for exiting blkcg part.
672 */
673void blkcg_exit_queue(struct request_queue *q)
1208{ 674{
1209 struct blkio_group *blkg; 675 spin_lock_irq(q->queue_lock);
1210 struct hlist_node *n; 676 blkg_destroy_all(q);
1211 uint64_t cgroup_total = 0; 677 spin_unlock_irq(q->queue_lock);
1212 678
1213 rcu_read_lock(); 679 blk_throtl_exit(q);
1214 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
1215 if (blkg->dev) {
1216 if (!cftype_blkg_same_policy(cft, blkg))
1217 continue;
1218 if (pcpu)
1219 cgroup_total += blkio_get_stat_cpu(blkg, cb,
1220 blkg->dev, type);
1221 else {
1222 spin_lock_irq(&blkg->stats_lock);
1223 cgroup_total += blkio_get_stat(blkg, cb,
1224 blkg->dev, type);
1225 spin_unlock_irq(&blkg->stats_lock);
1226 }
1227 }
1228 }
1229 if (show_total)
1230 cb->fill(cb, "Total", cgroup_total);
1231 rcu_read_unlock();
1232 return 0;
1233} 680}
1234 681
1235/* All map kind of cgroup file get serviced by this function */ 682/*
1236static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft, 683 * We cannot support shared io contexts, as we have no means to support
1237 struct cgroup_map_cb *cb) 684 * two tasks with the same ioc in two different groups without major rework
685 * of the main cic data structures. For now we allow a task to change
686 * its cgroup only if it's the only owner of its ioc.
687 */
688static int blkcg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1238{ 689{
1239 struct blkio_cgroup *blkcg; 690 struct task_struct *task;
1240 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); 691 struct io_context *ioc;
1241 int name = BLKIOFILE_ATTR(cft->private); 692 int ret = 0;
1242
1243 blkcg = cgroup_to_blkio_cgroup(cgrp);
1244
1245 switch(plid) {
1246 case BLKIO_POLICY_PROP:
1247 switch(name) {
1248 case BLKIO_PROP_time:
1249 return blkio_read_blkg_stats(blkcg, cft, cb,
1250 BLKIO_STAT_TIME, 0, 0);
1251 case BLKIO_PROP_sectors:
1252 return blkio_read_blkg_stats(blkcg, cft, cb,
1253 BLKIO_STAT_CPU_SECTORS, 0, 1);
1254 case BLKIO_PROP_io_service_bytes:
1255 return blkio_read_blkg_stats(blkcg, cft, cb,
1256 BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
1257 case BLKIO_PROP_io_serviced:
1258 return blkio_read_blkg_stats(blkcg, cft, cb,
1259 BLKIO_STAT_CPU_SERVICED, 1, 1);
1260 case BLKIO_PROP_io_service_time:
1261 return blkio_read_blkg_stats(blkcg, cft, cb,
1262 BLKIO_STAT_SERVICE_TIME, 1, 0);
1263 case BLKIO_PROP_io_wait_time:
1264 return blkio_read_blkg_stats(blkcg, cft, cb,
1265 BLKIO_STAT_WAIT_TIME, 1, 0);
1266 case BLKIO_PROP_io_merged:
1267 return blkio_read_blkg_stats(blkcg, cft, cb,
1268 BLKIO_STAT_CPU_MERGED, 1, 1);
1269 case BLKIO_PROP_io_queued:
1270 return blkio_read_blkg_stats(blkcg, cft, cb,
1271 BLKIO_STAT_QUEUED, 1, 0);
1272#ifdef CONFIG_DEBUG_BLK_CGROUP
1273 case BLKIO_PROP_unaccounted_time:
1274 return blkio_read_blkg_stats(blkcg, cft, cb,
1275 BLKIO_STAT_UNACCOUNTED_TIME, 0, 0);
1276 case BLKIO_PROP_dequeue:
1277 return blkio_read_blkg_stats(blkcg, cft, cb,
1278 BLKIO_STAT_DEQUEUE, 0, 0);
1279 case BLKIO_PROP_avg_queue_size:
1280 return blkio_read_blkg_stats(blkcg, cft, cb,
1281 BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0);
1282 case BLKIO_PROP_group_wait_time:
1283 return blkio_read_blkg_stats(blkcg, cft, cb,
1284 BLKIO_STAT_GROUP_WAIT_TIME, 0, 0);
1285 case BLKIO_PROP_idle_time:
1286 return blkio_read_blkg_stats(blkcg, cft, cb,
1287 BLKIO_STAT_IDLE_TIME, 0, 0);
1288 case BLKIO_PROP_empty_time:
1289 return blkio_read_blkg_stats(blkcg, cft, cb,
1290 BLKIO_STAT_EMPTY_TIME, 0, 0);
1291#endif
1292 default:
1293 BUG();
1294 }
1295 break;
1296 case BLKIO_POLICY_THROTL:
1297 switch(name){
1298 case BLKIO_THROTL_io_service_bytes:
1299 return blkio_read_blkg_stats(blkcg, cft, cb,
1300 BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
1301 case BLKIO_THROTL_io_serviced:
1302 return blkio_read_blkg_stats(blkcg, cft, cb,
1303 BLKIO_STAT_CPU_SERVICED, 1, 1);
1304 default:
1305 BUG();
1306 }
1307 break;
1308 default:
1309 BUG();
1310 }
1311 693
1312 return 0; 694 /* task_lock() is needed to avoid races with exit_io_context() */
695 cgroup_taskset_for_each(task, cgrp, tset) {
696 task_lock(task);
697 ioc = task->io_context;
698 if (ioc && atomic_read(&ioc->nr_tasks) > 1)
699 ret = -EINVAL;
700 task_unlock(task);
701 if (ret)
702 break;
703 }
704 return ret;
1313} 705}
1314 706
1315static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val) 707struct cgroup_subsys blkio_subsys = {
1316{ 708 .name = "blkio",
1317 struct blkio_group *blkg; 709 .create = blkcg_create,
1318 struct hlist_node *n; 710 .can_attach = blkcg_can_attach,
1319 struct blkio_policy_node *pn; 711 .pre_destroy = blkcg_pre_destroy,
1320 712 .destroy = blkcg_destroy,
1321 if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) 713 .subsys_id = blkio_subsys_id,
1322 return -EINVAL; 714 .base_cftypes = blkcg_files,
715 .module = THIS_MODULE,
716};
717EXPORT_SYMBOL_GPL(blkio_subsys);
1323 718
1324 spin_lock(&blkio_list_lock); 719/**
1325 spin_lock_irq(&blkcg->lock); 720 * blkcg_activate_policy - activate a blkcg policy on a request_queue
1326 blkcg->weight = (unsigned int)val; 721 * @q: request_queue of interest
722 * @pol: blkcg policy to activate
723 *
724 * Activate @pol on @q. Requires %GFP_KERNEL context. @q goes through
725 * bypass mode to populate its blkgs with policy_data for @pol.
726 *
727 * Activation happens with @q bypassed, so nobody would be accessing blkgs
728 * from IO path. Update of each blkg is protected by both queue and blkcg
729 * locks so that holding either lock and testing blkcg_policy_enabled() is
730 * always enough for dereferencing policy data.
731 *
732 * The caller is responsible for synchronizing [de]activations and policy
733 * [un]registrations. Returns 0 on success, -errno on failure.
734 */
735int blkcg_activate_policy(struct request_queue *q,
736 const struct blkcg_policy *pol)
737{
738 LIST_HEAD(pds);
739 struct blkcg_gq *blkg;
740 struct blkg_policy_data *pd, *n;
741 int cnt = 0, ret;
1327 742
1328 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { 743 if (blkcg_policy_enabled(q, pol))
1329 pn = blkio_policy_search_node(blkcg, blkg->dev, 744 return 0;
1330 BLKIO_POLICY_PROP, BLKIO_PROP_weight_device);
1331 if (pn)
1332 continue;
1333 745
1334 blkio_update_group_weight(blkg, blkcg->weight); 746 blk_queue_bypass_start(q);
1335 }
1336 spin_unlock_irq(&blkcg->lock);
1337 spin_unlock(&blkio_list_lock);
1338 return 0;
1339}
1340 747
1341static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) { 748 /* make sure the root blkg exists and count the existing blkgs */
1342 struct blkio_cgroup *blkcg; 749 spin_lock_irq(q->queue_lock);
1343 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1344 int name = BLKIOFILE_ATTR(cft->private);
1345 750
1346 blkcg = cgroup_to_blkio_cgroup(cgrp); 751 rcu_read_lock();
752 blkg = __blkg_lookup_create(&blkcg_root, q);
753 rcu_read_unlock();
1347 754
1348 switch(plid) { 755 if (IS_ERR(blkg)) {
1349 case BLKIO_POLICY_PROP: 756 ret = PTR_ERR(blkg);
1350 switch(name) { 757 goto out_unlock;
1351 case BLKIO_PROP_weight:
1352 return (u64)blkcg->weight;
1353 }
1354 break;
1355 default:
1356 BUG();
1357 } 758 }
1358 return 0; 759 q->root_blkg = blkg;
1359}
1360 760
1361static int 761 list_for_each_entry(blkg, &q->blkg_list, q_node)
1362blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) 762 cnt++;
1363{
1364 struct blkio_cgroup *blkcg;
1365 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1366 int name = BLKIOFILE_ATTR(cft->private);
1367 763
1368 blkcg = cgroup_to_blkio_cgroup(cgrp); 764 spin_unlock_irq(q->queue_lock);
1369 765
1370 switch(plid) { 766 /* allocate policy_data for all existing blkgs */
1371 case BLKIO_POLICY_PROP: 767 while (cnt--) {
1372 switch(name) { 768 pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
1373 case BLKIO_PROP_weight: 769 if (!pd) {
1374 return blkio_weight_write(blkcg, val); 770 ret = -ENOMEM;
771 goto out_free;
1375 } 772 }
1376 break; 773 list_add_tail(&pd->alloc_node, &pds);
1377 default:
1378 BUG();
1379 } 774 }
1380 775
1381 return 0; 776 /*
1382} 777 * Install the allocated pds. With @q bypassing, no new blkg
1383 778 * should have been created while the queue lock was dropped.
1384struct cftype blkio_files[] = { 779 */
1385 { 780 spin_lock_irq(q->queue_lock);
1386 .name = "weight_device",
1387 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1388 BLKIO_PROP_weight_device),
1389 .read_seq_string = blkiocg_file_read,
1390 .write_string = blkiocg_file_write,
1391 .max_write_len = 256,
1392 },
1393 {
1394 .name = "weight",
1395 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1396 BLKIO_PROP_weight),
1397 .read_u64 = blkiocg_file_read_u64,
1398 .write_u64 = blkiocg_file_write_u64,
1399 },
1400 {
1401 .name = "time",
1402 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1403 BLKIO_PROP_time),
1404 .read_map = blkiocg_file_read_map,
1405 },
1406 {
1407 .name = "sectors",
1408 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1409 BLKIO_PROP_sectors),
1410 .read_map = blkiocg_file_read_map,
1411 },
1412 {
1413 .name = "io_service_bytes",
1414 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1415 BLKIO_PROP_io_service_bytes),
1416 .read_map = blkiocg_file_read_map,
1417 },
1418 {
1419 .name = "io_serviced",
1420 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1421 BLKIO_PROP_io_serviced),
1422 .read_map = blkiocg_file_read_map,
1423 },
1424 {
1425 .name = "io_service_time",
1426 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1427 BLKIO_PROP_io_service_time),
1428 .read_map = blkiocg_file_read_map,
1429 },
1430 {
1431 .name = "io_wait_time",
1432 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1433 BLKIO_PROP_io_wait_time),
1434 .read_map = blkiocg_file_read_map,
1435 },
1436 {
1437 .name = "io_merged",
1438 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1439 BLKIO_PROP_io_merged),
1440 .read_map = blkiocg_file_read_map,
1441 },
1442 {
1443 .name = "io_queued",
1444 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1445 BLKIO_PROP_io_queued),
1446 .read_map = blkiocg_file_read_map,
1447 },
1448 {
1449 .name = "reset_stats",
1450 .write_u64 = blkiocg_reset_stats,
1451 },
1452#ifdef CONFIG_BLK_DEV_THROTTLING
1453 {
1454 .name = "throttle.read_bps_device",
1455 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1456 BLKIO_THROTL_read_bps_device),
1457 .read_seq_string = blkiocg_file_read,
1458 .write_string = blkiocg_file_write,
1459 .max_write_len = 256,
1460 },
1461 781
1462 { 782 list_for_each_entry(blkg, &q->blkg_list, q_node) {
1463 .name = "throttle.write_bps_device", 783 if (WARN_ON(list_empty(&pds))) {
1464 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, 784 /* umm... this shouldn't happen, just abort */
1465 BLKIO_THROTL_write_bps_device), 785 ret = -ENOMEM;
1466 .read_seq_string = blkiocg_file_read, 786 goto out_unlock;
1467 .write_string = blkiocg_file_write, 787 }
1468 .max_write_len = 256, 788 pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node);
1469 }, 789 list_del_init(&pd->alloc_node);
1470 790
1471 { 791 /* grab blkcg lock too while installing @pd on @blkg */
1472 .name = "throttle.read_iops_device", 792 spin_lock(&blkg->blkcg->lock);
1473 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1474 BLKIO_THROTL_read_iops_device),
1475 .read_seq_string = blkiocg_file_read,
1476 .write_string = blkiocg_file_write,
1477 .max_write_len = 256,
1478 },
1479 793
1480 { 794 blkg->pd[pol->plid] = pd;
1481 .name = "throttle.write_iops_device", 795 pd->blkg = blkg;
1482 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, 796 pol->pd_init_fn(blkg);
1483 BLKIO_THROTL_write_iops_device),
1484 .read_seq_string = blkiocg_file_read,
1485 .write_string = blkiocg_file_write,
1486 .max_write_len = 256,
1487 },
1488 {
1489 .name = "throttle.io_service_bytes",
1490 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1491 BLKIO_THROTL_io_service_bytes),
1492 .read_map = blkiocg_file_read_map,
1493 },
1494 {
1495 .name = "throttle.io_serviced",
1496 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1497 BLKIO_THROTL_io_serviced),
1498 .read_map = blkiocg_file_read_map,
1499 },
1500#endif /* CONFIG_BLK_DEV_THROTTLING */
1501 797
1502#ifdef CONFIG_DEBUG_BLK_CGROUP 798 spin_unlock(&blkg->blkcg->lock);
1503 { 799 }
1504 .name = "avg_queue_size",
1505 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1506 BLKIO_PROP_avg_queue_size),
1507 .read_map = blkiocg_file_read_map,
1508 },
1509 {
1510 .name = "group_wait_time",
1511 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1512 BLKIO_PROP_group_wait_time),
1513 .read_map = blkiocg_file_read_map,
1514 },
1515 {
1516 .name = "idle_time",
1517 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1518 BLKIO_PROP_idle_time),
1519 .read_map = blkiocg_file_read_map,
1520 },
1521 {
1522 .name = "empty_time",
1523 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1524 BLKIO_PROP_empty_time),
1525 .read_map = blkiocg_file_read_map,
1526 },
1527 {
1528 .name = "dequeue",
1529 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1530 BLKIO_PROP_dequeue),
1531 .read_map = blkiocg_file_read_map,
1532 },
1533 {
1534 .name = "unaccounted_time",
1535 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1536 BLKIO_PROP_unaccounted_time),
1537 .read_map = blkiocg_file_read_map,
1538 },
1539#endif
1540};
1541 800
1542static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup) 801 __set_bit(pol->plid, q->blkcg_pols);
1543{ 802 ret = 0;
1544 return cgroup_add_files(cgroup, subsys, blkio_files, 803out_unlock:
1545 ARRAY_SIZE(blkio_files)); 804 spin_unlock_irq(q->queue_lock);
805out_free:
806 blk_queue_bypass_end(q);
807 list_for_each_entry_safe(pd, n, &pds, alloc_node)
808 kfree(pd);
809 return ret;
1546} 810}
811EXPORT_SYMBOL_GPL(blkcg_activate_policy);
1547 812
1548static void blkiocg_destroy(struct cgroup *cgroup) 813/**
814 * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue
815 * @q: request_queue of interest
816 * @pol: blkcg policy to deactivate
817 *
818 * Deactivate @pol on @q. Follows the same synchronization rules as
819 * blkcg_activate_policy().
820 */
821void blkcg_deactivate_policy(struct request_queue *q,
822 const struct blkcg_policy *pol)
1549{ 823{
1550 struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); 824 struct blkcg_gq *blkg;
1551 unsigned long flags;
1552 struct blkio_group *blkg;
1553 void *key;
1554 struct blkio_policy_type *blkiop;
1555 struct blkio_policy_node *pn, *pntmp;
1556 825
1557 rcu_read_lock(); 826 if (!blkcg_policy_enabled(q, pol))
1558 do { 827 return;
1559 spin_lock_irqsave(&blkcg->lock, flags);
1560 828
1561 if (hlist_empty(&blkcg->blkg_list)) { 829 blk_queue_bypass_start(q);
1562 spin_unlock_irqrestore(&blkcg->lock, flags); 830 spin_lock_irq(q->queue_lock);
1563 break;
1564 }
1565 831
1566 blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group, 832 __clear_bit(pol->plid, q->blkcg_pols);
1567 blkcg_node);
1568 key = rcu_dereference(blkg->key);
1569 __blkiocg_del_blkio_group(blkg);
1570 833
1571 spin_unlock_irqrestore(&blkcg->lock, flags); 834 /* if no policy is left, no need for blkgs - shoot them down */
835 if (bitmap_empty(q->blkcg_pols, BLKCG_MAX_POLS))
836 blkg_destroy_all(q);
1572 837
1573 /* 838 list_for_each_entry(blkg, &q->blkg_list, q_node) {
1574 * This blkio_group is being unlinked as associated cgroup is 839 /* grab blkcg lock too while removing @pd from @blkg */
1575 * going away. Let all the IO controlling policies know about 840 spin_lock(&blkg->blkcg->lock);
1576 * this event.
1577 */
1578 spin_lock(&blkio_list_lock);
1579 list_for_each_entry(blkiop, &blkio_list, list) {
1580 if (blkiop->plid != blkg->plid)
1581 continue;
1582 blkiop->ops.blkio_unlink_group_fn(key, blkg);
1583 }
1584 spin_unlock(&blkio_list_lock);
1585 } while (1);
1586 841
1587 list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) { 842 if (pol->pd_exit_fn)
1588 blkio_policy_delete_node(pn); 843 pol->pd_exit_fn(blkg);
1589 kfree(pn);
1590 }
1591 844
1592 free_css_id(&blkio_subsys, &blkcg->css); 845 kfree(blkg->pd[pol->plid]);
1593 rcu_read_unlock(); 846 blkg->pd[pol->plid] = NULL;
1594 if (blkcg != &blkio_root_cgroup)
1595 kfree(blkcg);
1596}
1597
1598static struct cgroup_subsys_state *blkiocg_create(struct cgroup *cgroup)
1599{
1600 struct blkio_cgroup *blkcg;
1601 struct cgroup *parent = cgroup->parent;
1602 847
1603 if (!parent) { 848 spin_unlock(&blkg->blkcg->lock);
1604 blkcg = &blkio_root_cgroup;
1605 goto done;
1606 } 849 }
1607 850
1608 blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); 851 spin_unlock_irq(q->queue_lock);
1609 if (!blkcg) 852 blk_queue_bypass_end(q);
1610 return ERR_PTR(-ENOMEM);
1611
1612 blkcg->weight = BLKIO_WEIGHT_DEFAULT;
1613done:
1614 spin_lock_init(&blkcg->lock);
1615 INIT_HLIST_HEAD(&blkcg->blkg_list);
1616
1617 INIT_LIST_HEAD(&blkcg->policy_list);
1618 return &blkcg->css;
1619} 853}
854EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
1620 855
1621/* 856/**
1622 * We cannot support shared io contexts, as we have no mean to support 857 * blkcg_policy_register - register a blkcg policy
1623 * two tasks with the same ioc in two different groups without major rework 858 * @pol: blkcg policy to register
1624 * of the main cic data structures. For now we allow a task to change 859 *
1625 * its cgroup only if it's the only owner of its ioc. 860 * Register @pol with blkcg core. Might sleep and @pol may be modified on
861 * successful registration. Returns 0 on success and -errno on failure.
1626 */ 862 */
1627static int blkiocg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 863int blkcg_policy_register(struct blkcg_policy *pol)
1628{ 864{
1629 struct task_struct *task; 865 int i, ret;
1630 struct io_context *ioc;
1631 int ret = 0;
1632 866
1633 /* task_lock() is needed to avoid races with exit_io_context() */ 867 if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data)))
1634 cgroup_taskset_for_each(task, cgrp, tset) { 868 return -EINVAL;
1635 task_lock(task); 869
1636 ioc = task->io_context; 870 mutex_lock(&blkcg_pol_mutex);
1637 if (ioc && atomic_read(&ioc->nr_tasks) > 1) 871
1638 ret = -EINVAL; 872 /* find an empty slot */
1639 task_unlock(task); 873 ret = -ENOSPC;
1640 if (ret) 874 for (i = 0; i < BLKCG_MAX_POLS; i++)
875 if (!blkcg_policy[i])
1641 break; 876 break;
1642 } 877 if (i >= BLKCG_MAX_POLS)
1643 return ret; 878 goto out_unlock;
1644}
1645 879
1646static void blkiocg_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 880 /* register and update blkgs */
1647{ 881 pol->plid = i;
1648 struct task_struct *task; 882 blkcg_policy[i] = pol;
1649 struct io_context *ioc;
1650 883
1651 cgroup_taskset_for_each(task, cgrp, tset) { 884 /* everything is in place, add intf files for the new policy */
1652 /* we don't lose anything even if ioc allocation fails */ 885 if (pol->cftypes)
1653 ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); 886 WARN_ON(cgroup_add_cftypes(&blkio_subsys, pol->cftypes));
1654 if (ioc) { 887 ret = 0;
1655 ioc_cgroup_changed(ioc); 888out_unlock:
1656 put_io_context(ioc); 889 mutex_unlock(&blkcg_pol_mutex);
1657 } 890 return ret;
1658 }
1659} 891}
892EXPORT_SYMBOL_GPL(blkcg_policy_register);
1660 893
1661void blkio_policy_register(struct blkio_policy_type *blkiop) 894/**
895 * blkcg_policy_unregister - unregister a blkcg policy
896 * @pol: blkcg policy to unregister
897 *
898 * Undo blkcg_policy_register(@pol). Might sleep.
899 */
900void blkcg_policy_unregister(struct blkcg_policy *pol)
1662{ 901{
1663 spin_lock(&blkio_list_lock); 902 mutex_lock(&blkcg_pol_mutex);
1664 list_add_tail(&blkiop->list, &blkio_list);
1665 spin_unlock(&blkio_list_lock);
1666}
1667EXPORT_SYMBOL_GPL(blkio_policy_register);
1668 903
1669void blkio_policy_unregister(struct blkio_policy_type *blkiop) 904 if (WARN_ON(blkcg_policy[pol->plid] != pol))
1670{ 905 goto out_unlock;
1671 spin_lock(&blkio_list_lock);
1672 list_del_init(&blkiop->list);
1673 spin_unlock(&blkio_list_lock);
1674}
1675EXPORT_SYMBOL_GPL(blkio_policy_unregister);
1676 906
1677static int __init init_cgroup_blkio(void) 907 /* kill the intf files first */
1678{ 908 if (pol->cftypes)
1679 return cgroup_load_subsys(&blkio_subsys); 909 cgroup_rm_cftypes(&blkio_subsys, pol->cftypes);
1680}
1681 910
1682static void __exit exit_cgroup_blkio(void) 911 /* unregister and update blkgs */
1683{ 912 blkcg_policy[pol->plid] = NULL;
1684 cgroup_unload_subsys(&blkio_subsys); 913out_unlock:
914 mutex_unlock(&blkcg_pol_mutex);
1685} 915}
1686 916EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
1687module_init(init_cgroup_blkio);
1688module_exit(exit_cgroup_blkio);
1689MODULE_LICENSE("GPL");
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 6f3ace7e792f..8ac457ce7783 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -15,350 +15,371 @@
15 15
16#include <linux/cgroup.h> 16#include <linux/cgroup.h>
17#include <linux/u64_stats_sync.h> 17#include <linux/u64_stats_sync.h>
18 18#include <linux/seq_file.h>
19enum blkio_policy_id { 19#include <linux/radix-tree.h>
20 BLKIO_POLICY_PROP = 0, /* Proportional Bandwidth division */
21 BLKIO_POLICY_THROTL, /* Throttling */
22};
23 20
24/* Max limits for throttle policy */ 21/* Max limits for throttle policy */
25#define THROTL_IOPS_MAX UINT_MAX 22#define THROTL_IOPS_MAX UINT_MAX
26 23
27#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) 24/* CFQ specific, out here for blkcg->cfq_weight */
28 25#define CFQ_WEIGHT_MIN 10
29#ifndef CONFIG_BLK_CGROUP 26#define CFQ_WEIGHT_MAX 1000
30/* When blk-cgroup is a module, its subsys_id isn't a compile-time constant */ 27#define CFQ_WEIGHT_DEFAULT 500
31extern struct cgroup_subsys blkio_subsys;
32#define blkio_subsys_id blkio_subsys.subsys_id
33#endif
34
35enum stat_type {
36 /* Total time spent (in ns) between request dispatch to the driver and
37 * request completion for IOs done by this cgroup. This may not be
38 * accurate when NCQ is turned on. */
39 BLKIO_STAT_SERVICE_TIME = 0,
40 /* Total time spent waiting in scheduler queue in ns */
41 BLKIO_STAT_WAIT_TIME,
42 /* Number of IOs queued up */
43 BLKIO_STAT_QUEUED,
44 /* All the single valued stats go below this */
45 BLKIO_STAT_TIME,
46#ifdef CONFIG_DEBUG_BLK_CGROUP
47 /* Time not charged to this cgroup */
48 BLKIO_STAT_UNACCOUNTED_TIME,
49 BLKIO_STAT_AVG_QUEUE_SIZE,
50 BLKIO_STAT_IDLE_TIME,
51 BLKIO_STAT_EMPTY_TIME,
52 BLKIO_STAT_GROUP_WAIT_TIME,
53 BLKIO_STAT_DEQUEUE
54#endif
55};
56 28
57/* Per cpu stats */ 29#ifdef CONFIG_BLK_CGROUP
58enum stat_type_cpu {
59 BLKIO_STAT_CPU_SECTORS,
60 /* Total bytes transferred */
61 BLKIO_STAT_CPU_SERVICE_BYTES,
62 /* Total IOs serviced, post merge */
63 BLKIO_STAT_CPU_SERVICED,
64 /* Number of IOs merged */
65 BLKIO_STAT_CPU_MERGED,
66 BLKIO_STAT_CPU_NR
67};
68 30
69enum stat_sub_type { 31enum blkg_rwstat_type {
70 BLKIO_STAT_READ = 0, 32 BLKG_RWSTAT_READ,
71 BLKIO_STAT_WRITE, 33 BLKG_RWSTAT_WRITE,
72 BLKIO_STAT_SYNC, 34 BLKG_RWSTAT_SYNC,
73 BLKIO_STAT_ASYNC, 35 BLKG_RWSTAT_ASYNC,
74 BLKIO_STAT_TOTAL
75};
76 36
77/* blkg state flags */ 37 BLKG_RWSTAT_NR,
78enum blkg_state_flags { 38 BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR,
79 BLKG_waiting = 0,
80 BLKG_idling,
81 BLKG_empty,
82}; 39};
83 40
84/* cgroup files owned by proportional weight policy */ 41struct blkcg_gq;
85enum blkcg_file_name_prop {
86 BLKIO_PROP_weight = 1,
87 BLKIO_PROP_weight_device,
88 BLKIO_PROP_io_service_bytes,
89 BLKIO_PROP_io_serviced,
90 BLKIO_PROP_time,
91 BLKIO_PROP_sectors,
92 BLKIO_PROP_unaccounted_time,
93 BLKIO_PROP_io_service_time,
94 BLKIO_PROP_io_wait_time,
95 BLKIO_PROP_io_merged,
96 BLKIO_PROP_io_queued,
97 BLKIO_PROP_avg_queue_size,
98 BLKIO_PROP_group_wait_time,
99 BLKIO_PROP_idle_time,
100 BLKIO_PROP_empty_time,
101 BLKIO_PROP_dequeue,
102};
103 42
104/* cgroup files owned by throttle policy */ 43struct blkcg {
105enum blkcg_file_name_throtl { 44 struct cgroup_subsys_state css;
106 BLKIO_THROTL_read_bps_device, 45 spinlock_t lock;
107 BLKIO_THROTL_write_bps_device,
108 BLKIO_THROTL_read_iops_device,
109 BLKIO_THROTL_write_iops_device,
110 BLKIO_THROTL_io_service_bytes,
111 BLKIO_THROTL_io_serviced,
112};
113 46
114struct blkio_cgroup { 47 struct radix_tree_root blkg_tree;
115 struct cgroup_subsys_state css; 48 struct blkcg_gq *blkg_hint;
116 unsigned int weight; 49 struct hlist_head blkg_list;
117 spinlock_t lock; 50
118 struct hlist_head blkg_list; 51 /* for policies to test whether associated blkcg has changed */
119 struct list_head policy_list; /* list of blkio_policy_node */ 52 uint64_t id;
120};
121 53
122struct blkio_group_stats { 54 /* TODO: per-policy storage in blkcg */
123 /* total disk time and nr sectors dispatched by this group */ 55 unsigned int cfq_weight; /* belongs to cfq */
124 uint64_t time;
125 uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL];
126#ifdef CONFIG_DEBUG_BLK_CGROUP
127 /* Time not charged to this cgroup */
128 uint64_t unaccounted_time;
129
130 /* Sum of number of IOs queued across all samples */
131 uint64_t avg_queue_size_sum;
132 /* Count of samples taken for average */
133 uint64_t avg_queue_size_samples;
134 /* How many times this group has been removed from service tree */
135 unsigned long dequeue;
136
137 /* Total time spent waiting for it to be assigned a timeslice. */
138 uint64_t group_wait_time;
139 uint64_t start_group_wait_time;
140
141 /* Time spent idling for this blkio_group */
142 uint64_t idle_time;
143 uint64_t start_idle_time;
144 /*
145 * Total time when we have requests queued and do not contain the
146 * current active queue.
147 */
148 uint64_t empty_time;
149 uint64_t start_empty_time;
150 uint16_t flags;
151#endif
152}; 56};
153 57
154/* Per cpu blkio group stats */ 58struct blkg_stat {
155struct blkio_group_stats_cpu { 59 struct u64_stats_sync syncp;
156 uint64_t sectors; 60 uint64_t cnt;
157 uint64_t stat_arr_cpu[BLKIO_STAT_CPU_NR][BLKIO_STAT_TOTAL];
158 struct u64_stats_sync syncp;
159}; 61};
160 62
161struct blkio_group { 63struct blkg_rwstat {
162 /* An rcu protected unique identifier for the group */ 64 struct u64_stats_sync syncp;
163 void *key; 65 uint64_t cnt[BLKG_RWSTAT_NR];
164 struct hlist_node blkcg_node;
165 unsigned short blkcg_id;
166 /* Store cgroup path */
167 char path[128];
168 /* The device MKDEV(major, minor), this group has been created for */
169 dev_t dev;
170 /* policy which owns this blk group */
171 enum blkio_policy_id plid;
172
173 /* Need to serialize the stats in the case of reset/update */
174 spinlock_t stats_lock;
175 struct blkio_group_stats stats;
176 /* Per cpu stats pointer */
177 struct blkio_group_stats_cpu __percpu *stats_cpu;
178}; 66};
179 67
180struct blkio_policy_node { 68/*
181 struct list_head node; 69 * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a
182 dev_t dev; 70 * request_queue (q). This is used by blkcg policies which need to track
183 /* This node belongs to max bw policy or proportional weight policy */ 71 * information per blkcg - q pair.
184 enum blkio_policy_id plid; 72 *
185 /* cgroup file to which this rule belongs to */ 73 * There can be multiple active blkcg policies and each has its private
186 int fileid; 74 * data on each blkg, the size of which is determined by
187 75 * blkcg_policy->pd_size. blkcg core allocates and frees such areas
188 union { 76 * together with blkg and invokes pd_init/exit_fn() methods.
189 unsigned int weight; 77 *
190 /* 78 * Such private data must embed struct blkg_policy_data (pd) at the
191 * Rate read/write in terms of bytes per second 79 * beginning and pd_size can't be smaller than pd.
192 * Whether this rate represents read or write is determined 80 */
193 * by file type "fileid". 81struct blkg_policy_data {
194 */ 82 /* the blkg this per-policy data belongs to */
195 u64 bps; 83 struct blkcg_gq *blkg;
196 unsigned int iops; 84
197 } val; 85 /* used during policy activation */
86 struct list_head alloc_node;
198}; 87};
199 88
200extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, 89/* association between a blk cgroup and a request queue */
201 dev_t dev); 90struct blkcg_gq {
202extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, 91 /* Pointer to the associated request_queue */
203 dev_t dev); 92 struct request_queue *q;
204extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, 93 struct list_head q_node;
205 dev_t dev); 94 struct hlist_node blkcg_node;
206extern unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, 95 struct blkcg *blkcg;
207 dev_t dev); 96 /* reference count */
208extern unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, 97 int refcnt;
209 dev_t dev); 98
210 99 struct blkg_policy_data *pd[BLKCG_MAX_POLS];
211typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); 100
212 101 struct rcu_head rcu_head;
213typedef void (blkio_update_group_weight_fn) (void *key,
214 struct blkio_group *blkg, unsigned int weight);
215typedef void (blkio_update_group_read_bps_fn) (void * key,
216 struct blkio_group *blkg, u64 read_bps);
217typedef void (blkio_update_group_write_bps_fn) (void *key,
218 struct blkio_group *blkg, u64 write_bps);
219typedef void (blkio_update_group_read_iops_fn) (void *key,
220 struct blkio_group *blkg, unsigned int read_iops);
221typedef void (blkio_update_group_write_iops_fn) (void *key,
222 struct blkio_group *blkg, unsigned int write_iops);
223
224struct blkio_policy_ops {
225 blkio_unlink_group_fn *blkio_unlink_group_fn;
226 blkio_update_group_weight_fn *blkio_update_group_weight_fn;
227 blkio_update_group_read_bps_fn *blkio_update_group_read_bps_fn;
228 blkio_update_group_write_bps_fn *blkio_update_group_write_bps_fn;
229 blkio_update_group_read_iops_fn *blkio_update_group_read_iops_fn;
230 blkio_update_group_write_iops_fn *blkio_update_group_write_iops_fn;
231}; 102};
232 103
233struct blkio_policy_type { 104typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg);
234 struct list_head list; 105typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg);
235 struct blkio_policy_ops ops; 106typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg);
236 enum blkio_policy_id plid; 107
108struct blkcg_policy {
109 int plid;
110 /* policy specific private data size */
111 size_t pd_size;
112 /* cgroup files for the policy */
113 struct cftype *cftypes;
114
115 /* operations */
116 blkcg_pol_init_pd_fn *pd_init_fn;
117 blkcg_pol_exit_pd_fn *pd_exit_fn;
118 blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn;
237}; 119};
238 120
121extern struct blkcg blkcg_root;
122
123struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup);
124struct blkcg *bio_blkcg(struct bio *bio);
125struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q);
126struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
127 struct request_queue *q);
128int blkcg_init_queue(struct request_queue *q);
129void blkcg_drain_queue(struct request_queue *q);
130void blkcg_exit_queue(struct request_queue *q);
131
239/* Blkio controller policy registration */ 132/* Blkio controller policy registration */
240extern void blkio_policy_register(struct blkio_policy_type *); 133int blkcg_policy_register(struct blkcg_policy *pol);
241extern void blkio_policy_unregister(struct blkio_policy_type *); 134void blkcg_policy_unregister(struct blkcg_policy *pol);
135int blkcg_activate_policy(struct request_queue *q,
136 const struct blkcg_policy *pol);
137void blkcg_deactivate_policy(struct request_queue *q,
138 const struct blkcg_policy *pol);
139
140void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
141 u64 (*prfill)(struct seq_file *,
142 struct blkg_policy_data *, int),
143 const struct blkcg_policy *pol, int data,
144 bool show_total);
145u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v);
146u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
147 const struct blkg_rwstat *rwstat);
148u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off);
149u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
150 int off);
151
152struct blkg_conf_ctx {
153 struct gendisk *disk;
154 struct blkcg_gq *blkg;
155 u64 v;
156};
157
158int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
159 const char *input, struct blkg_conf_ctx *ctx);
160void blkg_conf_finish(struct blkg_conf_ctx *ctx);
161
162
163/**
164 * blkg_to_pd - get policy private data
165 * @blkg: blkg of interest
166 * @pol: policy of interest
167 *
168 * Return pointer to private data associated with the @blkg-@pol pair.
169 */
170static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
171 struct blkcg_policy *pol)
172{
173 return blkg ? blkg->pd[pol->plid] : NULL;
174}
175
176/**
177 * pd_to_blkg - get blkg associated with policy private data
178 * @pd: policy private data of interest
179 *
180 * @pd is policy private data. Determine the blkg it's associated with.
181 */
182static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
183{
184 return pd ? pd->blkg : NULL;
185}
186
187/**
188 * blkg_path - format cgroup path of blkg
189 * @blkg: blkg of interest
190 * @buf: target buffer
191 * @buflen: target buffer length
192 *
193 * Format the path of the cgroup of @blkg into @buf.
194 */
195static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
196{
197 int ret;
198
199 rcu_read_lock();
200 ret = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
201 rcu_read_unlock();
202 if (ret)
203 strncpy(buf, "<unavailable>", buflen);
204 return ret;
205}
242 206
243static inline char *blkg_path(struct blkio_group *blkg) 207/**
208 * blkg_get - get a blkg reference
209 * @blkg: blkg to get
210 *
211 * The caller should be holding queue_lock and an existing reference.
212 */
213static inline void blkg_get(struct blkcg_gq *blkg)
244{ 214{
245 return blkg->path; 215 lockdep_assert_held(blkg->q->queue_lock);
216 WARN_ON_ONCE(!blkg->refcnt);
217 blkg->refcnt++;
246} 218}
247 219
248#else 220void __blkg_release(struct blkcg_gq *blkg);
249 221
250struct blkio_group { 222/**
223 * blkg_put - put a blkg reference
224 * @blkg: blkg to put
225 *
226 * The caller should be holding queue_lock.
227 */
228static inline void blkg_put(struct blkcg_gq *blkg)
229{
230 lockdep_assert_held(blkg->q->queue_lock);
231 WARN_ON_ONCE(blkg->refcnt <= 0);
232 if (!--blkg->refcnt)
233 __blkg_release(blkg);
234}
235
236/**
237 * blkg_stat_add - add a value to a blkg_stat
238 * @stat: target blkg_stat
239 * @val: value to add
240 *
241 * Add @val to @stat. The caller is responsible for synchronizing calls to
242 * this function.
243 */
244static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val)
245{
246 u64_stats_update_begin(&stat->syncp);
247 stat->cnt += val;
248 u64_stats_update_end(&stat->syncp);
249}
250
251/**
252 * blkg_stat_read - read the current value of a blkg_stat
253 * @stat: blkg_stat to read
254 *
255 * Read the current value of @stat. This function can be called without
256 * synchronization and takes care of u64 atomicity.
257 */
258static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
259{
260 unsigned int start;
261 uint64_t v;
262
263 do {
264 start = u64_stats_fetch_begin(&stat->syncp);
265 v = stat->cnt;
266 } while (u64_stats_fetch_retry(&stat->syncp, start));
267
268 return v;
269}
270
271/**
272 * blkg_stat_reset - reset a blkg_stat
273 * @stat: blkg_stat to reset
274 */
275static inline void blkg_stat_reset(struct blkg_stat *stat)
276{
277 stat->cnt = 0;
278}
279
280/**
281 * blkg_rwstat_add - add a value to a blkg_rwstat
282 * @rwstat: target blkg_rwstat
283 * @rw: mask of REQ_{WRITE|SYNC}
284 * @val: value to add
285 *
286 * Add @val to @rwstat. The counters are chosen according to @rw. The
287 * caller is responsible for synchronizing calls to this function.
288 */
289static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
290 int rw, uint64_t val)
291{
292 u64_stats_update_begin(&rwstat->syncp);
293
294 if (rw & REQ_WRITE)
295 rwstat->cnt[BLKG_RWSTAT_WRITE] += val;
296 else
297 rwstat->cnt[BLKG_RWSTAT_READ] += val;
298 if (rw & REQ_SYNC)
299 rwstat->cnt[BLKG_RWSTAT_SYNC] += val;
300 else
301 rwstat->cnt[BLKG_RWSTAT_ASYNC] += val;
302
303 u64_stats_update_end(&rwstat->syncp);
304}
305
306/**
307 * blkg_rwstat_read - read the current values of a blkg_rwstat
308 * @rwstat: blkg_rwstat to read
309 *
310 * Read the current snapshot of @rwstat and return it as the return value.
311 * This function can be called without synchronization and takes care of
312 * u64 atomicity.
313 */
314static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat)
315{
316 unsigned int start;
317 struct blkg_rwstat tmp;
318
319 do {
320 start = u64_stats_fetch_begin(&rwstat->syncp);
321 tmp = *rwstat;
322 } while (u64_stats_fetch_retry(&rwstat->syncp, start));
323
324 return tmp;
325}
326
327/**
328 * blkg_rwstat_sum - read the total count of a blkg_rwstat
329 * @rwstat: blkg_rwstat to read
330 *
331 * Return the total count of @rwstat regardless of the IO direction. This
332 * function can be called without synchronization and takes care of u64
333 * atomicity.
334 */
335static inline uint64_t blkg_rwstat_sum(struct blkg_rwstat *rwstat)
336{
337 struct blkg_rwstat tmp = blkg_rwstat_read(rwstat);
338
339 return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE];
340}
341
342/**
343 * blkg_rwstat_reset - reset a blkg_rwstat
344 * @rwstat: blkg_rwstat to reset
345 */
346static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
347{
348 memset(rwstat->cnt, 0, sizeof(rwstat->cnt));
349}
350
351#else /* CONFIG_BLK_CGROUP */
352
353struct cgroup;
354
355struct blkg_policy_data {
251}; 356};
252 357
253struct blkio_policy_type { 358struct blkcg_gq {
254}; 359};
255 360
256static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { } 361struct blkcg_policy {
257static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { } 362};
258 363
259static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } 364static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; }
260 365static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; }
261#endif 366static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
262 367static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
263#define BLKIO_WEIGHT_MIN 10 368static inline void blkcg_drain_queue(struct request_queue *q) { }
264#define BLKIO_WEIGHT_MAX 1000 369static inline void blkcg_exit_queue(struct request_queue *q) { }
265#define BLKIO_WEIGHT_DEFAULT 500 370static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; }
266 371static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { }
267#ifdef CONFIG_DEBUG_BLK_CGROUP 372static inline int blkcg_activate_policy(struct request_queue *q,
268void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg); 373 const struct blkcg_policy *pol) { return 0; }
269void blkiocg_update_dequeue_stats(struct blkio_group *blkg, 374static inline void blkcg_deactivate_policy(struct request_queue *q,
270 unsigned long dequeue); 375 const struct blkcg_policy *pol) { }
271void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg); 376
272void blkiocg_update_idle_time_stats(struct blkio_group *blkg); 377static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
273void blkiocg_set_start_empty_time(struct blkio_group *blkg); 378 struct blkcg_policy *pol) { return NULL; }
274 379static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; }
275#define BLKG_FLAG_FNS(name) \ 380static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
276static inline void blkio_mark_blkg_##name( \ 381static inline void blkg_get(struct blkcg_gq *blkg) { }
277 struct blkio_group_stats *stats) \ 382static inline void blkg_put(struct blkcg_gq *blkg) { }
278{ \ 383
279 stats->flags |= (1 << BLKG_##name); \ 384#endif /* CONFIG_BLK_CGROUP */
280} \ 385#endif /* _BLK_CGROUP_H */
281static inline void blkio_clear_blkg_##name( \
282 struct blkio_group_stats *stats) \
283{ \
284 stats->flags &= ~(1 << BLKG_##name); \
285} \
286static inline int blkio_blkg_##name(struct blkio_group_stats *stats) \
287{ \
288 return (stats->flags & (1 << BLKG_##name)) != 0; \
289} \
290
291BLKG_FLAG_FNS(waiting)
292BLKG_FLAG_FNS(idling)
293BLKG_FLAG_FNS(empty)
294#undef BLKG_FLAG_FNS
295#else
296static inline void blkiocg_update_avg_queue_size_stats(
297 struct blkio_group *blkg) {}
298static inline void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
299 unsigned long dequeue) {}
300static inline void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
301{}
302static inline void blkiocg_update_idle_time_stats(struct blkio_group *blkg) {}
303static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {}
304#endif
305
306#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
307extern struct blkio_cgroup blkio_root_cgroup;
308extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
309extern struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk);
310extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
311 struct blkio_group *blkg, void *key, dev_t dev,
312 enum blkio_policy_id plid);
313extern int blkio_alloc_blkg_stats(struct blkio_group *blkg);
314extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
315extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
316 void *key);
317void blkiocg_update_timeslice_used(struct blkio_group *blkg,
318 unsigned long time,
319 unsigned long unaccounted_time);
320void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes,
321 bool direction, bool sync);
322void blkiocg_update_completion_stats(struct blkio_group *blkg,
323 uint64_t start_time, uint64_t io_start_time, bool direction, bool sync);
324void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
325 bool sync);
326void blkiocg_update_io_add_stats(struct blkio_group *blkg,
327 struct blkio_group *curr_blkg, bool direction, bool sync);
328void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
329 bool direction, bool sync);
330#else
331struct cgroup;
332static inline struct blkio_cgroup *
333cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
334static inline struct blkio_cgroup *
335task_blkio_cgroup(struct task_struct *tsk) { return NULL; }
336
337static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
338 struct blkio_group *blkg, void *key, dev_t dev,
339 enum blkio_policy_id plid) {}
340
341static inline int blkio_alloc_blkg_stats(struct blkio_group *blkg) { return 0; }
342
343static inline int
344blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
345
346static inline struct blkio_group *
347blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; }
348static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg,
349 unsigned long time,
350 unsigned long unaccounted_time)
351{}
352static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
353 uint64_t bytes, bool direction, bool sync) {}
354static inline void blkiocg_update_completion_stats(struct blkio_group *blkg,
355 uint64_t start_time, uint64_t io_start_time, bool direction,
356 bool sync) {}
357static inline void blkiocg_update_io_merged_stats(struct blkio_group *blkg,
358 bool direction, bool sync) {}
359static inline void blkiocg_update_io_add_stats(struct blkio_group *blkg,
360 struct blkio_group *curr_blkg, bool direction, bool sync) {}
361static inline void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
362 bool direction, bool sync) {}
363#endif
364#endif /* _BLK_CGROUP_H */
diff --git a/block/blk-core.c b/block/blk-core.c
index 1f61b74867e4..3c923a7aeb56 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -29,11 +29,13 @@
29#include <linux/fault-inject.h> 29#include <linux/fault-inject.h>
30#include <linux/list_sort.h> 30#include <linux/list_sort.h>
31#include <linux/delay.h> 31#include <linux/delay.h>
32#include <linux/ratelimit.h>
32 33
33#define CREATE_TRACE_POINTS 34#define CREATE_TRACE_POINTS
34#include <trace/events/block.h> 35#include <trace/events/block.h>
35 36
36#include "blk.h" 37#include "blk.h"
38#include "blk-cgroup.h"
37 39
38EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); 40EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
39EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); 41EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
@@ -280,7 +282,7 @@ EXPORT_SYMBOL(blk_stop_queue);
280 * 282 *
281 * This function does not cancel any asynchronous activity arising 283 * This function does not cancel any asynchronous activity arising
282 * out of elevator or throttling code. That would require elevator_exit() 284 * out of elevator or throttling code. That would require elevator_exit()
283 * and blk_throtl_exit() to be called with queue lock initialized. 285 * and blkcg_exit_queue() to be called with queue lock initialized.
284 * 286 *
285 */ 287 */
286void blk_sync_queue(struct request_queue *q) 288void blk_sync_queue(struct request_queue *q)
@@ -365,17 +367,23 @@ void blk_drain_queue(struct request_queue *q, bool drain_all)
365 367
366 spin_lock_irq(q->queue_lock); 368 spin_lock_irq(q->queue_lock);
367 369
368 elv_drain_elevator(q); 370 /*
369 if (drain_all) 371 * The caller might be trying to drain @q before its
370 blk_throtl_drain(q); 372 * elevator is initialized.
373 */
374 if (q->elevator)
375 elv_drain_elevator(q);
376
377 blkcg_drain_queue(q);
371 378
372 /* 379 /*
373 * This function might be called on a queue which failed 380 * This function might be called on a queue which failed
374 * driver init after queue creation. Some drivers 381 * driver init after queue creation or is not fully
375 * (e.g. fd) get unhappy in such cases. Kick queue iff 382 * active yet. Some drivers (e.g. fd and loop) get unhappy
376 * dispatch queue has something on it. 383 * in such cases. Kick queue iff dispatch queue has
384 * something on it and @q has request_fn set.
377 */ 385 */
378 if (!list_empty(&q->queue_head)) 386 if (!list_empty(&q->queue_head) && q->request_fn)
379 __blk_run_queue(q); 387 __blk_run_queue(q);
380 388
381 drain |= q->rq.elvpriv; 389 drain |= q->rq.elvpriv;
@@ -403,6 +411,49 @@ void blk_drain_queue(struct request_queue *q, bool drain_all)
403} 411}
404 412
405/** 413/**
414 * blk_queue_bypass_start - enter queue bypass mode
415 * @q: queue of interest
416 *
417 * In bypass mode, only the dispatch FIFO queue of @q is used. This
418 * function makes @q enter bypass mode and drains all requests which were
419 * throttled or issued before. On return, it's guaranteed that no request
420 * is being throttled or has ELVPRIV set and blk_queue_bypass() is %true
421 * inside queue or RCU read lock.
422 */
423void blk_queue_bypass_start(struct request_queue *q)
424{
425 bool drain;
426
427 spin_lock_irq(q->queue_lock);
428 drain = !q->bypass_depth++;
429 queue_flag_set(QUEUE_FLAG_BYPASS, q);
430 spin_unlock_irq(q->queue_lock);
431
432 if (drain) {
433 blk_drain_queue(q, false);
434 /* ensure blk_queue_bypass() is %true inside RCU read lock */
435 synchronize_rcu();
436 }
437}
438EXPORT_SYMBOL_GPL(blk_queue_bypass_start);
439
440/**
441 * blk_queue_bypass_end - leave queue bypass mode
442 * @q: queue of interest
443 *
444 * Leave bypass mode and restore the normal queueing behavior.
445 */
446void blk_queue_bypass_end(struct request_queue *q)
447{
448 spin_lock_irq(q->queue_lock);
449 if (!--q->bypass_depth)
450 queue_flag_clear(QUEUE_FLAG_BYPASS, q);
451 WARN_ON_ONCE(q->bypass_depth < 0);
452 spin_unlock_irq(q->queue_lock);
453}
454EXPORT_SYMBOL_GPL(blk_queue_bypass_end);
455
456/**
406 * blk_cleanup_queue - shutdown a request queue 457 * blk_cleanup_queue - shutdown a request queue
407 * @q: request queue to shutdown 458 * @q: request queue to shutdown
408 * 459 *
@@ -418,6 +469,19 @@ void blk_cleanup_queue(struct request_queue *q)
418 queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q); 469 queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
419 470
420 spin_lock_irq(lock); 471 spin_lock_irq(lock);
472
473 /*
474 * Dead queue is permanently in bypass mode till released. Note
475 * that, unlike blk_queue_bypass_start(), we aren't performing
476 * synchronize_rcu() after entering bypass mode to avoid the delay
477 * as some drivers create and destroy a lot of queues while
478 * probing. This is still safe because blk_release_queue() will be
479 * called only after the queue refcnt drops to zero and nothing,
480 * RCU or not, would be traversing the queue by then.
481 */
482 q->bypass_depth++;
483 queue_flag_set(QUEUE_FLAG_BYPASS, q);
484
421 queue_flag_set(QUEUE_FLAG_NOMERGES, q); 485 queue_flag_set(QUEUE_FLAG_NOMERGES, q);
422 queue_flag_set(QUEUE_FLAG_NOXMERGES, q); 486 queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
423 queue_flag_set(QUEUE_FLAG_DEAD, q); 487 queue_flag_set(QUEUE_FLAG_DEAD, q);
@@ -428,13 +492,8 @@ void blk_cleanup_queue(struct request_queue *q)
428 spin_unlock_irq(lock); 492 spin_unlock_irq(lock);
429 mutex_unlock(&q->sysfs_lock); 493 mutex_unlock(&q->sysfs_lock);
430 494
431 /* 495 /* drain all requests queued before DEAD marking */
432 * Drain all requests queued before DEAD marking. The caller might 496 blk_drain_queue(q, true);
433 * be trying to tear down @q before its elevator is initialized, in
434 * which case we don't want to call into draining.
435 */
436 if (q->elevator)
437 blk_drain_queue(q, true);
438 497
439 /* @q won't process any more request, flush async actions */ 498 /* @q won't process any more request, flush async actions */
440 del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); 499 del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
@@ -498,14 +557,15 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
498 if (err) 557 if (err)
499 goto fail_id; 558 goto fail_id;
500 559
501 if (blk_throtl_init(q))
502 goto fail_id;
503
504 setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, 560 setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
505 laptop_mode_timer_fn, (unsigned long) q); 561 laptop_mode_timer_fn, (unsigned long) q);
506 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); 562 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
563 INIT_LIST_HEAD(&q->queue_head);
507 INIT_LIST_HEAD(&q->timeout_list); 564 INIT_LIST_HEAD(&q->timeout_list);
508 INIT_LIST_HEAD(&q->icq_list); 565 INIT_LIST_HEAD(&q->icq_list);
566#ifdef CONFIG_BLK_CGROUP
567 INIT_LIST_HEAD(&q->blkg_list);
568#endif
509 INIT_LIST_HEAD(&q->flush_queue[0]); 569 INIT_LIST_HEAD(&q->flush_queue[0]);
510 INIT_LIST_HEAD(&q->flush_queue[1]); 570 INIT_LIST_HEAD(&q->flush_queue[1]);
511 INIT_LIST_HEAD(&q->flush_data_in_flight); 571 INIT_LIST_HEAD(&q->flush_data_in_flight);
@@ -522,6 +582,18 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
522 */ 582 */
523 q->queue_lock = &q->__queue_lock; 583 q->queue_lock = &q->__queue_lock;
524 584
585 /*
586 * A queue starts its life with bypass turned on to avoid
587 * unnecessary bypass on/off overhead and nasty surprises during
588 * init. The initial bypass will be finished at the end of
589 * blk_init_allocated_queue().
590 */
591 q->bypass_depth = 1;
592 __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
593
594 if (blkcg_init_queue(q))
595 goto fail_id;
596
525 return q; 597 return q;
526 598
527fail_id: 599fail_id:
@@ -614,15 +686,15 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
614 686
615 q->sg_reserved_size = INT_MAX; 687 q->sg_reserved_size = INT_MAX;
616 688
617 /* 689 /* init elevator */
618 * all done 690 if (elevator_init(q, NULL))
619 */ 691 return NULL;
620 if (!elevator_init(q, NULL)) {
621 blk_queue_congestion_threshold(q);
622 return q;
623 }
624 692
625 return NULL; 693 blk_queue_congestion_threshold(q);
694
695 /* all done, end the initial bypass */
696 blk_queue_bypass_end(q);
697 return q;
626} 698}
627EXPORT_SYMBOL(blk_init_allocated_queue); 699EXPORT_SYMBOL(blk_init_allocated_queue);
628 700
@@ -648,33 +720,6 @@ static inline void blk_free_request(struct request_queue *q, struct request *rq)
648 mempool_free(rq, q->rq.rq_pool); 720 mempool_free(rq, q->rq.rq_pool);
649} 721}
650 722
651static struct request *
652blk_alloc_request(struct request_queue *q, struct io_cq *icq,
653 unsigned int flags, gfp_t gfp_mask)
654{
655 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
656
657 if (!rq)
658 return NULL;
659
660 blk_rq_init(q, rq);
661
662 rq->cmd_flags = flags | REQ_ALLOCED;
663
664 if (flags & REQ_ELVPRIV) {
665 rq->elv.icq = icq;
666 if (unlikely(elv_set_request(q, rq, gfp_mask))) {
667 mempool_free(rq, q->rq.rq_pool);
668 return NULL;
669 }
670 /* @rq->elv.icq holds on to io_context until @rq is freed */
671 if (icq)
672 get_io_context(icq->ioc);
673 }
674
675 return rq;
676}
677
678/* 723/*
679 * ioc_batching returns true if the ioc is a valid batching request and 724 * ioc_batching returns true if the ioc is a valid batching request and
680 * should be given priority access to a request. 725 * should be given priority access to a request.
@@ -763,6 +808,22 @@ static bool blk_rq_should_init_elevator(struct bio *bio)
763} 808}
764 809
765/** 810/**
811 * rq_ioc - determine io_context for request allocation
812 * @bio: request being allocated is for this bio (can be %NULL)
813 *
814 * Determine io_context to use for request allocation for @bio. May return
815 * %NULL if %current->io_context doesn't exist.
816 */
817static struct io_context *rq_ioc(struct bio *bio)
818{
819#ifdef CONFIG_BLK_CGROUP
820 if (bio && bio->bi_ioc)
821 return bio->bi_ioc;
822#endif
823 return current->io_context;
824}
825
826/**
766 * get_request - get a free request 827 * get_request - get a free request
767 * @q: request_queue to allocate request from 828 * @q: request_queue to allocate request from
768 * @rw_flags: RW and SYNC flags 829 * @rw_flags: RW and SYNC flags
@@ -779,7 +840,7 @@ static bool blk_rq_should_init_elevator(struct bio *bio)
779static struct request *get_request(struct request_queue *q, int rw_flags, 840static struct request *get_request(struct request_queue *q, int rw_flags,
780 struct bio *bio, gfp_t gfp_mask) 841 struct bio *bio, gfp_t gfp_mask)
781{ 842{
782 struct request *rq = NULL; 843 struct request *rq;
783 struct request_list *rl = &q->rq; 844 struct request_list *rl = &q->rq;
784 struct elevator_type *et; 845 struct elevator_type *et;
785 struct io_context *ioc; 846 struct io_context *ioc;
@@ -789,7 +850,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
789 int may_queue; 850 int may_queue;
790retry: 851retry:
791 et = q->elevator->type; 852 et = q->elevator->type;
792 ioc = current->io_context; 853 ioc = rq_ioc(bio);
793 854
794 if (unlikely(blk_queue_dead(q))) 855 if (unlikely(blk_queue_dead(q)))
795 return NULL; 856 return NULL;
@@ -808,7 +869,7 @@ retry:
808 */ 869 */
809 if (!ioc && !retried) { 870 if (!ioc && !retried) {
810 spin_unlock_irq(q->queue_lock); 871 spin_unlock_irq(q->queue_lock);
811 create_io_context(current, gfp_mask, q->node); 872 create_io_context(gfp_mask, q->node);
812 spin_lock_irq(q->queue_lock); 873 spin_lock_irq(q->queue_lock);
813 retried = true; 874 retried = true;
814 goto retry; 875 goto retry;
@@ -831,7 +892,7 @@ retry:
831 * process is not a "batcher", and not 892 * process is not a "batcher", and not
832 * exempted by the IO scheduler 893 * exempted by the IO scheduler
833 */ 894 */
834 goto out; 895 return NULL;
835 } 896 }
836 } 897 }
837 } 898 }
@@ -844,7 +905,7 @@ retry:
844 * allocated with any setting of ->nr_requests 905 * allocated with any setting of ->nr_requests
845 */ 906 */
846 if (rl->count[is_sync] >= (3 * q->nr_requests / 2)) 907 if (rl->count[is_sync] >= (3 * q->nr_requests / 2))
847 goto out; 908 return NULL;
848 909
849 rl->count[is_sync]++; 910 rl->count[is_sync]++;
850 rl->starved[is_sync] = 0; 911 rl->starved[is_sync] = 0;
@@ -859,8 +920,7 @@ retry:
859 * Also, lookup icq while holding queue_lock. If it doesn't exist, 920 * Also, lookup icq while holding queue_lock. If it doesn't exist,
860 * it will be created after releasing queue_lock. 921 * it will be created after releasing queue_lock.
861 */ 922 */
862 if (blk_rq_should_init_elevator(bio) && 923 if (blk_rq_should_init_elevator(bio) && !blk_queue_bypass(q)) {
863 !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags)) {
864 rw_flags |= REQ_ELVPRIV; 924 rw_flags |= REQ_ELVPRIV;
865 rl->elvpriv++; 925 rl->elvpriv++;
866 if (et->icq_cache && ioc) 926 if (et->icq_cache && ioc)
@@ -871,41 +931,36 @@ retry:
871 rw_flags |= REQ_IO_STAT; 931 rw_flags |= REQ_IO_STAT;
872 spin_unlock_irq(q->queue_lock); 932 spin_unlock_irq(q->queue_lock);
873 933
874 /* create icq if missing */ 934 /* allocate and init request */
875 if ((rw_flags & REQ_ELVPRIV) && unlikely(et->icq_cache && !icq)) { 935 rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
876 icq = ioc_create_icq(q, gfp_mask); 936 if (!rq)
877 if (!icq) 937 goto fail_alloc;
878 goto fail_icq;
879 }
880
881 rq = blk_alloc_request(q, icq, rw_flags, gfp_mask);
882 938
883fail_icq: 939 blk_rq_init(q, rq);
884 if (unlikely(!rq)) { 940 rq->cmd_flags = rw_flags | REQ_ALLOCED;
885 /* 941
886 * Allocation failed presumably due to memory. Undo anything 942 /* init elvpriv */
887 * we might have messed up. 943 if (rw_flags & REQ_ELVPRIV) {
888 * 944 if (unlikely(et->icq_cache && !icq)) {
889 * Allocating task should really be put onto the front of the 945 create_io_context(gfp_mask, q->node);
890 * wait queue, but this is pretty rare. 946 ioc = rq_ioc(bio);
891 */ 947 if (!ioc)
892 spin_lock_irq(q->queue_lock); 948 goto fail_elvpriv;
893 freed_request(q, rw_flags); 949
950 icq = ioc_create_icq(ioc, q, gfp_mask);
951 if (!icq)
952 goto fail_elvpriv;
953 }
894 954
895 /* 955 rq->elv.icq = icq;
896 * in the very unlikely event that allocation failed and no 956 if (unlikely(elv_set_request(q, rq, bio, gfp_mask)))
897 * requests for this direction was pending, mark us starved 957 goto fail_elvpriv;
898 * so that freeing of a request in the other direction will
899 * notice us. another possible fix would be to split the
900 * rq mempool into READ and WRITE
901 */
902rq_starved:
903 if (unlikely(rl->count[is_sync] == 0))
904 rl->starved[is_sync] = 1;
905 958
906 goto out; 959 /* @rq->elv.icq holds io_context until @rq is freed */
960 if (icq)
961 get_io_context(icq->ioc);
907 } 962 }
908 963out:
909 /* 964 /*
910 * ioc may be NULL here, and ioc_batching will be false. That's 965 * ioc may be NULL here, and ioc_batching will be false. That's
911 * OK, if the queue is under the request limit then requests need 966 * OK, if the queue is under the request limit then requests need
@@ -916,8 +971,48 @@ rq_starved:
916 ioc->nr_batch_requests--; 971 ioc->nr_batch_requests--;
917 972
918 trace_block_getrq(q, bio, rw_flags & 1); 973 trace_block_getrq(q, bio, rw_flags & 1);
919out:
920 return rq; 974 return rq;
975
976fail_elvpriv:
977 /*
978 * elvpriv init failed. ioc, icq and elvpriv aren't mempool backed
979 * and may fail indefinitely under memory pressure and thus
980 * shouldn't stall IO. Treat this request as !elvpriv. This will
981 * disturb iosched and blkcg but weird is bettern than dead.
982 */
983 printk_ratelimited(KERN_WARNING "%s: request aux data allocation failed, iosched may be disturbed\n",
984 dev_name(q->backing_dev_info.dev));
985
986 rq->cmd_flags &= ~REQ_ELVPRIV;
987 rq->elv.icq = NULL;
988
989 spin_lock_irq(q->queue_lock);
990 rl->elvpriv--;
991 spin_unlock_irq(q->queue_lock);
992 goto out;
993
994fail_alloc:
995 /*
996 * Allocation failed presumably due to memory. Undo anything we
997 * might have messed up.
998 *
999 * Allocating task should really be put onto the front of the wait
1000 * queue, but this is pretty rare.
1001 */
1002 spin_lock_irq(q->queue_lock);
1003 freed_request(q, rw_flags);
1004
1005 /*
1006 * in the very unlikely event that allocation failed and no
1007 * requests for this direction was pending, mark us starved so that
1008 * freeing of a request in the other direction will notice
1009 * us. another possible fix would be to split the rq mempool into
1010 * READ and WRITE
1011 */
1012rq_starved:
1013 if (unlikely(rl->count[is_sync] == 0))
1014 rl->starved[is_sync] = 1;
1015 return NULL;
921} 1016}
922 1017
923/** 1018/**
@@ -961,7 +1056,7 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags,
961 * up to a big batch of them for a small period time. 1056 * up to a big batch of them for a small period time.
962 * See ioc_batching, ioc_set_batching 1057 * See ioc_batching, ioc_set_batching
963 */ 1058 */
964 create_io_context(current, GFP_NOIO, q->node); 1059 create_io_context(GFP_NOIO, q->node);
965 ioc_set_batching(q, current->io_context); 1060 ioc_set_batching(q, current->io_context);
966 1061
967 spin_lock_irq(q->queue_lock); 1062 spin_lock_irq(q->queue_lock);
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index fb95dd2f889a..1e2d53b04858 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -155,20 +155,20 @@ void put_io_context(struct io_context *ioc)
155} 155}
156EXPORT_SYMBOL(put_io_context); 156EXPORT_SYMBOL(put_io_context);
157 157
158/* Called by the exiting task */ 158/**
159void exit_io_context(struct task_struct *task) 159 * put_io_context_active - put active reference on ioc
160 * @ioc: ioc of interest
161 *
162 * Undo get_io_context_active(). If active reference reaches zero after
163 * put, @ioc can never issue further IOs and ioscheds are notified.
164 */
165void put_io_context_active(struct io_context *ioc)
160{ 166{
161 struct io_context *ioc;
162 struct io_cq *icq;
163 struct hlist_node *n; 167 struct hlist_node *n;
164 unsigned long flags; 168 unsigned long flags;
169 struct io_cq *icq;
165 170
166 task_lock(task); 171 if (!atomic_dec_and_test(&ioc->active_ref)) {
167 ioc = task->io_context;
168 task->io_context = NULL;
169 task_unlock(task);
170
171 if (!atomic_dec_and_test(&ioc->nr_tasks)) {
172 put_io_context(ioc); 172 put_io_context(ioc);
173 return; 173 return;
174 } 174 }
@@ -197,6 +197,20 @@ retry:
197 put_io_context(ioc); 197 put_io_context(ioc);
198} 198}
199 199
200/* Called by the exiting task */
201void exit_io_context(struct task_struct *task)
202{
203 struct io_context *ioc;
204
205 task_lock(task);
206 ioc = task->io_context;
207 task->io_context = NULL;
208 task_unlock(task);
209
210 atomic_dec(&ioc->nr_tasks);
211 put_io_context_active(ioc);
212}
213
200/** 214/**
201 * ioc_clear_queue - break any ioc association with the specified queue 215 * ioc_clear_queue - break any ioc association with the specified queue
202 * @q: request_queue being cleared 216 * @q: request_queue being cleared
@@ -218,19 +232,18 @@ void ioc_clear_queue(struct request_queue *q)
218 } 232 }
219} 233}
220 234
221void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_flags, 235int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node)
222 int node)
223{ 236{
224 struct io_context *ioc; 237 struct io_context *ioc;
225 238
226 ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO, 239 ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO,
227 node); 240 node);
228 if (unlikely(!ioc)) 241 if (unlikely(!ioc))
229 return; 242 return -ENOMEM;
230 243
231 /* initialize */ 244 /* initialize */
232 atomic_long_set(&ioc->refcount, 1); 245 atomic_long_set(&ioc->refcount, 1);
233 atomic_set(&ioc->nr_tasks, 1); 246 atomic_set(&ioc->active_ref, 1);
234 spin_lock_init(&ioc->lock); 247 spin_lock_init(&ioc->lock);
235 INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH); 248 INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH);
236 INIT_HLIST_HEAD(&ioc->icq_list); 249 INIT_HLIST_HEAD(&ioc->icq_list);
@@ -250,6 +263,8 @@ void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_flags,
250 else 263 else
251 kmem_cache_free(iocontext_cachep, ioc); 264 kmem_cache_free(iocontext_cachep, ioc);
252 task_unlock(task); 265 task_unlock(task);
266
267 return 0;
253} 268}
254 269
255/** 270/**
@@ -281,7 +296,7 @@ struct io_context *get_task_io_context(struct task_struct *task,
281 return ioc; 296 return ioc;
282 } 297 }
283 task_unlock(task); 298 task_unlock(task);
284 } while (create_io_context(task, gfp_flags, node)); 299 } while (!create_task_io_context(task, gfp_flags, node));
285 300
286 return NULL; 301 return NULL;
287} 302}
@@ -325,26 +340,23 @@ EXPORT_SYMBOL(ioc_lookup_icq);
325 340
326/** 341/**
327 * ioc_create_icq - create and link io_cq 342 * ioc_create_icq - create and link io_cq
343 * @ioc: io_context of interest
328 * @q: request_queue of interest 344 * @q: request_queue of interest
329 * @gfp_mask: allocation mask 345 * @gfp_mask: allocation mask
330 * 346 *
331 * Make sure io_cq linking %current->io_context and @q exists. If either 347 * Make sure io_cq linking @ioc and @q exists. If icq doesn't exist, it
332 * io_context and/or icq don't exist, they will be created using @gfp_mask. 348 * will be created using @gfp_mask.
333 * 349 *
334 * The caller is responsible for ensuring @ioc won't go away and @q is 350 * The caller is responsible for ensuring @ioc won't go away and @q is
335 * alive and will stay alive until this function returns. 351 * alive and will stay alive until this function returns.
336 */ 352 */
337struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask) 353struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
354 gfp_t gfp_mask)
338{ 355{
339 struct elevator_type *et = q->elevator->type; 356 struct elevator_type *et = q->elevator->type;
340 struct io_context *ioc;
341 struct io_cq *icq; 357 struct io_cq *icq;
342 358
343 /* allocate stuff */ 359 /* allocate stuff */
344 ioc = create_io_context(current, gfp_mask, q->node);
345 if (!ioc)
346 return NULL;
347
348 icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO, 360 icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO,
349 q->node); 361 q->node);
350 if (!icq) 362 if (!icq)
@@ -382,74 +394,6 @@ struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask)
382 return icq; 394 return icq;
383} 395}
384 396
385void ioc_set_icq_flags(struct io_context *ioc, unsigned int flags)
386{
387 struct io_cq *icq;
388 struct hlist_node *n;
389
390 hlist_for_each_entry(icq, n, &ioc->icq_list, ioc_node)
391 icq->flags |= flags;
392}
393
394/**
395 * ioc_ioprio_changed - notify ioprio change
396 * @ioc: io_context of interest
397 * @ioprio: new ioprio
398 *
399 * @ioc's ioprio has changed to @ioprio. Set %ICQ_IOPRIO_CHANGED for all
400 * icq's. iosched is responsible for checking the bit and applying it on
401 * request issue path.
402 */
403void ioc_ioprio_changed(struct io_context *ioc, int ioprio)
404{
405 unsigned long flags;
406
407 spin_lock_irqsave(&ioc->lock, flags);
408 ioc->ioprio = ioprio;
409 ioc_set_icq_flags(ioc, ICQ_IOPRIO_CHANGED);
410 spin_unlock_irqrestore(&ioc->lock, flags);
411}
412
413/**
414 * ioc_cgroup_changed - notify cgroup change
415 * @ioc: io_context of interest
416 *
417 * @ioc's cgroup has changed. Set %ICQ_CGROUP_CHANGED for all icq's.
418 * iosched is responsible for checking the bit and applying it on request
419 * issue path.
420 */
421void ioc_cgroup_changed(struct io_context *ioc)
422{
423 unsigned long flags;
424
425 spin_lock_irqsave(&ioc->lock, flags);
426 ioc_set_icq_flags(ioc, ICQ_CGROUP_CHANGED);
427 spin_unlock_irqrestore(&ioc->lock, flags);
428}
429EXPORT_SYMBOL(ioc_cgroup_changed);
430
431/**
432 * icq_get_changed - fetch and clear icq changed mask
433 * @icq: icq of interest
434 *
435 * Fetch and clear ICQ_*_CHANGED bits from @icq. Grabs and releases
436 * @icq->ioc->lock.
437 */
438unsigned icq_get_changed(struct io_cq *icq)
439{
440 unsigned int changed = 0;
441 unsigned long flags;
442
443 if (unlikely(icq->flags & ICQ_CHANGED_MASK)) {
444 spin_lock_irqsave(&icq->ioc->lock, flags);
445 changed = icq->flags & ICQ_CHANGED_MASK;
446 icq->flags &= ~ICQ_CHANGED_MASK;
447 spin_unlock_irqrestore(&icq->ioc->lock, flags);
448 }
449 return changed;
450}
451EXPORT_SYMBOL(icq_get_changed);
452
453static int __init blk_ioc_init(void) 397static int __init blk_ioc_init(void)
454{ 398{
455 iocontext_cachep = kmem_cache_create("blkdev_ioc", 399 iocontext_cachep = kmem_cache_create("blkdev_ioc",
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index cf150011d808..aa41b47c22d2 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -9,6 +9,7 @@
9#include <linux/blktrace_api.h> 9#include <linux/blktrace_api.h>
10 10
11#include "blk.h" 11#include "blk.h"
12#include "blk-cgroup.h"
12 13
13struct queue_sysfs_entry { 14struct queue_sysfs_entry {
14 struct attribute attr; 15 struct attribute attr;
@@ -479,6 +480,8 @@ static void blk_release_queue(struct kobject *kobj)
479 480
480 blk_sync_queue(q); 481 blk_sync_queue(q);
481 482
483 blkcg_exit_queue(q);
484
482 if (q->elevator) { 485 if (q->elevator) {
483 spin_lock_irq(q->queue_lock); 486 spin_lock_irq(q->queue_lock);
484 ioc_clear_queue(q); 487 ioc_clear_queue(q);
@@ -486,15 +489,12 @@ static void blk_release_queue(struct kobject *kobj)
486 elevator_exit(q->elevator); 489 elevator_exit(q->elevator);
487 } 490 }
488 491
489 blk_throtl_exit(q);
490
491 if (rl->rq_pool) 492 if (rl->rq_pool)
492 mempool_destroy(rl->rq_pool); 493 mempool_destroy(rl->rq_pool);
493 494
494 if (q->queue_tags) 495 if (q->queue_tags)
495 __blk_queue_free_tags(q); 496 __blk_queue_free_tags(q);
496 497
497 blk_throtl_release(q);
498 blk_trace_shutdown(q); 498 blk_trace_shutdown(q);
499 499
500 bdi_destroy(&q->backing_dev_info); 500 bdi_destroy(&q->backing_dev_info);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index f2ddb94626bd..14dedecfc7e8 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -21,6 +21,8 @@ static int throtl_quantum = 32;
21/* Throttling is performed over 100ms slice and after that slice is renewed */ 21/* Throttling is performed over 100ms slice and after that slice is renewed */
22static unsigned long throtl_slice = HZ/10; /* 100 ms */ 22static unsigned long throtl_slice = HZ/10; /* 100 ms */
23 23
24static struct blkcg_policy blkcg_policy_throtl;
25
24/* A workqueue to queue throttle related work */ 26/* A workqueue to queue throttle related work */
25static struct workqueue_struct *kthrotld_workqueue; 27static struct workqueue_struct *kthrotld_workqueue;
26static void throtl_schedule_delayed_work(struct throtl_data *td, 28static void throtl_schedule_delayed_work(struct throtl_data *td,
@@ -38,9 +40,17 @@ struct throtl_rb_root {
38 40
39#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) 41#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
40 42
43/* Per-cpu group stats */
44struct tg_stats_cpu {
45 /* total bytes transferred */
46 struct blkg_rwstat service_bytes;
47 /* total IOs serviced, post merge */
48 struct blkg_rwstat serviced;
49};
50
41struct throtl_grp { 51struct throtl_grp {
42 /* List of throtl groups on the request queue*/ 52 /* must be the first member */
43 struct hlist_node tg_node; 53 struct blkg_policy_data pd;
44 54
45 /* active throtl group service_tree member */ 55 /* active throtl group service_tree member */
46 struct rb_node rb_node; 56 struct rb_node rb_node;
@@ -52,8 +62,6 @@ struct throtl_grp {
52 */ 62 */
53 unsigned long disptime; 63 unsigned long disptime;
54 64
55 struct blkio_group blkg;
56 atomic_t ref;
57 unsigned int flags; 65 unsigned int flags;
58 66
59 /* Two lists for READ and WRITE */ 67 /* Two lists for READ and WRITE */
@@ -80,18 +88,18 @@ struct throtl_grp {
80 /* Some throttle limits got updated for the group */ 88 /* Some throttle limits got updated for the group */
81 int limits_changed; 89 int limits_changed;
82 90
83 struct rcu_head rcu_head; 91 /* Per cpu stats pointer */
92 struct tg_stats_cpu __percpu *stats_cpu;
93
94 /* List of tgs waiting for per cpu stats memory to be allocated */
95 struct list_head stats_alloc_node;
84}; 96};
85 97
86struct throtl_data 98struct throtl_data
87{ 99{
88 /* List of throtl groups */
89 struct hlist_head tg_list;
90
91 /* service tree for active throtl groups */ 100 /* service tree for active throtl groups */
92 struct throtl_rb_root tg_service_tree; 101 struct throtl_rb_root tg_service_tree;
93 102
94 struct throtl_grp *root_tg;
95 struct request_queue *queue; 103 struct request_queue *queue;
96 104
97 /* Total Number of queued bios on READ and WRITE lists */ 105 /* Total Number of queued bios on READ and WRITE lists */
@@ -108,6 +116,33 @@ struct throtl_data
108 int limits_changed; 116 int limits_changed;
109}; 117};
110 118
119/* list and work item to allocate percpu group stats */
120static DEFINE_SPINLOCK(tg_stats_alloc_lock);
121static LIST_HEAD(tg_stats_alloc_list);
122
123static void tg_stats_alloc_fn(struct work_struct *);
124static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);
125
126static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
127{
128 return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
129}
130
131static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
132{
133 return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
134}
135
136static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
137{
138 return pd_to_blkg(&tg->pd);
139}
140
141static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
142{
143 return blkg_to_tg(td->queue->root_blkg);
144}
145
111enum tg_state_flags { 146enum tg_state_flags {
112 THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */ 147 THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */
113}; 148};
@@ -128,244 +163,148 @@ static inline int throtl_tg_##name(const struct throtl_grp *tg) \
128 163
129THROTL_TG_FNS(on_rr); 164THROTL_TG_FNS(on_rr);
130 165
131#define throtl_log_tg(td, tg, fmt, args...) \ 166#define throtl_log_tg(td, tg, fmt, args...) do { \
132 blk_add_trace_msg((td)->queue, "throtl %s " fmt, \ 167 char __pbuf[128]; \
133 blkg_path(&(tg)->blkg), ##args); \ 168 \
169 blkg_path(tg_to_blkg(tg), __pbuf, sizeof(__pbuf)); \
170 blk_add_trace_msg((td)->queue, "throtl %s " fmt, __pbuf, ##args); \
171} while (0)
134 172
135#define throtl_log(td, fmt, args...) \ 173#define throtl_log(td, fmt, args...) \
136 blk_add_trace_msg((td)->queue, "throtl " fmt, ##args) 174 blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
137 175
138static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg)
139{
140 if (blkg)
141 return container_of(blkg, struct throtl_grp, blkg);
142
143 return NULL;
144}
145
146static inline unsigned int total_nr_queued(struct throtl_data *td) 176static inline unsigned int total_nr_queued(struct throtl_data *td)
147{ 177{
148 return td->nr_queued[0] + td->nr_queued[1]; 178 return td->nr_queued[0] + td->nr_queued[1];
149} 179}
150 180
151static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg) 181/*
152{ 182 * Worker for allocating per cpu stat for tgs. This is scheduled on the
153 atomic_inc(&tg->ref); 183 * system_nrt_wq once there are some groups on the alloc_list waiting for
154 return tg; 184 * allocation.
155} 185 */
156 186static void tg_stats_alloc_fn(struct work_struct *work)
157static void throtl_free_tg(struct rcu_head *head)
158{ 187{
159 struct throtl_grp *tg; 188 static struct tg_stats_cpu *stats_cpu; /* this fn is non-reentrant */
189 struct delayed_work *dwork = to_delayed_work(work);
190 bool empty = false;
191
192alloc_stats:
193 if (!stats_cpu) {
194 stats_cpu = alloc_percpu(struct tg_stats_cpu);
195 if (!stats_cpu) {
196 /* allocation failed, try again after some time */
197 queue_delayed_work(system_nrt_wq, dwork,
198 msecs_to_jiffies(10));
199 return;
200 }
201 }
160 202
161 tg = container_of(head, struct throtl_grp, rcu_head); 203 spin_lock_irq(&tg_stats_alloc_lock);
162 free_percpu(tg->blkg.stats_cpu);
163 kfree(tg);
164}
165 204
166static void throtl_put_tg(struct throtl_grp *tg) 205 if (!list_empty(&tg_stats_alloc_list)) {
167{ 206 struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list,
168 BUG_ON(atomic_read(&tg->ref) <= 0); 207 struct throtl_grp,
169 if (!atomic_dec_and_test(&tg->ref)) 208 stats_alloc_node);
170 return; 209 swap(tg->stats_cpu, stats_cpu);
210 list_del_init(&tg->stats_alloc_node);
211 }
171 212
172 /* 213 empty = list_empty(&tg_stats_alloc_list);
173 * A group is freed in rcu manner. But having an rcu lock does not 214 spin_unlock_irq(&tg_stats_alloc_lock);
174 * mean that one can access all the fields of blkg and assume these 215 if (!empty)
175 * are valid. For example, don't try to follow throtl_data and 216 goto alloc_stats;
176 * request queue links.
177 *
178 * Having a reference to blkg under an rcu allows acess to only
179 * values local to groups like group stats and group rate limits
180 */
181 call_rcu(&tg->rcu_head, throtl_free_tg);
182} 217}
183 218
184static void throtl_init_group(struct throtl_grp *tg) 219static void throtl_pd_init(struct blkcg_gq *blkg)
185{ 220{
186 INIT_HLIST_NODE(&tg->tg_node); 221 struct throtl_grp *tg = blkg_to_tg(blkg);
222
187 RB_CLEAR_NODE(&tg->rb_node); 223 RB_CLEAR_NODE(&tg->rb_node);
188 bio_list_init(&tg->bio_lists[0]); 224 bio_list_init(&tg->bio_lists[0]);
189 bio_list_init(&tg->bio_lists[1]); 225 bio_list_init(&tg->bio_lists[1]);
190 tg->limits_changed = false; 226 tg->limits_changed = false;
191 227
192 /* Practically unlimited BW */ 228 tg->bps[READ] = -1;
193 tg->bps[0] = tg->bps[1] = -1; 229 tg->bps[WRITE] = -1;
194 tg->iops[0] = tg->iops[1] = -1; 230 tg->iops[READ] = -1;
195 231 tg->iops[WRITE] = -1;
196 /*
197 * Take the initial reference that will be released on destroy
198 * This can be thought of a joint reference by cgroup and
199 * request queue which will be dropped by either request queue
200 * exit or cgroup deletion path depending on who is exiting first.
201 */
202 atomic_set(&tg->ref, 1);
203}
204
205/* Should be called with rcu read lock held (needed for blkcg) */
206static void
207throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg)
208{
209 hlist_add_head(&tg->tg_node, &td->tg_list);
210 td->nr_undestroyed_grps++;
211}
212
213static void
214__throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
215{
216 struct backing_dev_info *bdi = &td->queue->backing_dev_info;
217 unsigned int major, minor;
218
219 if (!tg || tg->blkg.dev)
220 return;
221 232
222 /* 233 /*
223 * Fill in device details for a group which might not have been 234 * Ugh... We need to perform per-cpu allocation for tg->stats_cpu
224 * filled at group creation time as queue was being instantiated 235 * but percpu allocator can't be called from IO path. Queue tg on
225 * and driver had not attached a device yet 236 * tg_stats_alloc_list and allocate from work item.
226 */ 237 */
227 if (bdi->dev && dev_name(bdi->dev)) { 238 spin_lock(&tg_stats_alloc_lock);
228 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); 239 list_add(&tg->stats_alloc_node, &tg_stats_alloc_list);
229 tg->blkg.dev = MKDEV(major, minor); 240 queue_delayed_work(system_nrt_wq, &tg_stats_alloc_work, 0);
230 } 241 spin_unlock(&tg_stats_alloc_lock);
231} 242}
232 243
233/* 244static void throtl_pd_exit(struct blkcg_gq *blkg)
234 * Should be called without queue lock held. Here queue lock will be
235 * taken rarely. It will be taken only once during life time of a group
236 * if need be
237 */
238static void
239throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
240{ 245{
241 if (!tg || tg->blkg.dev) 246 struct throtl_grp *tg = blkg_to_tg(blkg);
242 return;
243 247
244 spin_lock_irq(td->queue->queue_lock); 248 spin_lock(&tg_stats_alloc_lock);
245 __throtl_tg_fill_dev_details(td, tg); 249 list_del_init(&tg->stats_alloc_node);
246 spin_unlock_irq(td->queue->queue_lock); 250 spin_unlock(&tg_stats_alloc_lock);
247}
248 251
249static void throtl_init_add_tg_lists(struct throtl_data *td, 252 free_percpu(tg->stats_cpu);
250 struct throtl_grp *tg, struct blkio_cgroup *blkcg)
251{
252 __throtl_tg_fill_dev_details(td, tg);
253
254 /* Add group onto cgroup list */
255 blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td,
256 tg->blkg.dev, BLKIO_POLICY_THROTL);
257
258 tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
259 tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev);
260 tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev);
261 tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev);
262
263 throtl_add_group_to_td_list(td, tg);
264} 253}
265 254
266/* Should be called without queue lock and outside of rcu period */ 255static void throtl_pd_reset_stats(struct blkcg_gq *blkg)
267static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td)
268{ 256{
269 struct throtl_grp *tg = NULL; 257 struct throtl_grp *tg = blkg_to_tg(blkg);
270 int ret; 258 int cpu;
271 259
272 tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node); 260 if (tg->stats_cpu == NULL)
273 if (!tg) 261 return;
274 return NULL;
275 262
276 ret = blkio_alloc_blkg_stats(&tg->blkg); 263 for_each_possible_cpu(cpu) {
264 struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
277 265
278 if (ret) { 266 blkg_rwstat_reset(&sc->service_bytes);
279 kfree(tg); 267 blkg_rwstat_reset(&sc->serviced);
280 return NULL;
281 } 268 }
282
283 throtl_init_group(tg);
284 return tg;
285} 269}
286 270
287static struct 271static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td,
288throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg) 272 struct blkcg *blkcg)
289{ 273{
290 struct throtl_grp *tg = NULL;
291 void *key = td;
292
293 /* 274 /*
294 * This is the common case when there are no blkio cgroups. 275 * This is the common case when there are no blkcgs. Avoid lookup
295 * Avoid lookup in this case 276 * in this case
296 */ 277 */
297 if (blkcg == &blkio_root_cgroup) 278 if (blkcg == &blkcg_root)
298 tg = td->root_tg; 279 return td_root_tg(td);
299 else
300 tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
301 280
302 __throtl_tg_fill_dev_details(td, tg); 281 return blkg_to_tg(blkg_lookup(blkcg, td->queue));
303 return tg;
304} 282}
305 283
306static struct throtl_grp * throtl_get_tg(struct throtl_data *td) 284static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
285 struct blkcg *blkcg)
307{ 286{
308 struct throtl_grp *tg = NULL, *__tg = NULL;
309 struct blkio_cgroup *blkcg;
310 struct request_queue *q = td->queue; 287 struct request_queue *q = td->queue;
311 288 struct throtl_grp *tg = NULL;
312 /* no throttling for dead queue */
313 if (unlikely(blk_queue_dead(q)))
314 return NULL;
315
316 rcu_read_lock();
317 blkcg = task_blkio_cgroup(current);
318 tg = throtl_find_tg(td, blkcg);
319 if (tg) {
320 rcu_read_unlock();
321 return tg;
322 }
323
324 /*
325 * Need to allocate a group. Allocation of group also needs allocation
326 * of per cpu stats which in-turn takes a mutex() and can block. Hence
327 * we need to drop rcu lock and queue_lock before we call alloc.
328 */
329 rcu_read_unlock();
330 spin_unlock_irq(q->queue_lock);
331
332 tg = throtl_alloc_tg(td);
333
334 /* Group allocated and queue is still alive. take the lock */
335 spin_lock_irq(q->queue_lock);
336
337 /* Make sure @q is still alive */
338 if (unlikely(blk_queue_dead(q))) {
339 kfree(tg);
340 return NULL;
341 }
342
343 /*
344 * Initialize the new group. After sleeping, read the blkcg again.
345 */
346 rcu_read_lock();
347 blkcg = task_blkio_cgroup(current);
348 289
349 /* 290 /*
350 * If some other thread already allocated the group while we were 291 * This is the common case when there are no blkcgs. Avoid lookup
351 * not holding queue lock, free up the group 292 * in this case
352 */ 293 */
353 __tg = throtl_find_tg(td, blkcg); 294 if (blkcg == &blkcg_root) {
354 295 tg = td_root_tg(td);
355 if (__tg) { 296 } else {
356 kfree(tg); 297 struct blkcg_gq *blkg;
357 rcu_read_unlock(); 298
358 return __tg; 299 blkg = blkg_lookup_create(blkcg, q);
359 } 300
360 301 /* if blkg creation failed (ERR_PTR) and @q is alive, fall back to root_tg */
361 /* Group allocation failed. Account the IO to root group */ 302 if (!IS_ERR(blkg))
362 if (!tg) { 303 tg = blkg_to_tg(blkg);
363 tg = td->root_tg; 304 else if (!blk_queue_dead(q))
364 return tg; 305 tg = td_root_tg(td);
365 } 306 }
366 307
367 throtl_init_add_tg_lists(td, tg, blkcg);
368 rcu_read_unlock();
369 return tg; 308 return tg;
370} 309}
371 310
@@ -734,16 +673,41 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
734 return 0; 673 return 0;
735} 674}
736 675
676static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes,
677 int rw)
678{
679 struct throtl_grp *tg = blkg_to_tg(blkg);
680 struct tg_stats_cpu *stats_cpu;
681 unsigned long flags;
682
683 /* If per cpu stats are not allocated yet, don't do any accounting. */
684 if (tg->stats_cpu == NULL)
685 return;
686
687 /*
688 * Disabling interrupts to provide mutual exclusion between two
689 * writes on same cpu. It probably is not needed for 64bit. Not
690 * optimizing that case yet.
691 */
692 local_irq_save(flags);
693
694 stats_cpu = this_cpu_ptr(tg->stats_cpu);
695
696 blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
697 blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);
698
699 local_irq_restore(flags);
700}
701
737static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) 702static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
738{ 703{
739 bool rw = bio_data_dir(bio); 704 bool rw = bio_data_dir(bio);
740 bool sync = rw_is_sync(bio->bi_rw);
741 705
742 /* Charge the bio to the group */ 706 /* Charge the bio to the group */
743 tg->bytes_disp[rw] += bio->bi_size; 707 tg->bytes_disp[rw] += bio->bi_size;
744 tg->io_disp[rw]++; 708 tg->io_disp[rw]++;
745 709
746 blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync); 710 throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw);
747} 711}
748 712
749static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, 713static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
@@ -753,7 +717,7 @@ static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
753 717
754 bio_list_add(&tg->bio_lists[rw], bio); 718 bio_list_add(&tg->bio_lists[rw], bio);
755 /* Take a bio reference on tg */ 719 /* Take a bio reference on tg */
756 throtl_ref_get_tg(tg); 720 blkg_get(tg_to_blkg(tg));
757 tg->nr_queued[rw]++; 721 tg->nr_queued[rw]++;
758 td->nr_queued[rw]++; 722 td->nr_queued[rw]++;
759 throtl_enqueue_tg(td, tg); 723 throtl_enqueue_tg(td, tg);
@@ -786,8 +750,8 @@ static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
786 750
787 bio = bio_list_pop(&tg->bio_lists[rw]); 751 bio = bio_list_pop(&tg->bio_lists[rw]);
788 tg->nr_queued[rw]--; 752 tg->nr_queued[rw]--;
789 /* Drop bio reference on tg */ 753 /* Drop bio reference on blkg */
790 throtl_put_tg(tg); 754 blkg_put(tg_to_blkg(tg));
791 755
792 BUG_ON(td->nr_queued[rw] <= 0); 756 BUG_ON(td->nr_queued[rw] <= 0);
793 td->nr_queued[rw]--; 757 td->nr_queued[rw]--;
@@ -865,8 +829,8 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
865 829
866static void throtl_process_limit_change(struct throtl_data *td) 830static void throtl_process_limit_change(struct throtl_data *td)
867{ 831{
868 struct throtl_grp *tg; 832 struct request_queue *q = td->queue;
869 struct hlist_node *pos, *n; 833 struct blkcg_gq *blkg, *n;
870 834
871 if (!td->limits_changed) 835 if (!td->limits_changed)
872 return; 836 return;
@@ -875,7 +839,9 @@ static void throtl_process_limit_change(struct throtl_data *td)
875 839
876 throtl_log(td, "limits changed"); 840 throtl_log(td, "limits changed");
877 841
878 hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { 842 list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
843 struct throtl_grp *tg = blkg_to_tg(blkg);
844
879 if (!tg->limits_changed) 845 if (!tg->limits_changed)
880 continue; 846 continue;
881 847
@@ -973,120 +939,159 @@ throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
973 } 939 }
974} 940}
975 941
976static void 942static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
977throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg) 943 struct blkg_policy_data *pd, int off)
978{ 944{
979 /* Something wrong if we are trying to remove same group twice */ 945 struct throtl_grp *tg = pd_to_tg(pd);
980 BUG_ON(hlist_unhashed(&tg->tg_node)); 946 struct blkg_rwstat rwstat = { }, tmp;
947 int i, cpu;
981 948
982 hlist_del_init(&tg->tg_node); 949 for_each_possible_cpu(cpu) {
950 struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
983 951
984 /* 952 tmp = blkg_rwstat_read((void *)sc + off);
985 * Put the reference taken at the time of creation so that when all 953 for (i = 0; i < BLKG_RWSTAT_NR; i++)
986 * queues are gone, group can be destroyed. 954 rwstat.cnt[i] += tmp.cnt[i];
987 */ 955 }
988 throtl_put_tg(tg); 956
989 td->nr_undestroyed_grps--; 957 return __blkg_prfill_rwstat(sf, pd, &rwstat);
990} 958}
991 959
992static void throtl_release_tgs(struct throtl_data *td) 960static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
961 struct seq_file *sf)
993{ 962{
994 struct hlist_node *pos, *n; 963 struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
995 struct throtl_grp *tg;
996 964
997 hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { 965 blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl,
998 /* 966 cft->private, true);
999 * If cgroup removal path got to blk_group first and removed 967 return 0;
1000 * it from cgroup list, then it will take care of destroying
1001 * cfqg also.
1002 */
1003 if (!blkiocg_del_blkio_group(&tg->blkg))
1004 throtl_destroy_tg(td, tg);
1005 }
1006} 968}
1007 969
1008/* 970static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
1009 * Blk cgroup controller notification saying that blkio_group object is being 971 int off)
1010 * delinked as associated cgroup object is going away. That also means that
1011 * no new IO will come in this group. So get rid of this group as soon as
1012 * any pending IO in the group is finished.
1013 *
1014 * This function is called under rcu_read_lock(). key is the rcu protected
1015 * pointer. That means "key" is a valid throtl_data pointer as long as we are
1016 * under rcu read lock.
1017 *
1018 * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
1019 * it should not be NULL as even if queue was going away, cgroup deletion
1020 * path got to it first.
1021 */
1022void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg)
1023{ 972{
1024 unsigned long flags; 973 struct throtl_grp *tg = pd_to_tg(pd);
1025 struct throtl_data *td = key; 974 u64 v = *(u64 *)((void *)tg + off);
1026 975
1027 spin_lock_irqsave(td->queue->queue_lock, flags); 976 if (v == -1)
1028 throtl_destroy_tg(td, tg_of_blkg(blkg)); 977 return 0;
1029 spin_unlock_irqrestore(td->queue->queue_lock, flags); 978 return __blkg_prfill_u64(sf, pd, v);
1030} 979}
1031 980
1032static void throtl_update_blkio_group_common(struct throtl_data *td, 981static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
1033 struct throtl_grp *tg) 982 int off)
1034{ 983{
1035 xchg(&tg->limits_changed, true); 984 struct throtl_grp *tg = pd_to_tg(pd);
1036 xchg(&td->limits_changed, true); 985 unsigned int v = *(unsigned int *)((void *)tg + off);
1037 /* Schedule a work now to process the limit change */ 986
1038 throtl_schedule_delayed_work(td, 0); 987 if (v == -1)
988 return 0;
989 return __blkg_prfill_u64(sf, pd, v);
1039} 990}
1040 991
1041/* 992static int tg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft,
1042 * For all update functions, key should be a valid pointer because these 993 struct seq_file *sf)
1043 * update functions are called under blkcg_lock, that means, blkg is
1044 * valid and in turn key is valid. queue exit path can not race because
1045 * of blkcg_lock
1046 *
1047 * Can not take queue lock in update functions as queue lock under blkcg_lock
1048 * is not allowed. Under other paths we take blkcg_lock under queue_lock.
1049 */
1050static void throtl_update_blkio_group_read_bps(void *key,
1051 struct blkio_group *blkg, u64 read_bps)
1052{ 994{
1053 struct throtl_data *td = key; 995 blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_u64,
1054 struct throtl_grp *tg = tg_of_blkg(blkg); 996 &blkcg_policy_throtl, cft->private, false);
1055 997 return 0;
1056 tg->bps[READ] = read_bps;
1057 throtl_update_blkio_group_common(td, tg);
1058} 998}
1059 999
1060static void throtl_update_blkio_group_write_bps(void *key, 1000static int tg_print_conf_uint(struct cgroup *cgrp, struct cftype *cft,
1061 struct blkio_group *blkg, u64 write_bps) 1001 struct seq_file *sf)
1062{ 1002{
1063 struct throtl_data *td = key; 1003 blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_uint,
1064 struct throtl_grp *tg = tg_of_blkg(blkg); 1004 &blkcg_policy_throtl, cft->private, false);
1065 1005 return 0;
1066 tg->bps[WRITE] = write_bps;
1067 throtl_update_blkio_group_common(td, tg);
1068} 1006}
1069 1007
1070static void throtl_update_blkio_group_read_iops(void *key, 1008static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
1071 struct blkio_group *blkg, unsigned int read_iops) 1009 bool is_u64)
1072{ 1010{
1073 struct throtl_data *td = key; 1011 struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1074 struct throtl_grp *tg = tg_of_blkg(blkg); 1012 struct blkg_conf_ctx ctx;
1013 struct throtl_grp *tg;
1014 struct throtl_data *td;
1015 int ret;
1016
1017 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
1018 if (ret)
1019 return ret;
1020
1021 tg = blkg_to_tg(ctx.blkg);
1022 td = ctx.blkg->q->td;
1075 1023
1076 tg->iops[READ] = read_iops; 1024 if (!ctx.v)
1077 throtl_update_blkio_group_common(td, tg); 1025 ctx.v = -1;
1026
1027 if (is_u64)
1028 *(u64 *)((void *)tg + cft->private) = ctx.v;
1029 else
1030 *(unsigned int *)((void *)tg + cft->private) = ctx.v;
1031
1032 /* XXX: we don't need the following deferred processing */
1033 xchg(&tg->limits_changed, true);
1034 xchg(&td->limits_changed, true);
1035 throtl_schedule_delayed_work(td, 0);
1036
1037 blkg_conf_finish(&ctx);
1038 return 0;
1078} 1039}
1079 1040
1080static void throtl_update_blkio_group_write_iops(void *key, 1041static int tg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft,
1081 struct blkio_group *blkg, unsigned int write_iops) 1042 const char *buf)
1082{ 1043{
1083 struct throtl_data *td = key; 1044 return tg_set_conf(cgrp, cft, buf, true);
1084 struct throtl_grp *tg = tg_of_blkg(blkg); 1045}
1085 1046
1086 tg->iops[WRITE] = write_iops; 1047static int tg_set_conf_uint(struct cgroup *cgrp, struct cftype *cft,
1087 throtl_update_blkio_group_common(td, tg); 1048 const char *buf)
1049{
1050 return tg_set_conf(cgrp, cft, buf, false);
1088} 1051}
1089 1052
1053static struct cftype throtl_files[] = {
1054 {
1055 .name = "throttle.read_bps_device",
1056 .private = offsetof(struct throtl_grp, bps[READ]),
1057 .read_seq_string = tg_print_conf_u64,
1058 .write_string = tg_set_conf_u64,
1059 .max_write_len = 256,
1060 },
1061 {
1062 .name = "throttle.write_bps_device",
1063 .private = offsetof(struct throtl_grp, bps[WRITE]),
1064 .read_seq_string = tg_print_conf_u64,
1065 .write_string = tg_set_conf_u64,
1066 .max_write_len = 256,
1067 },
1068 {
1069 .name = "throttle.read_iops_device",
1070 .private = offsetof(struct throtl_grp, iops[READ]),
1071 .read_seq_string = tg_print_conf_uint,
1072 .write_string = tg_set_conf_uint,
1073 .max_write_len = 256,
1074 },
1075 {
1076 .name = "throttle.write_iops_device",
1077 .private = offsetof(struct throtl_grp, iops[WRITE]),
1078 .read_seq_string = tg_print_conf_uint,
1079 .write_string = tg_set_conf_uint,
1080 .max_write_len = 256,
1081 },
1082 {
1083 .name = "throttle.io_service_bytes",
1084 .private = offsetof(struct tg_stats_cpu, service_bytes),
1085 .read_seq_string = tg_print_cpu_rwstat,
1086 },
1087 {
1088 .name = "throttle.io_serviced",
1089 .private = offsetof(struct tg_stats_cpu, serviced),
1090 .read_seq_string = tg_print_cpu_rwstat,
1091 },
1092 { } /* terminate */
1093};
1094
1090static void throtl_shutdown_wq(struct request_queue *q) 1095static void throtl_shutdown_wq(struct request_queue *q)
1091{ 1096{
1092 struct throtl_data *td = q->td; 1097 struct throtl_data *td = q->td;
@@ -1094,19 +1099,13 @@ static void throtl_shutdown_wq(struct request_queue *q)
1094 cancel_delayed_work_sync(&td->throtl_work); 1099 cancel_delayed_work_sync(&td->throtl_work);
1095} 1100}
1096 1101
1097static struct blkio_policy_type blkio_policy_throtl = { 1102static struct blkcg_policy blkcg_policy_throtl = {
1098 .ops = { 1103 .pd_size = sizeof(struct throtl_grp),
1099 .blkio_unlink_group_fn = throtl_unlink_blkio_group, 1104 .cftypes = throtl_files,
1100 .blkio_update_group_read_bps_fn = 1105
1101 throtl_update_blkio_group_read_bps, 1106 .pd_init_fn = throtl_pd_init,
1102 .blkio_update_group_write_bps_fn = 1107 .pd_exit_fn = throtl_pd_exit,
1103 throtl_update_blkio_group_write_bps, 1108 .pd_reset_stats_fn = throtl_pd_reset_stats,
1104 .blkio_update_group_read_iops_fn =
1105 throtl_update_blkio_group_read_iops,
1106 .blkio_update_group_write_iops_fn =
1107 throtl_update_blkio_group_write_iops,
1108 },
1109 .plid = BLKIO_POLICY_THROTL,
1110}; 1109};
1111 1110
1112bool blk_throtl_bio(struct request_queue *q, struct bio *bio) 1111bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
@@ -1114,7 +1113,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
1114 struct throtl_data *td = q->td; 1113 struct throtl_data *td = q->td;
1115 struct throtl_grp *tg; 1114 struct throtl_grp *tg;
1116 bool rw = bio_data_dir(bio), update_disptime = true; 1115 bool rw = bio_data_dir(bio), update_disptime = true;
1117 struct blkio_cgroup *blkcg; 1116 struct blkcg *blkcg;
1118 bool throttled = false; 1117 bool throttled = false;
1119 1118
1120 if (bio->bi_rw & REQ_THROTTLED) { 1119 if (bio->bi_rw & REQ_THROTTLED) {
@@ -1122,33 +1121,31 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
1122 goto out; 1121 goto out;
1123 } 1122 }
1124 1123
1124 /* bio_associate_current() needs ioc, try creating */
1125 create_io_context(GFP_ATOMIC, q->node);
1126
1125 /* 1127 /*
1126 * A throtl_grp pointer retrieved under rcu can be used to access 1128 * A throtl_grp pointer retrieved under rcu can be used to access
1127 * basic fields like stats and io rates. If a group has no rules, 1129 * basic fields like stats and io rates. If a group has no rules,
1128 * just update the dispatch stats in lockless manner and return. 1130 * just update the dispatch stats in lockless manner and return.
1129 */ 1131 */
1130
1131 rcu_read_lock(); 1132 rcu_read_lock();
1132 blkcg = task_blkio_cgroup(current); 1133 blkcg = bio_blkcg(bio);
1133 tg = throtl_find_tg(td, blkcg); 1134 tg = throtl_lookup_tg(td, blkcg);
1134 if (tg) { 1135 if (tg) {
1135 throtl_tg_fill_dev_details(td, tg);
1136
1137 if (tg_no_rule_group(tg, rw)) { 1136 if (tg_no_rule_group(tg, rw)) {
1138 blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, 1137 throtl_update_dispatch_stats(tg_to_blkg(tg),
1139 rw, rw_is_sync(bio->bi_rw)); 1138 bio->bi_size, bio->bi_rw);
1140 rcu_read_unlock(); 1139 goto out_unlock_rcu;
1141 goto out;
1142 } 1140 }
1143 } 1141 }
1144 rcu_read_unlock();
1145 1142
1146 /* 1143 /*
1147 * Either group has not been allocated yet or it is not an unlimited 1144 * Either group has not been allocated yet or it is not an unlimited
1148 * IO group 1145 * IO group
1149 */ 1146 */
1150 spin_lock_irq(q->queue_lock); 1147 spin_lock_irq(q->queue_lock);
1151 tg = throtl_get_tg(td); 1148 tg = throtl_lookup_create_tg(td, blkcg);
1152 if (unlikely(!tg)) 1149 if (unlikely(!tg))
1153 goto out_unlock; 1150 goto out_unlock;
1154 1151
@@ -1189,6 +1186,7 @@ queue_bio:
1189 tg->io_disp[rw], tg->iops[rw], 1186 tg->io_disp[rw], tg->iops[rw],
1190 tg->nr_queued[READ], tg->nr_queued[WRITE]); 1187 tg->nr_queued[READ], tg->nr_queued[WRITE]);
1191 1188
1189 bio_associate_current(bio);
1192 throtl_add_bio_tg(q->td, tg, bio); 1190 throtl_add_bio_tg(q->td, tg, bio);
1193 throttled = true; 1191 throttled = true;
1194 1192
@@ -1199,6 +1197,8 @@ queue_bio:
1199 1197
1200out_unlock: 1198out_unlock:
1201 spin_unlock_irq(q->queue_lock); 1199 spin_unlock_irq(q->queue_lock);
1200out_unlock_rcu:
1201 rcu_read_unlock();
1202out: 1202out:
1203 return throttled; 1203 return throttled;
1204} 1204}
@@ -1241,79 +1241,31 @@ void blk_throtl_drain(struct request_queue *q)
1241int blk_throtl_init(struct request_queue *q) 1241int blk_throtl_init(struct request_queue *q)
1242{ 1242{
1243 struct throtl_data *td; 1243 struct throtl_data *td;
1244 struct throtl_grp *tg; 1244 int ret;
1245 1245
1246 td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); 1246 td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
1247 if (!td) 1247 if (!td)
1248 return -ENOMEM; 1248 return -ENOMEM;
1249 1249
1250 INIT_HLIST_HEAD(&td->tg_list);
1251 td->tg_service_tree = THROTL_RB_ROOT; 1250 td->tg_service_tree = THROTL_RB_ROOT;
1252 td->limits_changed = false; 1251 td->limits_changed = false;
1253 INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); 1252 INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
1254 1253
1255 /* alloc and Init root group. */ 1254 q->td = td;
1256 td->queue = q; 1255 td->queue = q;
1257 tg = throtl_alloc_tg(td);
1258 1256
1259 if (!tg) { 1257 /* activate policy */
1258 ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
1259 if (ret)
1260 kfree(td); 1260 kfree(td);
1261 return -ENOMEM; 1261 return ret;
1262 }
1263
1264 td->root_tg = tg;
1265
1266 rcu_read_lock();
1267 throtl_init_add_tg_lists(td, tg, &blkio_root_cgroup);
1268 rcu_read_unlock();
1269
1270 /* Attach throtl data to request queue */
1271 q->td = td;
1272 return 0;
1273} 1262}
1274 1263
1275void blk_throtl_exit(struct request_queue *q) 1264void blk_throtl_exit(struct request_queue *q)
1276{ 1265{
1277 struct throtl_data *td = q->td; 1266 BUG_ON(!q->td);
1278 bool wait = false;
1279
1280 BUG_ON(!td);
1281
1282 throtl_shutdown_wq(q); 1267 throtl_shutdown_wq(q);
1283 1268 blkcg_deactivate_policy(q, &blkcg_policy_throtl);
1284 spin_lock_irq(q->queue_lock);
1285 throtl_release_tgs(td);
1286
1287 /* If there are other groups */
1288 if (td->nr_undestroyed_grps > 0)
1289 wait = true;
1290
1291 spin_unlock_irq(q->queue_lock);
1292
1293 /*
1294 * Wait for tg->blkg->key accessors to exit their grace periods.
1295 * Do this wait only if there are other undestroyed groups out
1296 * there (other than root group). This can happen if cgroup deletion
1297 * path claimed the responsibility of cleaning up a group before
1298 * queue cleanup code get to the group.
1299 *
1300 * Do not call synchronize_rcu() unconditionally as there are drivers
1301 * which create/delete request queue hundreds of times during scan/boot
1302 * and synchronize_rcu() can take significant time and slow down boot.
1303 */
1304 if (wait)
1305 synchronize_rcu();
1306
1307 /*
1308 * Just being safe to make sure after previous flush if some body did
1309 * update limits through cgroup and another work got queued, cancel
1310 * it.
1311 */
1312 throtl_shutdown_wq(q);
1313}
1314
1315void blk_throtl_release(struct request_queue *q)
1316{
1317 kfree(q->td); 1269 kfree(q->td);
1318} 1270}
1319 1271
@@ -1323,8 +1275,7 @@ static int __init throtl_init(void)
1323 if (!kthrotld_workqueue) 1275 if (!kthrotld_workqueue)
1324 panic("Failed to create kthrotld\n"); 1276 panic("Failed to create kthrotld\n");
1325 1277
1326 blkio_policy_register(&blkio_policy_throtl); 1278 return blkcg_policy_register(&blkcg_policy_throtl);
1327 return 0;
1328} 1279}
1329 1280
1330module_init(throtl_init); 1281module_init(throtl_init);
diff --git a/block/blk.h b/block/blk.h
index d45be871329e..85f6ae42f7d3 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -23,7 +23,8 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
23 struct bio *bio); 23 struct bio *bio);
24int blk_rq_append_bio(struct request_queue *q, struct request *rq, 24int blk_rq_append_bio(struct request_queue *q, struct request *rq,
25 struct bio *bio); 25 struct bio *bio);
26void blk_drain_queue(struct request_queue *q, bool drain_all); 26void blk_queue_bypass_start(struct request_queue *q);
27void blk_queue_bypass_end(struct request_queue *q);
27void blk_dequeue_request(struct request *rq); 28void blk_dequeue_request(struct request *rq);
28void __blk_queue_free_tags(struct request_queue *q); 29void __blk_queue_free_tags(struct request_queue *q);
29bool __blk_end_bidi_request(struct request *rq, int error, 30bool __blk_end_bidi_request(struct request *rq, int error,
@@ -144,9 +145,6 @@ void blk_queue_congestion_threshold(struct request_queue *q);
144 145
145int blk_dev_init(void); 146int blk_dev_init(void);
146 147
147void elv_quiesce_start(struct request_queue *q);
148void elv_quiesce_end(struct request_queue *q);
149
150 148
151/* 149/*
152 * Return the threshold (number of used requests) at which the queue is 150 * Return the threshold (number of used requests) at which the queue is
@@ -186,32 +184,30 @@ static inline int blk_do_io_stat(struct request *rq)
186 */ 184 */
187void get_io_context(struct io_context *ioc); 185void get_io_context(struct io_context *ioc);
188struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q); 186struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q);
189struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask); 187struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
188 gfp_t gfp_mask);
190void ioc_clear_queue(struct request_queue *q); 189void ioc_clear_queue(struct request_queue *q);
191 190
192void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_mask, 191int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node);
193 int node);
194 192
195/** 193/**
196 * create_io_context - try to create task->io_context 194 * create_io_context - try to create task->io_context
197 * @task: target task
198 * @gfp_mask: allocation mask 195 * @gfp_mask: allocation mask
199 * @node: allocation node 196 * @node: allocation node
200 * 197 *
201 * If @task->io_context is %NULL, allocate a new io_context and install it. 198 * If %current->io_context is %NULL, allocate a new io_context and install
202 * Returns the current @task->io_context which may be %NULL if allocation 199 * it. Returns the current %current->io_context which may be %NULL if
203 * failed. 200 * allocation failed.
204 * 201 *
205 * Note that this function can't be called with IRQ disabled because 202 * Note that this function can't be called with IRQ disabled because
206 * task_lock which protects @task->io_context is IRQ-unsafe. 203 * task_lock which protects %current->io_context is IRQ-unsafe.
207 */ 204 */
208static inline struct io_context *create_io_context(struct task_struct *task, 205static inline struct io_context *create_io_context(gfp_t gfp_mask, int node)
209 gfp_t gfp_mask, int node)
210{ 206{
211 WARN_ON_ONCE(irqs_disabled()); 207 WARN_ON_ONCE(irqs_disabled());
212 if (unlikely(!task->io_context)) 208 if (unlikely(!current->io_context))
213 create_io_context_slowpath(task, gfp_mask, node); 209 create_task_io_context(current, gfp_mask, node);
214 return task->io_context; 210 return current->io_context;
215} 211}
216 212
217/* 213/*
@@ -222,7 +218,6 @@ extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio);
222extern void blk_throtl_drain(struct request_queue *q); 218extern void blk_throtl_drain(struct request_queue *q);
223extern int blk_throtl_init(struct request_queue *q); 219extern int blk_throtl_init(struct request_queue *q);
224extern void blk_throtl_exit(struct request_queue *q); 220extern void blk_throtl_exit(struct request_queue *q);
225extern void blk_throtl_release(struct request_queue *q);
226#else /* CONFIG_BLK_DEV_THROTTLING */ 221#else /* CONFIG_BLK_DEV_THROTTLING */
227static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio) 222static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
228{ 223{
@@ -231,7 +226,6 @@ static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
231static inline void blk_throtl_drain(struct request_queue *q) { } 226static inline void blk_throtl_drain(struct request_queue *q) { }
232static inline int blk_throtl_init(struct request_queue *q) { return 0; } 227static inline int blk_throtl_init(struct request_queue *q) { return 0; }
233static inline void blk_throtl_exit(struct request_queue *q) { } 228static inline void blk_throtl_exit(struct request_queue *q) { }
234static inline void blk_throtl_release(struct request_queue *q) { }
235#endif /* CONFIG_BLK_DEV_THROTTLING */ 229#endif /* CONFIG_BLK_DEV_THROTTLING */
236 230
237#endif /* BLK_INTERNAL_H */ 231#endif /* BLK_INTERNAL_H */
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 3c38536bd52c..673c977cc2bf 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -15,7 +15,9 @@
15#include <linux/ioprio.h> 15#include <linux/ioprio.h>
16#include <linux/blktrace_api.h> 16#include <linux/blktrace_api.h>
17#include "blk.h" 17#include "blk.h"
18#include "cfq.h" 18#include "blk-cgroup.h"
19
20static struct blkcg_policy blkcg_policy_cfq __maybe_unused;
19 21
20/* 22/*
21 * tunables 23 * tunables
@@ -171,8 +173,53 @@ enum wl_type_t {
171 SYNC_WORKLOAD = 2 173 SYNC_WORKLOAD = 2
172}; 174};
173 175
176struct cfqg_stats {
177#ifdef CONFIG_CFQ_GROUP_IOSCHED
178 /* total bytes transferred */
179 struct blkg_rwstat service_bytes;
180 /* total IOs serviced, post merge */
181 struct blkg_rwstat serviced;
182 /* number of ios merged */
183 struct blkg_rwstat merged;
184 /* total time spent on device in ns, may not be accurate w/ queueing */
185 struct blkg_rwstat service_time;
186 /* total time spent waiting in scheduler queue in ns */
187 struct blkg_rwstat wait_time;
188 /* number of IOs queued up */
189 struct blkg_rwstat queued;
190 /* total sectors transferred */
191 struct blkg_stat sectors;
192 /* total disk time and nr sectors dispatched by this group */
193 struct blkg_stat time;
194#ifdef CONFIG_DEBUG_BLK_CGROUP
195 /* time not charged to this cgroup */
196 struct blkg_stat unaccounted_time;
197 /* sum of number of ios queued across all samples */
198 struct blkg_stat avg_queue_size_sum;
199 /* count of samples taken for average */
200 struct blkg_stat avg_queue_size_samples;
201 /* how many times this group has been removed from service tree */
202 struct blkg_stat dequeue;
203 /* total time spent waiting for it to be assigned a timeslice. */
204 struct blkg_stat group_wait_time;
205 /* time spent idling for this blkcg_gq */
206 struct blkg_stat idle_time;
207 /* total time with empty current active q with other requests queued */
208 struct blkg_stat empty_time;
209 /* fields after this shouldn't be cleared on stat reset */
210 uint64_t start_group_wait_time;
211 uint64_t start_idle_time;
212 uint64_t start_empty_time;
213 uint16_t flags;
214#endif /* CONFIG_DEBUG_BLK_CGROUP */
215#endif /* CONFIG_CFQ_GROUP_IOSCHED */
216};
217
174/* This is per cgroup per device grouping structure */ 218/* This is per cgroup per device grouping structure */
175struct cfq_group { 219struct cfq_group {
220 /* must be the first member */
221 struct blkg_policy_data pd;
222
176 /* group service_tree member */ 223 /* group service_tree member */
177 struct rb_node rb_node; 224 struct rb_node rb_node;
178 225
@@ -180,7 +227,7 @@ struct cfq_group {
180 u64 vdisktime; 227 u64 vdisktime;
181 unsigned int weight; 228 unsigned int weight;
182 unsigned int new_weight; 229 unsigned int new_weight;
183 bool needs_update; 230 unsigned int dev_weight;
184 231
185 /* number of cfqq currently on this group */ 232 /* number of cfqq currently on this group */
186 int nr_cfqq; 233 int nr_cfqq;
@@ -206,20 +253,21 @@ struct cfq_group {
206 unsigned long saved_workload_slice; 253 unsigned long saved_workload_slice;
207 enum wl_type_t saved_workload; 254 enum wl_type_t saved_workload;
208 enum wl_prio_t saved_serving_prio; 255 enum wl_prio_t saved_serving_prio;
209 struct blkio_group blkg; 256
210#ifdef CONFIG_CFQ_GROUP_IOSCHED
211 struct hlist_node cfqd_node;
212 int ref;
213#endif
214 /* number of requests that are on the dispatch list or inside driver */ 257 /* number of requests that are on the dispatch list or inside driver */
215 int dispatched; 258 int dispatched;
216 struct cfq_ttime ttime; 259 struct cfq_ttime ttime;
260 struct cfqg_stats stats;
217}; 261};
218 262
219struct cfq_io_cq { 263struct cfq_io_cq {
220 struct io_cq icq; /* must be the first member */ 264 struct io_cq icq; /* must be the first member */
221 struct cfq_queue *cfqq[2]; 265 struct cfq_queue *cfqq[2];
222 struct cfq_ttime ttime; 266 struct cfq_ttime ttime;
267 int ioprio; /* the current ioprio */
268#ifdef CONFIG_CFQ_GROUP_IOSCHED
269 uint64_t blkcg_id; /* the current blkcg ID */
270#endif
223}; 271};
224 272
225/* 273/*
@@ -229,7 +277,7 @@ struct cfq_data {
229 struct request_queue *queue; 277 struct request_queue *queue;
230 /* Root service tree for cfq_groups */ 278 /* Root service tree for cfq_groups */
231 struct cfq_rb_root grp_service_tree; 279 struct cfq_rb_root grp_service_tree;
232 struct cfq_group root_group; 280 struct cfq_group *root_group;
233 281
234 /* 282 /*
235 * The priority currently being served 283 * The priority currently being served
@@ -303,12 +351,6 @@ struct cfq_data {
303 struct cfq_queue oom_cfqq; 351 struct cfq_queue oom_cfqq;
304 352
305 unsigned long last_delayed_sync; 353 unsigned long last_delayed_sync;
306
307 /* List of cfq groups being managed on this device*/
308 struct hlist_head cfqg_list;
309
310 /* Number of groups which are on blkcg->blkg_list */
311 unsigned int nr_blkcg_linked_grps;
312}; 354};
313 355
314static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); 356static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
@@ -371,21 +413,284 @@ CFQ_CFQQ_FNS(deep);
371CFQ_CFQQ_FNS(wait_busy); 413CFQ_CFQQ_FNS(wait_busy);
372#undef CFQ_CFQQ_FNS 414#undef CFQ_CFQQ_FNS
373 415
416static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
417{
418 return pd ? container_of(pd, struct cfq_group, pd) : NULL;
419}
420
421static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
422{
423 return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
424}
425
426static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
427{
428 return pd_to_blkg(&cfqg->pd);
429}
430
431#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
432
433/* cfqg stats flags */
434enum cfqg_stats_flags {
435 CFQG_stats_waiting = 0,
436 CFQG_stats_idling,
437 CFQG_stats_empty,
438};
439
440#define CFQG_FLAG_FNS(name) \
441static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats) \
442{ \
443 stats->flags |= (1 << CFQG_stats_##name); \
444} \
445static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats) \
446{ \
447 stats->flags &= ~(1 << CFQG_stats_##name); \
448} \
449static inline int cfqg_stats_##name(struct cfqg_stats *stats) \
450{ \
451 return (stats->flags & (1 << CFQG_stats_##name)) != 0; \
452} \
453
454CFQG_FLAG_FNS(waiting)
455CFQG_FLAG_FNS(idling)
456CFQG_FLAG_FNS(empty)
457#undef CFQG_FLAG_FNS
458
459/* This should be called with the queue_lock held. */
460static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats)
461{
462 unsigned long long now;
463
464 if (!cfqg_stats_waiting(stats))
465 return;
466
467 now = sched_clock();
468 if (time_after64(now, stats->start_group_wait_time))
469 blkg_stat_add(&stats->group_wait_time,
470 now - stats->start_group_wait_time);
471 cfqg_stats_clear_waiting(stats);
472}
473
474/* This should be called with the queue_lock held. */
475static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,
476 struct cfq_group *curr_cfqg)
477{
478 struct cfqg_stats *stats = &cfqg->stats;
479
480 if (cfqg_stats_waiting(stats))
481 return;
482 if (cfqg == curr_cfqg)
483 return;
484 stats->start_group_wait_time = sched_clock();
485 cfqg_stats_mark_waiting(stats);
486}
487
488/* This should be called with the queue_lock held. */
489static void cfqg_stats_end_empty_time(struct cfqg_stats *stats)
490{
491 unsigned long long now;
492
493 if (!cfqg_stats_empty(stats))
494 return;
495
496 now = sched_clock();
497 if (time_after64(now, stats->start_empty_time))
498 blkg_stat_add(&stats->empty_time,
499 now - stats->start_empty_time);
500 cfqg_stats_clear_empty(stats);
501}
502
503static void cfqg_stats_update_dequeue(struct cfq_group *cfqg)
504{
505 blkg_stat_add(&cfqg->stats.dequeue, 1);
506}
507
508static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
509{
510 struct cfqg_stats *stats = &cfqg->stats;
511
512 if (blkg_rwstat_sum(&stats->queued))
513 return;
514
515 /*
516 * group is already marked empty. This can happen if cfqq got new
517 * request in parent group and moved to this group while being added
518 * to service tree. Just ignore the event and move on.
519 */
520 if (cfqg_stats_empty(stats))
521 return;
522
523 stats->start_empty_time = sched_clock();
524 cfqg_stats_mark_empty(stats);
525}
526
527static void cfqg_stats_update_idle_time(struct cfq_group *cfqg)
528{
529 struct cfqg_stats *stats = &cfqg->stats;
530
531 if (cfqg_stats_idling(stats)) {
532 unsigned long long now = sched_clock();
533
534 if (time_after64(now, stats->start_idle_time))
535 blkg_stat_add(&stats->idle_time,
536 now - stats->start_idle_time);
537 cfqg_stats_clear_idling(stats);
538 }
539}
540
541static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg)
542{
543 struct cfqg_stats *stats = &cfqg->stats;
544
545 BUG_ON(cfqg_stats_idling(stats));
546
547 stats->start_idle_time = sched_clock();
548 cfqg_stats_mark_idling(stats);
549}
550
551static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)
552{
553 struct cfqg_stats *stats = &cfqg->stats;
554
555 blkg_stat_add(&stats->avg_queue_size_sum,
556 blkg_rwstat_sum(&stats->queued));
557 blkg_stat_add(&stats->avg_queue_size_samples, 1);
558 cfqg_stats_update_group_wait_time(stats);
559}
560
561#else /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
562
563static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, struct cfq_group *curr_cfqg) { }
564static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { }
565static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { }
566static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { }
567static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { }
568static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { }
569static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }
570
571#endif /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
572
374#ifdef CONFIG_CFQ_GROUP_IOSCHED 573#ifdef CONFIG_CFQ_GROUP_IOSCHED
375#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 574
575static inline void cfqg_get(struct cfq_group *cfqg)
576{
577 return blkg_get(cfqg_to_blkg(cfqg));
578}
579
580static inline void cfqg_put(struct cfq_group *cfqg)
581{
582 return blkg_put(cfqg_to_blkg(cfqg));
583}
584
585#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) do { \
586 char __pbuf[128]; \
587 \
588 blkg_path(cfqg_to_blkg((cfqq)->cfqg), __pbuf, sizeof(__pbuf)); \
376 blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ 589 blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
377 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ 590 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
378 blkg_path(&(cfqq)->cfqg->blkg), ##args) 591 __pbuf, ##args); \
592} while (0)
379 593
380#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \ 594#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do { \
381 blk_add_trace_msg((cfqd)->queue, "%s " fmt, \ 595 char __pbuf[128]; \
382 blkg_path(&(cfqg)->blkg), ##args) \ 596 \
597 blkg_path(cfqg_to_blkg(cfqg), __pbuf, sizeof(__pbuf)); \
598 blk_add_trace_msg((cfqd)->queue, "%s " fmt, __pbuf, ##args); \
599} while (0)
600
601static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
602 struct cfq_group *curr_cfqg, int rw)
603{
604 blkg_rwstat_add(&cfqg->stats.queued, rw, 1);
605 cfqg_stats_end_empty_time(&cfqg->stats);
606 cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg);
607}
608
609static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
610 unsigned long time, unsigned long unaccounted_time)
611{
612 blkg_stat_add(&cfqg->stats.time, time);
613#ifdef CONFIG_DEBUG_BLK_CGROUP
614 blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time);
615#endif
616}
617
618static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw)
619{
620 blkg_rwstat_add(&cfqg->stats.queued, rw, -1);
621}
622
623static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw)
624{
625 blkg_rwstat_add(&cfqg->stats.merged, rw, 1);
626}
627
628static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
629 uint64_t bytes, int rw)
630{
631 blkg_stat_add(&cfqg->stats.sectors, bytes >> 9);
632 blkg_rwstat_add(&cfqg->stats.serviced, rw, 1);
633 blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes);
634}
635
636static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
637 uint64_t start_time, uint64_t io_start_time, int rw)
638{
639 struct cfqg_stats *stats = &cfqg->stats;
640 unsigned long long now = sched_clock();
641
642 if (time_after64(now, io_start_time))
643 blkg_rwstat_add(&stats->service_time, rw, now - io_start_time);
644 if (time_after64(io_start_time, start_time))
645 blkg_rwstat_add(&stats->wait_time, rw,
646 io_start_time - start_time);
647}
648
649static void cfq_pd_reset_stats(struct blkcg_gq *blkg)
650{
651 struct cfq_group *cfqg = blkg_to_cfqg(blkg);
652 struct cfqg_stats *stats = &cfqg->stats;
653
654 /* queued stats shouldn't be cleared */
655 blkg_rwstat_reset(&stats->service_bytes);
656 blkg_rwstat_reset(&stats->serviced);
657 blkg_rwstat_reset(&stats->merged);
658 blkg_rwstat_reset(&stats->service_time);
659 blkg_rwstat_reset(&stats->wait_time);
660 blkg_stat_reset(&stats->time);
661#ifdef CONFIG_DEBUG_BLK_CGROUP
662 blkg_stat_reset(&stats->unaccounted_time);
663 blkg_stat_reset(&stats->avg_queue_size_sum);
664 blkg_stat_reset(&stats->avg_queue_size_samples);
665 blkg_stat_reset(&stats->dequeue);
666 blkg_stat_reset(&stats->group_wait_time);
667 blkg_stat_reset(&stats->idle_time);
668 blkg_stat_reset(&stats->empty_time);
669#endif
670}
671
672#else /* CONFIG_CFQ_GROUP_IOSCHED */
673
674static inline void cfqg_get(struct cfq_group *cfqg) { }
675static inline void cfqg_put(struct cfq_group *cfqg) { }
383 676
384#else
385#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 677#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
386 blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) 678 blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
387#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0) 679#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0)
388#endif 680
681static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
682 struct cfq_group *curr_cfqg, int rw) { }
683static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
684 unsigned long time, unsigned long unaccounted_time) { }
685static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { }
686static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { }
687static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
688 uint64_t bytes, int rw) { }
689static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
690 uint64_t start_time, uint64_t io_start_time, int rw) { }
691
692#endif /* CONFIG_CFQ_GROUP_IOSCHED */
693
389#define cfq_log(cfqd, fmt, args...) \ 694#define cfq_log(cfqd, fmt, args...) \
390 blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) 695 blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
391 696
@@ -466,8 +771,9 @@ static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
466} 771}
467 772
468static void cfq_dispatch_insert(struct request_queue *, struct request *); 773static void cfq_dispatch_insert(struct request_queue *, struct request *);
469static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool, 774static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync,
470 struct io_context *, gfp_t); 775 struct cfq_io_cq *cic, struct bio *bio,
776 gfp_t gfp_mask);
471 777
472static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq) 778static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
473{ 779{
@@ -545,7 +851,7 @@ static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
545{ 851{
546 u64 d = delta << CFQ_SERVICE_SHIFT; 852 u64 d = delta << CFQ_SERVICE_SHIFT;
547 853
548 d = d * BLKIO_WEIGHT_DEFAULT; 854 d = d * CFQ_WEIGHT_DEFAULT;
549 do_div(d, cfqg->weight); 855 do_div(d, cfqg->weight);
550 return d; 856 return d;
551} 857}
@@ -872,9 +1178,9 @@ static void
872cfq_update_group_weight(struct cfq_group *cfqg) 1178cfq_update_group_weight(struct cfq_group *cfqg)
873{ 1179{
874 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); 1180 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
875 if (cfqg->needs_update) { 1181 if (cfqg->new_weight) {
876 cfqg->weight = cfqg->new_weight; 1182 cfqg->weight = cfqg->new_weight;
877 cfqg->needs_update = false; 1183 cfqg->new_weight = 0;
878 } 1184 }
879} 1185}
880 1186
@@ -936,7 +1242,7 @@ cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
936 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); 1242 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
937 cfq_group_service_tree_del(st, cfqg); 1243 cfq_group_service_tree_del(st, cfqg);
938 cfqg->saved_workload_slice = 0; 1244 cfqg->saved_workload_slice = 0;
939 cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1); 1245 cfqg_stats_update_dequeue(cfqg);
940} 1246}
941 1247
942static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq, 1248static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
@@ -1008,178 +1314,59 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
1008 "sl_used=%u disp=%u charge=%u iops=%u sect=%lu", 1314 "sl_used=%u disp=%u charge=%u iops=%u sect=%lu",
1009 used_sl, cfqq->slice_dispatch, charge, 1315 used_sl, cfqq->slice_dispatch, charge,
1010 iops_mode(cfqd), cfqq->nr_sectors); 1316 iops_mode(cfqd), cfqq->nr_sectors);
1011 cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl, 1317 cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl);
1012 unaccounted_sl); 1318 cfqg_stats_set_start_empty_time(cfqg);
1013 cfq_blkiocg_set_start_empty_time(&cfqg->blkg);
1014} 1319}
1015 1320
1016#ifdef CONFIG_CFQ_GROUP_IOSCHED 1321/**
1017static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) 1322 * cfq_init_cfqg_base - initialize base part of a cfq_group
1018{ 1323 * @cfqg: cfq_group to initialize
1019 if (blkg) 1324 *
1020 return container_of(blkg, struct cfq_group, blkg); 1325 * Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED
1021 return NULL; 1326 * is enabled or not.
1022}
1023
1024static void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
1025 unsigned int weight)
1026{
1027 struct cfq_group *cfqg = cfqg_of_blkg(blkg);
1028 cfqg->new_weight = weight;
1029 cfqg->needs_update = true;
1030}
1031
1032static void cfq_init_add_cfqg_lists(struct cfq_data *cfqd,
1033 struct cfq_group *cfqg, struct blkio_cgroup *blkcg)
1034{
1035 struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
1036 unsigned int major, minor;
1037
1038 /*
1039 * Add group onto cgroup list. It might happen that bdi->dev is
1040 * not initialized yet. Initialize this new group without major
1041 * and minor info and this info will be filled in once a new thread
1042 * comes for IO.
1043 */
1044 if (bdi->dev) {
1045 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
1046 cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
1047 (void *)cfqd, MKDEV(major, minor));
1048 } else
1049 cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
1050 (void *)cfqd, 0);
1051
1052 cfqd->nr_blkcg_linked_grps++;
1053 cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
1054
1055 /* Add group on cfqd list */
1056 hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
1057}
1058
1059/*
1060 * Should be called from sleepable context. No request queue lock as per
1061 * cpu stats are allocated dynamically and alloc_percpu needs to be called
1062 * from sleepable context.
1063 */ 1327 */
1064static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd) 1328static void cfq_init_cfqg_base(struct cfq_group *cfqg)
1065{ 1329{
1066 struct cfq_group *cfqg = NULL;
1067 int i, j, ret;
1068 struct cfq_rb_root *st; 1330 struct cfq_rb_root *st;
1069 1331 int i, j;
1070 cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
1071 if (!cfqg)
1072 return NULL;
1073 1332
1074 for_each_cfqg_st(cfqg, i, j, st) 1333 for_each_cfqg_st(cfqg, i, j, st)
1075 *st = CFQ_RB_ROOT; 1334 *st = CFQ_RB_ROOT;
1076 RB_CLEAR_NODE(&cfqg->rb_node); 1335 RB_CLEAR_NODE(&cfqg->rb_node);
1077 1336
1078 cfqg->ttime.last_end_request = jiffies; 1337 cfqg->ttime.last_end_request = jiffies;
1079
1080 /*
1081 * Take the initial reference that will be released on destroy
1082 * This can be thought of a joint reference by cgroup and
1083 * elevator which will be dropped by either elevator exit
1084 * or cgroup deletion path depending on who is exiting first.
1085 */
1086 cfqg->ref = 1;
1087
1088 ret = blkio_alloc_blkg_stats(&cfqg->blkg);
1089 if (ret) {
1090 kfree(cfqg);
1091 return NULL;
1092 }
1093
1094 return cfqg;
1095} 1338}
1096 1339
1097static struct cfq_group * 1340#ifdef CONFIG_CFQ_GROUP_IOSCHED
1098cfq_find_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg) 1341static void cfq_pd_init(struct blkcg_gq *blkg)
1099{ 1342{
1100 struct cfq_group *cfqg = NULL; 1343 struct cfq_group *cfqg = blkg_to_cfqg(blkg);
1101 void *key = cfqd;
1102 struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
1103 unsigned int major, minor;
1104
1105 /*
1106 * This is the common case when there are no blkio cgroups.
1107 * Avoid lookup in this case
1108 */
1109 if (blkcg == &blkio_root_cgroup)
1110 cfqg = &cfqd->root_group;
1111 else
1112 cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
1113
1114 if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
1115 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
1116 cfqg->blkg.dev = MKDEV(major, minor);
1117 }
1118 1344
1119 return cfqg; 1345 cfq_init_cfqg_base(cfqg);
1346 cfqg->weight = blkg->blkcg->cfq_weight;
1120} 1347}
1121 1348
1122/* 1349/*
1123 * Search for the cfq group current task belongs to. request_queue lock must 1350 * Search for the cfq group current task belongs to. request_queue lock must
1124 * be held. 1351 * be held.
1125 */ 1352 */
1126static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd) 1353static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
1354 struct blkcg *blkcg)
1127{ 1355{
1128 struct blkio_cgroup *blkcg;
1129 struct cfq_group *cfqg = NULL, *__cfqg = NULL;
1130 struct request_queue *q = cfqd->queue; 1356 struct request_queue *q = cfqd->queue;
1357 struct cfq_group *cfqg = NULL;
1131 1358
1132 rcu_read_lock(); 1359 /* avoid lookup for the common case where there's no blkcg */
1133 blkcg = task_blkio_cgroup(current); 1360 if (blkcg == &blkcg_root) {
1134 cfqg = cfq_find_cfqg(cfqd, blkcg); 1361 cfqg = cfqd->root_group;
1135 if (cfqg) { 1362 } else {
1136 rcu_read_unlock(); 1363 struct blkcg_gq *blkg;
1137 return cfqg;
1138 }
1139
1140 /*
1141 * Need to allocate a group. Allocation of group also needs allocation
1142 * of per cpu stats which in-turn takes a mutex() and can block. Hence
1143 * we need to drop rcu lock and queue_lock before we call alloc.
1144 *
1145 * Not taking any queue reference here and assuming that queue is
1146 * around by the time we return. CFQ queue allocation code does
1147 * the same. It might be racy though.
1148 */
1149
1150 rcu_read_unlock();
1151 spin_unlock_irq(q->queue_lock);
1152
1153 cfqg = cfq_alloc_cfqg(cfqd);
1154
1155 spin_lock_irq(q->queue_lock);
1156
1157 rcu_read_lock();
1158 blkcg = task_blkio_cgroup(current);
1159
1160 /*
1161 * If some other thread already allocated the group while we were
1162 * not holding queue lock, free up the group
1163 */
1164 __cfqg = cfq_find_cfqg(cfqd, blkcg);
1165 1364
1166 if (__cfqg) { 1365 blkg = blkg_lookup_create(blkcg, q);
1167 kfree(cfqg); 1366 if (!IS_ERR(blkg))
1168 rcu_read_unlock(); 1367 cfqg = blkg_to_cfqg(blkg);
1169 return __cfqg;
1170 } 1368 }
1171 1369
1172 if (!cfqg)
1173 cfqg = &cfqd->root_group;
1174
1175 cfq_init_add_cfqg_lists(cfqd, cfqg, blkcg);
1176 rcu_read_unlock();
1177 return cfqg;
1178}
1179
1180static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
1181{
1182 cfqg->ref++;
1183 return cfqg; 1370 return cfqg;
1184} 1371}
1185 1372
@@ -1187,94 +1374,224 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
1187{ 1374{
1188 /* Currently, all async queues are mapped to root group */ 1375 /* Currently, all async queues are mapped to root group */
1189 if (!cfq_cfqq_sync(cfqq)) 1376 if (!cfq_cfqq_sync(cfqq))
1190 cfqg = &cfqq->cfqd->root_group; 1377 cfqg = cfqq->cfqd->root_group;
1191 1378
1192 cfqq->cfqg = cfqg; 1379 cfqq->cfqg = cfqg;
1193 /* cfqq reference on cfqg */ 1380 /* cfqq reference on cfqg */
1194 cfqq->cfqg->ref++; 1381 cfqg_get(cfqg);
1195} 1382}
1196 1383
1197static void cfq_put_cfqg(struct cfq_group *cfqg) 1384static u64 cfqg_prfill_weight_device(struct seq_file *sf,
1385 struct blkg_policy_data *pd, int off)
1198{ 1386{
1199 struct cfq_rb_root *st; 1387 struct cfq_group *cfqg = pd_to_cfqg(pd);
1200 int i, j;
1201 1388
1202 BUG_ON(cfqg->ref <= 0); 1389 if (!cfqg->dev_weight)
1203 cfqg->ref--; 1390 return 0;
1204 if (cfqg->ref) 1391 return __blkg_prfill_u64(sf, pd, cfqg->dev_weight);
1205 return;
1206 for_each_cfqg_st(cfqg, i, j, st)
1207 BUG_ON(!RB_EMPTY_ROOT(&st->rb));
1208 free_percpu(cfqg->blkg.stats_cpu);
1209 kfree(cfqg);
1210} 1392}
1211 1393
1212static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg) 1394static int cfqg_print_weight_device(struct cgroup *cgrp, struct cftype *cft,
1395 struct seq_file *sf)
1213{ 1396{
1214 /* Something wrong if we are trying to remove same group twice */ 1397 blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp),
1215 BUG_ON(hlist_unhashed(&cfqg->cfqd_node)); 1398 cfqg_prfill_weight_device, &blkcg_policy_cfq, 0,
1399 false);
1400 return 0;
1401}
1216 1402
1217 hlist_del_init(&cfqg->cfqd_node); 1403static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft,
1404 struct seq_file *sf)
1405{
1406 seq_printf(sf, "%u\n", cgroup_to_blkcg(cgrp)->cfq_weight);
1407 return 0;
1408}
1218 1409
1219 BUG_ON(cfqd->nr_blkcg_linked_grps <= 0); 1410static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
1220 cfqd->nr_blkcg_linked_grps--; 1411 const char *buf)
1412{
1413 struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1414 struct blkg_conf_ctx ctx;
1415 struct cfq_group *cfqg;
1416 int ret;
1221 1417
1222 /* 1418 ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx);
1223 * Put the reference taken at the time of creation so that when all 1419 if (ret)
1224 * queues are gone, group can be destroyed. 1420 return ret;
1225 */ 1421
1226 cfq_put_cfqg(cfqg); 1422 ret = -EINVAL;
1423 cfqg = blkg_to_cfqg(ctx.blkg);
1424 if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) {
1425 cfqg->dev_weight = ctx.v;
1426 cfqg->new_weight = cfqg->dev_weight ?: blkcg->cfq_weight;
1427 ret = 0;
1428 }
1429
1430 blkg_conf_finish(&ctx);
1431 return ret;
1227} 1432}
1228 1433
1229static void cfq_release_cfq_groups(struct cfq_data *cfqd) 1434static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
1230{ 1435{
1231 struct hlist_node *pos, *n; 1436 struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1232 struct cfq_group *cfqg; 1437 struct blkcg_gq *blkg;
1438 struct hlist_node *n;
1233 1439
1234 hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) { 1440 if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX)
1235 /* 1441 return -EINVAL;
1236 * If cgroup removal path got to blk_group first and removed 1442
1237 * it from cgroup list, then it will take care of destroying 1443 spin_lock_irq(&blkcg->lock);
1238 * cfqg also. 1444 blkcg->cfq_weight = (unsigned int)val;
1239 */ 1445
1240 if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg)) 1446 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
1241 cfq_destroy_cfqg(cfqd, cfqg); 1447 struct cfq_group *cfqg = blkg_to_cfqg(blkg);
1448
1449 if (cfqg && !cfqg->dev_weight)
1450 cfqg->new_weight = blkcg->cfq_weight;
1242 } 1451 }
1452
1453 spin_unlock_irq(&blkcg->lock);
1454 return 0;
1243} 1455}
1244 1456
1245/* 1457static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft,
1246 * Blk cgroup controller notification saying that blkio_group object is being 1458 struct seq_file *sf)
1247 * delinked as associated cgroup object is going away. That also means that
1248 * no new IO will come in this group. So get rid of this group as soon as
1249 * any pending IO in the group is finished.
1250 *
1251 * This function is called under rcu_read_lock(). key is the rcu protected
1252 * pointer. That means "key" is a valid cfq_data pointer as long as we are rcu
1253 * read lock.
1254 *
1255 * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
1256 * it should not be NULL as even if elevator was exiting, cgroup deltion
1257 * path got to it first.
1258 */
1259static void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg)
1260{ 1459{
1261 unsigned long flags; 1460 struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1262 struct cfq_data *cfqd = key;
1263 1461
1264 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 1462 blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkcg_policy_cfq,
1265 cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg)); 1463 cft->private, false);
1266 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 1464 return 0;
1267} 1465}
1268 1466
1269#else /* GROUP_IOSCHED */ 1467static int cfqg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
1270static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd) 1468 struct seq_file *sf)
1271{ 1469{
1272 return &cfqd->root_group; 1470 struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1471
1472 blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkcg_policy_cfq,
1473 cft->private, true);
1474 return 0;
1273} 1475}
1274 1476
1275static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) 1477#ifdef CONFIG_DEBUG_BLK_CGROUP
1478static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
1479 struct blkg_policy_data *pd, int off)
1276{ 1480{
1277 return cfqg; 1481 struct cfq_group *cfqg = pd_to_cfqg(pd);
1482 u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples);
1483 u64 v = 0;
1484
1485 if (samples) {
1486 v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum);
1487 do_div(v, samples);
1488 }
1489 __blkg_prfill_u64(sf, pd, v);
1490 return 0;
1491}
1492
1493/* print avg_queue_size */
1494static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft,
1495 struct seq_file *sf)
1496{
1497 struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1498
1499 blkcg_print_blkgs(sf, blkcg, cfqg_prfill_avg_queue_size,
1500 &blkcg_policy_cfq, 0, false);
1501 return 0;
1502}
1503#endif /* CONFIG_DEBUG_BLK_CGROUP */
1504
1505static struct cftype cfq_blkcg_files[] = {
1506 {
1507 .name = "weight_device",
1508 .read_seq_string = cfqg_print_weight_device,
1509 .write_string = cfqg_set_weight_device,
1510 .max_write_len = 256,
1511 },
1512 {
1513 .name = "weight",
1514 .read_seq_string = cfq_print_weight,
1515 .write_u64 = cfq_set_weight,
1516 },
1517 {
1518 .name = "time",
1519 .private = offsetof(struct cfq_group, stats.time),
1520 .read_seq_string = cfqg_print_stat,
1521 },
1522 {
1523 .name = "sectors",
1524 .private = offsetof(struct cfq_group, stats.sectors),
1525 .read_seq_string = cfqg_print_stat,
1526 },
1527 {
1528 .name = "io_service_bytes",
1529 .private = offsetof(struct cfq_group, stats.service_bytes),
1530 .read_seq_string = cfqg_print_rwstat,
1531 },
1532 {
1533 .name = "io_serviced",
1534 .private = offsetof(struct cfq_group, stats.serviced),
1535 .read_seq_string = cfqg_print_rwstat,
1536 },
1537 {
1538 .name = "io_service_time",
1539 .private = offsetof(struct cfq_group, stats.service_time),
1540 .read_seq_string = cfqg_print_rwstat,
1541 },
1542 {
1543 .name = "io_wait_time",
1544 .private = offsetof(struct cfq_group, stats.wait_time),
1545 .read_seq_string = cfqg_print_rwstat,
1546 },
1547 {
1548 .name = "io_merged",
1549 .private = offsetof(struct cfq_group, stats.merged),
1550 .read_seq_string = cfqg_print_rwstat,
1551 },
1552 {
1553 .name = "io_queued",
1554 .private = offsetof(struct cfq_group, stats.queued),
1555 .read_seq_string = cfqg_print_rwstat,
1556 },
1557#ifdef CONFIG_DEBUG_BLK_CGROUP
1558 {
1559 .name = "avg_queue_size",
1560 .read_seq_string = cfqg_print_avg_queue_size,
1561 },
1562 {
1563 .name = "group_wait_time",
1564 .private = offsetof(struct cfq_group, stats.group_wait_time),
1565 .read_seq_string = cfqg_print_stat,
1566 },
1567 {
1568 .name = "idle_time",
1569 .private = offsetof(struct cfq_group, stats.idle_time),
1570 .read_seq_string = cfqg_print_stat,
1571 },
1572 {
1573 .name = "empty_time",
1574 .private = offsetof(struct cfq_group, stats.empty_time),
1575 .read_seq_string = cfqg_print_stat,
1576 },
1577 {
1578 .name = "dequeue",
1579 .private = offsetof(struct cfq_group, stats.dequeue),
1580 .read_seq_string = cfqg_print_stat,
1581 },
1582 {
1583 .name = "unaccounted_time",
1584 .private = offsetof(struct cfq_group, stats.unaccounted_time),
1585 .read_seq_string = cfqg_print_stat,
1586 },
1587#endif /* CONFIG_DEBUG_BLK_CGROUP */
1588 { } /* terminate */
1589};
1590#else /* GROUP_IOSCHED */
1591static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
1592 struct blkcg *blkcg)
1593{
1594 return cfqd->root_group;
1278} 1595}
1279 1596
1280static inline void 1597static inline void
@@ -1282,9 +1599,6 @@ cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
1282 cfqq->cfqg = cfqg; 1599 cfqq->cfqg = cfqg;
1283} 1600}
1284 1601
1285static void cfq_release_cfq_groups(struct cfq_data *cfqd) {}
1286static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}
1287
1288#endif /* GROUP_IOSCHED */ 1602#endif /* GROUP_IOSCHED */
1289 1603
1290/* 1604/*
@@ -1551,12 +1865,10 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
1551{ 1865{
1552 elv_rb_del(&cfqq->sort_list, rq); 1866 elv_rb_del(&cfqq->sort_list, rq);
1553 cfqq->queued[rq_is_sync(rq)]--; 1867 cfqq->queued[rq_is_sync(rq)]--;
1554 cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, 1868 cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
1555 rq_data_dir(rq), rq_is_sync(rq));
1556 cfq_add_rq_rb(rq); 1869 cfq_add_rq_rb(rq);
1557 cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg, 1870 cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group,
1558 &cfqq->cfqd->serving_group->blkg, rq_data_dir(rq), 1871 rq->cmd_flags);
1559 rq_is_sync(rq));
1560} 1872}
1561 1873
1562static struct request * 1874static struct request *
@@ -1612,8 +1924,7 @@ static void cfq_remove_request(struct request *rq)
1612 cfq_del_rq_rb(rq); 1924 cfq_del_rq_rb(rq);
1613 1925
1614 cfqq->cfqd->rq_queued--; 1926 cfqq->cfqd->rq_queued--;
1615 cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, 1927 cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
1616 rq_data_dir(rq), rq_is_sync(rq));
1617 if (rq->cmd_flags & REQ_PRIO) { 1928 if (rq->cmd_flags & REQ_PRIO) {
1618 WARN_ON(!cfqq->prio_pending); 1929 WARN_ON(!cfqq->prio_pending);
1619 cfqq->prio_pending--; 1930 cfqq->prio_pending--;
@@ -1648,8 +1959,7 @@ static void cfq_merged_request(struct request_queue *q, struct request *req,
1648static void cfq_bio_merged(struct request_queue *q, struct request *req, 1959static void cfq_bio_merged(struct request_queue *q, struct request *req,
1649 struct bio *bio) 1960 struct bio *bio)
1650{ 1961{
1651 cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg, 1962 cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_rw);
1652 bio_data_dir(bio), cfq_bio_sync(bio));
1653} 1963}
1654 1964
1655static void 1965static void
@@ -1671,8 +1981,7 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
1671 if (cfqq->next_rq == next) 1981 if (cfqq->next_rq == next)
1672 cfqq->next_rq = rq; 1982 cfqq->next_rq = rq;
1673 cfq_remove_request(next); 1983 cfq_remove_request(next);
1674 cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg, 1984 cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags);
1675 rq_data_dir(next), rq_is_sync(next));
1676 1985
1677 cfqq = RQ_CFQQ(next); 1986 cfqq = RQ_CFQQ(next);
1678 /* 1987 /*
@@ -1713,7 +2022,7 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq,
1713static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2022static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1714{ 2023{
1715 del_timer(&cfqd->idle_slice_timer); 2024 del_timer(&cfqd->idle_slice_timer);
1716 cfq_blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg); 2025 cfqg_stats_update_idle_time(cfqq->cfqg);
1717} 2026}
1718 2027
1719static void __cfq_set_active_queue(struct cfq_data *cfqd, 2028static void __cfq_set_active_queue(struct cfq_data *cfqd,
@@ -1722,7 +2031,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
1722 if (cfqq) { 2031 if (cfqq) {
1723 cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d", 2032 cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
1724 cfqd->serving_prio, cfqd->serving_type); 2033 cfqd->serving_prio, cfqd->serving_type);
1725 cfq_blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg); 2034 cfqg_stats_update_avg_queue_size(cfqq->cfqg);
1726 cfqq->slice_start = 0; 2035 cfqq->slice_start = 0;
1727 cfqq->dispatch_start = jiffies; 2036 cfqq->dispatch_start = jiffies;
1728 cfqq->allocated_slice = 0; 2037 cfqq->allocated_slice = 0;
@@ -2043,7 +2352,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
2043 * task has exited, don't wait 2352 * task has exited, don't wait
2044 */ 2353 */
2045 cic = cfqd->active_cic; 2354 cic = cfqd->active_cic;
2046 if (!cic || !atomic_read(&cic->icq.ioc->nr_tasks)) 2355 if (!cic || !atomic_read(&cic->icq.ioc->active_ref))
2047 return; 2356 return;
2048 2357
2049 /* 2358 /*
@@ -2070,7 +2379,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
2070 sl = cfqd->cfq_slice_idle; 2379 sl = cfqd->cfq_slice_idle;
2071 2380
2072 mod_timer(&cfqd->idle_slice_timer, jiffies + sl); 2381 mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
2073 cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg); 2382 cfqg_stats_set_start_idle_time(cfqq->cfqg);
2074 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl, 2383 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl,
2075 group_idle ? 1 : 0); 2384 group_idle ? 1 : 0);
2076} 2385}
@@ -2093,8 +2402,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
2093 2402
2094 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; 2403 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
2095 cfqq->nr_sectors += blk_rq_sectors(rq); 2404 cfqq->nr_sectors += blk_rq_sectors(rq);
2096 cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq), 2405 cfqg_stats_update_dispatch(cfqq->cfqg, blk_rq_bytes(rq), rq->cmd_flags);
2097 rq_data_dir(rq), rq_is_sync(rq));
2098} 2406}
2099 2407
2100/* 2408/*
@@ -2677,7 +2985,7 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
2677 2985
2678 BUG_ON(cfq_cfqq_on_rr(cfqq)); 2986 BUG_ON(cfq_cfqq_on_rr(cfqq));
2679 kmem_cache_free(cfq_pool, cfqq); 2987 kmem_cache_free(cfq_pool, cfqq);
2680 cfq_put_cfqg(cfqg); 2988 cfqg_put(cfqg);
2681} 2989}
2682 2990
2683static void cfq_put_cooperator(struct cfq_queue *cfqq) 2991static void cfq_put_cooperator(struct cfq_queue *cfqq)
@@ -2736,7 +3044,7 @@ static void cfq_exit_icq(struct io_cq *icq)
2736 } 3044 }
2737} 3045}
2738 3046
2739static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) 3047static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic)
2740{ 3048{
2741 struct task_struct *tsk = current; 3049 struct task_struct *tsk = current;
2742 int ioprio_class; 3050 int ioprio_class;
@@ -2744,7 +3052,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
2744 if (!cfq_cfqq_prio_changed(cfqq)) 3052 if (!cfq_cfqq_prio_changed(cfqq))
2745 return; 3053 return;
2746 3054
2747 ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio); 3055 ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
2748 switch (ioprio_class) { 3056 switch (ioprio_class) {
2749 default: 3057 default:
2750 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); 3058 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
@@ -2756,11 +3064,11 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
2756 cfqq->ioprio_class = task_nice_ioclass(tsk); 3064 cfqq->ioprio_class = task_nice_ioclass(tsk);
2757 break; 3065 break;
2758 case IOPRIO_CLASS_RT: 3066 case IOPRIO_CLASS_RT:
2759 cfqq->ioprio = task_ioprio(ioc); 3067 cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
2760 cfqq->ioprio_class = IOPRIO_CLASS_RT; 3068 cfqq->ioprio_class = IOPRIO_CLASS_RT;
2761 break; 3069 break;
2762 case IOPRIO_CLASS_BE: 3070 case IOPRIO_CLASS_BE:
2763 cfqq->ioprio = task_ioprio(ioc); 3071 cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
2764 cfqq->ioprio_class = IOPRIO_CLASS_BE; 3072 cfqq->ioprio_class = IOPRIO_CLASS_BE;
2765 break; 3073 break;
2766 case IOPRIO_CLASS_IDLE: 3074 case IOPRIO_CLASS_IDLE:
@@ -2778,19 +3086,24 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
2778 cfq_clear_cfqq_prio_changed(cfqq); 3086 cfq_clear_cfqq_prio_changed(cfqq);
2779} 3087}
2780 3088
2781static void changed_ioprio(struct cfq_io_cq *cic) 3089static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio)
2782{ 3090{
3091 int ioprio = cic->icq.ioc->ioprio;
2783 struct cfq_data *cfqd = cic_to_cfqd(cic); 3092 struct cfq_data *cfqd = cic_to_cfqd(cic);
2784 struct cfq_queue *cfqq; 3093 struct cfq_queue *cfqq;
2785 3094
2786 if (unlikely(!cfqd)) 3095 /*
3096 * Check whether ioprio has changed. The condition may trigger
3097 * spuriously on a newly created cic but there's no harm.
3098 */
3099 if (unlikely(!cfqd) || likely(cic->ioprio == ioprio))
2787 return; 3100 return;
2788 3101
2789 cfqq = cic->cfqq[BLK_RW_ASYNC]; 3102 cfqq = cic->cfqq[BLK_RW_ASYNC];
2790 if (cfqq) { 3103 if (cfqq) {
2791 struct cfq_queue *new_cfqq; 3104 struct cfq_queue *new_cfqq;
2792 new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->icq.ioc, 3105 new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio,
2793 GFP_ATOMIC); 3106 GFP_ATOMIC);
2794 if (new_cfqq) { 3107 if (new_cfqq) {
2795 cic->cfqq[BLK_RW_ASYNC] = new_cfqq; 3108 cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
2796 cfq_put_queue(cfqq); 3109 cfq_put_queue(cfqq);
@@ -2800,6 +3113,8 @@ static void changed_ioprio(struct cfq_io_cq *cic)
2800 cfqq = cic->cfqq[BLK_RW_SYNC]; 3113 cfqq = cic->cfqq[BLK_RW_SYNC];
2801 if (cfqq) 3114 if (cfqq)
2802 cfq_mark_cfqq_prio_changed(cfqq); 3115 cfq_mark_cfqq_prio_changed(cfqq);
3116
3117 cic->ioprio = ioprio;
2803} 3118}
2804 3119
2805static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3120static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
@@ -2823,17 +3138,24 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2823} 3138}
2824 3139
2825#ifdef CONFIG_CFQ_GROUP_IOSCHED 3140#ifdef CONFIG_CFQ_GROUP_IOSCHED
2826static void changed_cgroup(struct cfq_io_cq *cic) 3141static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
2827{ 3142{
2828 struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1);
2829 struct cfq_data *cfqd = cic_to_cfqd(cic); 3143 struct cfq_data *cfqd = cic_to_cfqd(cic);
2830 struct request_queue *q; 3144 struct cfq_queue *sync_cfqq;
3145 uint64_t id;
2831 3146
2832 if (unlikely(!cfqd)) 3147 rcu_read_lock();
2833 return; 3148 id = bio_blkcg(bio)->id;
3149 rcu_read_unlock();
2834 3150
2835 q = cfqd->queue; 3151 /*
3152 * Check whether blkcg has changed. The condition may trigger
3153 * spuriously on a newly created cic but there's no harm.
3154 */
3155 if (unlikely(!cfqd) || likely(cic->blkcg_id == id))
3156 return;
2836 3157
3158 sync_cfqq = cic_to_cfqq(cic, 1);
2837 if (sync_cfqq) { 3159 if (sync_cfqq) {
2838 /* 3160 /*
2839 * Drop reference to sync queue. A new sync queue will be 3161 * Drop reference to sync queue. A new sync queue will be
@@ -2843,21 +3165,26 @@ static void changed_cgroup(struct cfq_io_cq *cic)
2843 cic_set_cfqq(cic, NULL, 1); 3165 cic_set_cfqq(cic, NULL, 1);
2844 cfq_put_queue(sync_cfqq); 3166 cfq_put_queue(sync_cfqq);
2845 } 3167 }
3168
3169 cic->blkcg_id = id;
2846} 3170}
3171#else
3172static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { }
2847#endif /* CONFIG_CFQ_GROUP_IOSCHED */ 3173#endif /* CONFIG_CFQ_GROUP_IOSCHED */
2848 3174
2849static struct cfq_queue * 3175static struct cfq_queue *
2850cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, 3176cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
2851 struct io_context *ioc, gfp_t gfp_mask) 3177 struct bio *bio, gfp_t gfp_mask)
2852{ 3178{
3179 struct blkcg *blkcg;
2853 struct cfq_queue *cfqq, *new_cfqq = NULL; 3180 struct cfq_queue *cfqq, *new_cfqq = NULL;
2854 struct cfq_io_cq *cic;
2855 struct cfq_group *cfqg; 3181 struct cfq_group *cfqg;
2856 3182
2857retry: 3183retry:
2858 cfqg = cfq_get_cfqg(cfqd); 3184 rcu_read_lock();
2859 cic = cfq_cic_lookup(cfqd, ioc); 3185
2860 /* cic always exists here */ 3186 blkcg = bio_blkcg(bio);
3187 cfqg = cfq_lookup_create_cfqg(cfqd, blkcg);
2861 cfqq = cic_to_cfqq(cic, is_sync); 3188 cfqq = cic_to_cfqq(cic, is_sync);
2862 3189
2863 /* 3190 /*
@@ -2870,6 +3197,7 @@ retry:
2870 cfqq = new_cfqq; 3197 cfqq = new_cfqq;
2871 new_cfqq = NULL; 3198 new_cfqq = NULL;
2872 } else if (gfp_mask & __GFP_WAIT) { 3199 } else if (gfp_mask & __GFP_WAIT) {
3200 rcu_read_unlock();
2873 spin_unlock_irq(cfqd->queue->queue_lock); 3201 spin_unlock_irq(cfqd->queue->queue_lock);
2874 new_cfqq = kmem_cache_alloc_node(cfq_pool, 3202 new_cfqq = kmem_cache_alloc_node(cfq_pool,
2875 gfp_mask | __GFP_ZERO, 3203 gfp_mask | __GFP_ZERO,
@@ -2885,7 +3213,7 @@ retry:
2885 3213
2886 if (cfqq) { 3214 if (cfqq) {
2887 cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); 3215 cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
2888 cfq_init_prio_data(cfqq, ioc); 3216 cfq_init_prio_data(cfqq, cic);
2889 cfq_link_cfqq_cfqg(cfqq, cfqg); 3217 cfq_link_cfqq_cfqg(cfqq, cfqg);
2890 cfq_log_cfqq(cfqd, cfqq, "alloced"); 3218 cfq_log_cfqq(cfqd, cfqq, "alloced");
2891 } else 3219 } else
@@ -2895,6 +3223,7 @@ retry:
2895 if (new_cfqq) 3223 if (new_cfqq)
2896 kmem_cache_free(cfq_pool, new_cfqq); 3224 kmem_cache_free(cfq_pool, new_cfqq);
2897 3225
3226 rcu_read_unlock();
2898 return cfqq; 3227 return cfqq;
2899} 3228}
2900 3229
@@ -2904,6 +3233,9 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
2904 switch (ioprio_class) { 3233 switch (ioprio_class) {
2905 case IOPRIO_CLASS_RT: 3234 case IOPRIO_CLASS_RT:
2906 return &cfqd->async_cfqq[0][ioprio]; 3235 return &cfqd->async_cfqq[0][ioprio];
3236 case IOPRIO_CLASS_NONE:
3237 ioprio = IOPRIO_NORM;
3238 /* fall through */
2907 case IOPRIO_CLASS_BE: 3239 case IOPRIO_CLASS_BE:
2908 return &cfqd->async_cfqq[1][ioprio]; 3240 return &cfqd->async_cfqq[1][ioprio];
2909 case IOPRIO_CLASS_IDLE: 3241 case IOPRIO_CLASS_IDLE:
@@ -2914,11 +3246,11 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
2914} 3246}
2915 3247
2916static struct cfq_queue * 3248static struct cfq_queue *
2917cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc, 3249cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
2918 gfp_t gfp_mask) 3250 struct bio *bio, gfp_t gfp_mask)
2919{ 3251{
2920 const int ioprio = task_ioprio(ioc); 3252 const int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
2921 const int ioprio_class = task_ioprio_class(ioc); 3253 const int ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
2922 struct cfq_queue **async_cfqq = NULL; 3254 struct cfq_queue **async_cfqq = NULL;
2923 struct cfq_queue *cfqq = NULL; 3255 struct cfq_queue *cfqq = NULL;
2924 3256
@@ -2928,7 +3260,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
2928 } 3260 }
2929 3261
2930 if (!cfqq) 3262 if (!cfqq)
2931 cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask); 3263 cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask);
2932 3264
2933 /* 3265 /*
2934 * pin the queue now that it's allocated, scheduler exit will prune it 3266 * pin the queue now that it's allocated, scheduler exit will prune it
@@ -3010,7 +3342,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3010 3342
3011 if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE)) 3343 if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))
3012 enable_idle = 0; 3344 enable_idle = 0;
3013 else if (!atomic_read(&cic->icq.ioc->nr_tasks) || 3345 else if (!atomic_read(&cic->icq.ioc->active_ref) ||
3014 !cfqd->cfq_slice_idle || 3346 !cfqd->cfq_slice_idle ||
3015 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) 3347 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
3016 enable_idle = 0; 3348 enable_idle = 0;
@@ -3174,8 +3506,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3174 cfq_clear_cfqq_wait_request(cfqq); 3506 cfq_clear_cfqq_wait_request(cfqq);
3175 __blk_run_queue(cfqd->queue); 3507 __blk_run_queue(cfqd->queue);
3176 } else { 3508 } else {
3177 cfq_blkiocg_update_idle_time_stats( 3509 cfqg_stats_update_idle_time(cfqq->cfqg);
3178 &cfqq->cfqg->blkg);
3179 cfq_mark_cfqq_must_dispatch(cfqq); 3510 cfq_mark_cfqq_must_dispatch(cfqq);
3180 } 3511 }
3181 } 3512 }
@@ -3197,14 +3528,13 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
3197 struct cfq_queue *cfqq = RQ_CFQQ(rq); 3528 struct cfq_queue *cfqq = RQ_CFQQ(rq);
3198 3529
3199 cfq_log_cfqq(cfqd, cfqq, "insert_request"); 3530 cfq_log_cfqq(cfqd, cfqq, "insert_request");
3200 cfq_init_prio_data(cfqq, RQ_CIC(rq)->icq.ioc); 3531 cfq_init_prio_data(cfqq, RQ_CIC(rq));
3201 3532
3202 rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); 3533 rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
3203 list_add_tail(&rq->queuelist, &cfqq->fifo); 3534 list_add_tail(&rq->queuelist, &cfqq->fifo);
3204 cfq_add_rq_rb(rq); 3535 cfq_add_rq_rb(rq);
3205 cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg, 3536 cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group,
3206 &cfqd->serving_group->blkg, rq_data_dir(rq), 3537 rq->cmd_flags);
3207 rq_is_sync(rq));
3208 cfq_rq_enqueued(cfqd, cfqq, rq); 3538 cfq_rq_enqueued(cfqd, cfqq, rq);
3209} 3539}
3210 3540
@@ -3300,9 +3630,8 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
3300 cfqd->rq_in_driver--; 3630 cfqd->rq_in_driver--;
3301 cfqq->dispatched--; 3631 cfqq->dispatched--;
3302 (RQ_CFQG(rq))->dispatched--; 3632 (RQ_CFQG(rq))->dispatched--;
3303 cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg, 3633 cfqg_stats_update_completion(cfqq->cfqg, rq_start_time_ns(rq),
3304 rq_start_time_ns(rq), rq_io_start_time_ns(rq), 3634 rq_io_start_time_ns(rq), rq->cmd_flags);
3305 rq_data_dir(rq), rq_is_sync(rq));
3306 3635
3307 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; 3636 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
3308 3637
@@ -3399,7 +3728,7 @@ static int cfq_may_queue(struct request_queue *q, int rw)
3399 3728
3400 cfqq = cic_to_cfqq(cic, rw_is_sync(rw)); 3729 cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
3401 if (cfqq) { 3730 if (cfqq) {
3402 cfq_init_prio_data(cfqq, cic->icq.ioc); 3731 cfq_init_prio_data(cfqq, cic);
3403 3732
3404 return __cfq_may_queue(cfqq); 3733 return __cfq_may_queue(cfqq);
3405 } 3734 }
@@ -3421,7 +3750,7 @@ static void cfq_put_request(struct request *rq)
3421 cfqq->allocated[rw]--; 3750 cfqq->allocated[rw]--;
3422 3751
3423 /* Put down rq reference on cfqg */ 3752 /* Put down rq reference on cfqg */
3424 cfq_put_cfqg(RQ_CFQG(rq)); 3753 cfqg_put(RQ_CFQG(rq));
3425 rq->elv.priv[0] = NULL; 3754 rq->elv.priv[0] = NULL;
3426 rq->elv.priv[1] = NULL; 3755 rq->elv.priv[1] = NULL;
3427 3756
@@ -3465,32 +3794,25 @@ split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq)
3465 * Allocate cfq data structures associated with this request. 3794 * Allocate cfq data structures associated with this request.
3466 */ 3795 */
3467static int 3796static int
3468cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) 3797cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
3798 gfp_t gfp_mask)
3469{ 3799{
3470 struct cfq_data *cfqd = q->elevator->elevator_data; 3800 struct cfq_data *cfqd = q->elevator->elevator_data;
3471 struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq); 3801 struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq);
3472 const int rw = rq_data_dir(rq); 3802 const int rw = rq_data_dir(rq);
3473 const bool is_sync = rq_is_sync(rq); 3803 const bool is_sync = rq_is_sync(rq);
3474 struct cfq_queue *cfqq; 3804 struct cfq_queue *cfqq;
3475 unsigned int changed;
3476 3805
3477 might_sleep_if(gfp_mask & __GFP_WAIT); 3806 might_sleep_if(gfp_mask & __GFP_WAIT);
3478 3807
3479 spin_lock_irq(q->queue_lock); 3808 spin_lock_irq(q->queue_lock);
3480 3809
3481 /* handle changed notifications */ 3810 check_ioprio_changed(cic, bio);
3482 changed = icq_get_changed(&cic->icq); 3811 check_blkcg_changed(cic, bio);
3483 if (unlikely(changed & ICQ_IOPRIO_CHANGED))
3484 changed_ioprio(cic);
3485#ifdef CONFIG_CFQ_GROUP_IOSCHED
3486 if (unlikely(changed & ICQ_CGROUP_CHANGED))
3487 changed_cgroup(cic);
3488#endif
3489
3490new_queue: 3812new_queue:
3491 cfqq = cic_to_cfqq(cic, is_sync); 3813 cfqq = cic_to_cfqq(cic, is_sync);
3492 if (!cfqq || cfqq == &cfqd->oom_cfqq) { 3814 if (!cfqq || cfqq == &cfqd->oom_cfqq) {
3493 cfqq = cfq_get_queue(cfqd, is_sync, cic->icq.ioc, gfp_mask); 3815 cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask);
3494 cic_set_cfqq(cic, cfqq, is_sync); 3816 cic_set_cfqq(cic, cfqq, is_sync);
3495 } else { 3817 } else {
3496 /* 3818 /*
@@ -3516,8 +3838,9 @@ new_queue:
3516 cfqq->allocated[rw]++; 3838 cfqq->allocated[rw]++;
3517 3839
3518 cfqq->ref++; 3840 cfqq->ref++;
3841 cfqg_get(cfqq->cfqg);
3519 rq->elv.priv[0] = cfqq; 3842 rq->elv.priv[0] = cfqq;
3520 rq->elv.priv[1] = cfq_ref_get_cfqg(cfqq->cfqg); 3843 rq->elv.priv[1] = cfqq->cfqg;
3521 spin_unlock_irq(q->queue_lock); 3844 spin_unlock_irq(q->queue_lock);
3522 return 0; 3845 return 0;
3523} 3846}
@@ -3614,7 +3937,6 @@ static void cfq_exit_queue(struct elevator_queue *e)
3614{ 3937{
3615 struct cfq_data *cfqd = e->elevator_data; 3938 struct cfq_data *cfqd = e->elevator_data;
3616 struct request_queue *q = cfqd->queue; 3939 struct request_queue *q = cfqd->queue;
3617 bool wait = false;
3618 3940
3619 cfq_shutdown_timer_wq(cfqd); 3941 cfq_shutdown_timer_wq(cfqd);
3620 3942
@@ -3624,89 +3946,52 @@ static void cfq_exit_queue(struct elevator_queue *e)
3624 __cfq_slice_expired(cfqd, cfqd->active_queue, 0); 3946 __cfq_slice_expired(cfqd, cfqd->active_queue, 0);
3625 3947
3626 cfq_put_async_queues(cfqd); 3948 cfq_put_async_queues(cfqd);
3627 cfq_release_cfq_groups(cfqd);
3628
3629 /*
3630 * If there are groups which we could not unlink from blkcg list,
3631 * wait for a rcu period for them to be freed.
3632 */
3633 if (cfqd->nr_blkcg_linked_grps)
3634 wait = true;
3635 3949
3636 spin_unlock_irq(q->queue_lock); 3950 spin_unlock_irq(q->queue_lock);
3637 3951
3638 cfq_shutdown_timer_wq(cfqd); 3952 cfq_shutdown_timer_wq(cfqd);
3639 3953
3640 /* 3954#ifndef CONFIG_CFQ_GROUP_IOSCHED
3641 * Wait for cfqg->blkg->key accessors to exit their grace periods. 3955 kfree(cfqd->root_group);
3642 * Do this wait only if there are other unlinked groups out
3643 * there. This can happen if cgroup deletion path claimed the
3644 * responsibility of cleaning up a group before queue cleanup code
3645 * get to the group.
3646 *
3647 * Do not call synchronize_rcu() unconditionally as there are drivers
3648 * which create/delete request queue hundreds of times during scan/boot
3649 * and synchronize_rcu() can take significant time and slow down boot.
3650 */
3651 if (wait)
3652 synchronize_rcu();
3653
3654#ifdef CONFIG_CFQ_GROUP_IOSCHED
3655 /* Free up per cpu stats for root group */
3656 free_percpu(cfqd->root_group.blkg.stats_cpu);
3657#endif 3956#endif
3957 blkcg_deactivate_policy(q, &blkcg_policy_cfq);
3658 kfree(cfqd); 3958 kfree(cfqd);
3659} 3959}
3660 3960
3661static void *cfq_init_queue(struct request_queue *q) 3961static int cfq_init_queue(struct request_queue *q)
3662{ 3962{
3663 struct cfq_data *cfqd; 3963 struct cfq_data *cfqd;
3664 int i, j; 3964 struct blkcg_gq *blkg __maybe_unused;
3665 struct cfq_group *cfqg; 3965 int i, ret;
3666 struct cfq_rb_root *st;
3667 3966
3668 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); 3967 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
3669 if (!cfqd) 3968 if (!cfqd)
3670 return NULL; 3969 return -ENOMEM;
3970
3971 cfqd->queue = q;
3972 q->elevator->elevator_data = cfqd;
3671 3973
3672 /* Init root service tree */ 3974 /* Init root service tree */
3673 cfqd->grp_service_tree = CFQ_RB_ROOT; 3975 cfqd->grp_service_tree = CFQ_RB_ROOT;
3674 3976
3675 /* Init root group */ 3977 /* Init root group and prefer root group over other groups by default */
3676 cfqg = &cfqd->root_group;
3677 for_each_cfqg_st(cfqg, i, j, st)
3678 *st = CFQ_RB_ROOT;
3679 RB_CLEAR_NODE(&cfqg->rb_node);
3680
3681 /* Give preference to root group over other groups */
3682 cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT;
3683
3684#ifdef CONFIG_CFQ_GROUP_IOSCHED 3978#ifdef CONFIG_CFQ_GROUP_IOSCHED
3685 /* 3979 ret = blkcg_activate_policy(q, &blkcg_policy_cfq);
3686 * Set root group reference to 2. One reference will be dropped when 3980 if (ret)
3687 * all groups on cfqd->cfqg_list are being deleted during queue exit. 3981 goto out_free;
3688 * Other reference will remain there as we don't want to delete this
3689 * group as it is statically allocated and gets destroyed when
3690 * throtl_data goes away.
3691 */
3692 cfqg->ref = 2;
3693
3694 if (blkio_alloc_blkg_stats(&cfqg->blkg)) {
3695 kfree(cfqg);
3696 kfree(cfqd);
3697 return NULL;
3698 }
3699
3700 rcu_read_lock();
3701 3982
3702 cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, 3983 cfqd->root_group = blkg_to_cfqg(q->root_blkg);
3703 (void *)cfqd, 0); 3984#else
3704 rcu_read_unlock(); 3985 ret = -ENOMEM;
3705 cfqd->nr_blkcg_linked_grps++; 3986 cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group),
3987 GFP_KERNEL, cfqd->queue->node);
3988 if (!cfqd->root_group)
3989 goto out_free;
3706 3990
3707 /* Add group on cfqd->cfqg_list */ 3991 cfq_init_cfqg_base(cfqd->root_group);
3708 hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
3709#endif 3992#endif
3993 cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT;
3994
3710 /* 3995 /*
3711 * Not strictly needed (since RB_ROOT just clears the node and we 3996 * Not strictly needed (since RB_ROOT just clears the node and we
3712 * zeroed cfqd on alloc), but better be safe in case someone decides 3997 * zeroed cfqd on alloc), but better be safe in case someone decides
@@ -3718,13 +4003,17 @@ static void *cfq_init_queue(struct request_queue *q)
3718 /* 4003 /*
3719 * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues. 4004 * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues.
3720 * Grab a permanent reference to it, so that the normal code flow 4005 * Grab a permanent reference to it, so that the normal code flow
3721 * will not attempt to free it. 4006 * will not attempt to free it. oom_cfqq is linked to root_group
4007 * but shouldn't hold a reference as it'll never be unlinked. Lose
4008 * the reference from linking right away.
3722 */ 4009 */
3723 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); 4010 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
3724 cfqd->oom_cfqq.ref++; 4011 cfqd->oom_cfqq.ref++;
3725 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
3726 4012
3727 cfqd->queue = q; 4013 spin_lock_irq(q->queue_lock);
4014 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group);
4015 cfqg_put(cfqd->root_group);
4016 spin_unlock_irq(q->queue_lock);
3728 4017
3729 init_timer(&cfqd->idle_slice_timer); 4018 init_timer(&cfqd->idle_slice_timer);
3730 cfqd->idle_slice_timer.function = cfq_idle_slice_timer; 4019 cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
@@ -3750,7 +4039,11 @@ static void *cfq_init_queue(struct request_queue *q)
3750 * second, in order to have larger depth for async operations. 4039 * second, in order to have larger depth for async operations.
3751 */ 4040 */
3752 cfqd->last_delayed_sync = jiffies - HZ; 4041 cfqd->last_delayed_sync = jiffies - HZ;
3753 return cfqd; 4042 return 0;
4043
4044out_free:
4045 kfree(cfqd);
4046 return ret;
3754} 4047}
3755 4048
3756/* 4049/*
@@ -3877,15 +4170,13 @@ static struct elevator_type iosched_cfq = {
3877}; 4170};
3878 4171
3879#ifdef CONFIG_CFQ_GROUP_IOSCHED 4172#ifdef CONFIG_CFQ_GROUP_IOSCHED
3880static struct blkio_policy_type blkio_policy_cfq = { 4173static struct blkcg_policy blkcg_policy_cfq = {
3881 .ops = { 4174 .pd_size = sizeof(struct cfq_group),
3882 .blkio_unlink_group_fn = cfq_unlink_blkio_group, 4175 .cftypes = cfq_blkcg_files,
3883 .blkio_update_group_weight_fn = cfq_update_blkio_group_weight, 4176
3884 }, 4177 .pd_init_fn = cfq_pd_init,
3885 .plid = BLKIO_POLICY_PROP, 4178 .pd_reset_stats_fn = cfq_pd_reset_stats,
3886}; 4179};
3887#else
3888static struct blkio_policy_type blkio_policy_cfq;
3889#endif 4180#endif
3890 4181
3891static int __init cfq_init(void) 4182static int __init cfq_init(void)
@@ -3906,24 +4197,31 @@ static int __init cfq_init(void)
3906#else 4197#else
3907 cfq_group_idle = 0; 4198 cfq_group_idle = 0;
3908#endif 4199#endif
4200
4201 ret = blkcg_policy_register(&blkcg_policy_cfq);
4202 if (ret)
4203 return ret;
4204
3909 cfq_pool = KMEM_CACHE(cfq_queue, 0); 4205 cfq_pool = KMEM_CACHE(cfq_queue, 0);
3910 if (!cfq_pool) 4206 if (!cfq_pool)
3911 return -ENOMEM; 4207 goto err_pol_unreg;
3912 4208
3913 ret = elv_register(&iosched_cfq); 4209 ret = elv_register(&iosched_cfq);
3914 if (ret) { 4210 if (ret)
3915 kmem_cache_destroy(cfq_pool); 4211 goto err_free_pool;
3916 return ret;
3917 }
3918
3919 blkio_policy_register(&blkio_policy_cfq);
3920 4212
3921 return 0; 4213 return 0;
4214
4215err_free_pool:
4216 kmem_cache_destroy(cfq_pool);
4217err_pol_unreg:
4218 blkcg_policy_unregister(&blkcg_policy_cfq);
4219 return ret;
3922} 4220}
3923 4221
3924static void __exit cfq_exit(void) 4222static void __exit cfq_exit(void)
3925{ 4223{
3926 blkio_policy_unregister(&blkio_policy_cfq); 4224 blkcg_policy_unregister(&blkcg_policy_cfq);
3927 elv_unregister(&iosched_cfq); 4225 elv_unregister(&iosched_cfq);
3928 kmem_cache_destroy(cfq_pool); 4226 kmem_cache_destroy(cfq_pool);
3929} 4227}
diff --git a/block/cfq.h b/block/cfq.h
deleted file mode 100644
index 2a155927e37c..000000000000
--- a/block/cfq.h
+++ /dev/null
@@ -1,115 +0,0 @@
1#ifndef _CFQ_H
2#define _CFQ_H
3#include "blk-cgroup.h"
4
5#ifdef CONFIG_CFQ_GROUP_IOSCHED
6static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg,
7 struct blkio_group *curr_blkg, bool direction, bool sync)
8{
9 blkiocg_update_io_add_stats(blkg, curr_blkg, direction, sync);
10}
11
12static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg,
13 unsigned long dequeue)
14{
15 blkiocg_update_dequeue_stats(blkg, dequeue);
16}
17
18static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg,
19 unsigned long time, unsigned long unaccounted_time)
20{
21 blkiocg_update_timeslice_used(blkg, time, unaccounted_time);
22}
23
24static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg)
25{
26 blkiocg_set_start_empty_time(blkg);
27}
28
29static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg,
30 bool direction, bool sync)
31{
32 blkiocg_update_io_remove_stats(blkg, direction, sync);
33}
34
35static inline void cfq_blkiocg_update_io_merged_stats(struct blkio_group *blkg,
36 bool direction, bool sync)
37{
38 blkiocg_update_io_merged_stats(blkg, direction, sync);
39}
40
41static inline void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg)
42{
43 blkiocg_update_idle_time_stats(blkg);
44}
45
46static inline void
47cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
48{
49 blkiocg_update_avg_queue_size_stats(blkg);
50}
51
52static inline void
53cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
54{
55 blkiocg_update_set_idle_time_stats(blkg);
56}
57
58static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg,
59 uint64_t bytes, bool direction, bool sync)
60{
61 blkiocg_update_dispatch_stats(blkg, bytes, direction, sync);
62}
63
64static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
65{
66 blkiocg_update_completion_stats(blkg, start_time, io_start_time,
67 direction, sync);
68}
69
70static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
71 struct blkio_group *blkg, void *key, dev_t dev) {
72 blkiocg_add_blkio_group(blkcg, blkg, key, dev, BLKIO_POLICY_PROP);
73}
74
75static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg)
76{
77 return blkiocg_del_blkio_group(blkg);
78}
79
80#else /* CFQ_GROUP_IOSCHED */
81static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg,
82 struct blkio_group *curr_blkg, bool direction, bool sync) {}
83
84static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg,
85 unsigned long dequeue) {}
86
87static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg,
88 unsigned long time, unsigned long unaccounted_time) {}
89static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) {}
90static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg,
91 bool direction, bool sync) {}
92static inline void cfq_blkiocg_update_io_merged_stats(struct blkio_group *blkg,
93 bool direction, bool sync) {}
94static inline void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg)
95{
96}
97static inline void
98cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) {}
99
100static inline void
101cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) {}
102
103static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg,
104 uint64_t bytes, bool direction, bool sync) {}
105static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) {}
106
107static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
108 struct blkio_group *blkg, void *key, dev_t dev) {}
109static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg)
110{
111 return 0;
112}
113
114#endif /* CFQ_GROUP_IOSCHED */
115#endif
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index 7bf12d793fcd..599b12e5380f 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -337,13 +337,13 @@ static void deadline_exit_queue(struct elevator_queue *e)
337/* 337/*
338 * initialize elevator private data (deadline_data). 338 * initialize elevator private data (deadline_data).
339 */ 339 */
340static void *deadline_init_queue(struct request_queue *q) 340static int deadline_init_queue(struct request_queue *q)
341{ 341{
342 struct deadline_data *dd; 342 struct deadline_data *dd;
343 343
344 dd = kmalloc_node(sizeof(*dd), GFP_KERNEL | __GFP_ZERO, q->node); 344 dd = kmalloc_node(sizeof(*dd), GFP_KERNEL | __GFP_ZERO, q->node);
345 if (!dd) 345 if (!dd)
346 return NULL; 346 return -ENOMEM;
347 347
348 INIT_LIST_HEAD(&dd->fifo_list[READ]); 348 INIT_LIST_HEAD(&dd->fifo_list[READ]);
349 INIT_LIST_HEAD(&dd->fifo_list[WRITE]); 349 INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
@@ -354,7 +354,9 @@ static void *deadline_init_queue(struct request_queue *q)
354 dd->writes_starved = writes_starved; 354 dd->writes_starved = writes_starved;
355 dd->front_merges = 1; 355 dd->front_merges = 1;
356 dd->fifo_batch = fifo_batch; 356 dd->fifo_batch = fifo_batch;
357 return dd; 357
358 q->elevator->elevator_data = dd;
359 return 0;
358} 360}
359 361
360/* 362/*
diff --git a/block/elevator.c b/block/elevator.c
index f016855a46b0..6a55d418896f 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -38,6 +38,7 @@
38#include <trace/events/block.h> 38#include <trace/events/block.h>
39 39
40#include "blk.h" 40#include "blk.h"
41#include "blk-cgroup.h"
41 42
42static DEFINE_SPINLOCK(elv_list_lock); 43static DEFINE_SPINLOCK(elv_list_lock);
43static LIST_HEAD(elv_list); 44static LIST_HEAD(elv_list);
@@ -121,15 +122,6 @@ static struct elevator_type *elevator_get(const char *name)
121 return e; 122 return e;
122} 123}
123 124
124static int elevator_init_queue(struct request_queue *q,
125 struct elevator_queue *eq)
126{
127 eq->elevator_data = eq->type->ops.elevator_init_fn(q);
128 if (eq->elevator_data)
129 return 0;
130 return -ENOMEM;
131}
132
133static char chosen_elevator[ELV_NAME_MAX]; 125static char chosen_elevator[ELV_NAME_MAX];
134 126
135static int __init elevator_setup(char *str) 127static int __init elevator_setup(char *str)
@@ -188,7 +180,6 @@ static void elevator_release(struct kobject *kobj)
188int elevator_init(struct request_queue *q, char *name) 180int elevator_init(struct request_queue *q, char *name)
189{ 181{
190 struct elevator_type *e = NULL; 182 struct elevator_type *e = NULL;
191 struct elevator_queue *eq;
192 int err; 183 int err;
193 184
194 if (unlikely(q->elevator)) 185 if (unlikely(q->elevator))
@@ -222,17 +213,16 @@ int elevator_init(struct request_queue *q, char *name)
222 } 213 }
223 } 214 }
224 215
225 eq = elevator_alloc(q, e); 216 q->elevator = elevator_alloc(q, e);
226 if (!eq) 217 if (!q->elevator)
227 return -ENOMEM; 218 return -ENOMEM;
228 219
229 err = elevator_init_queue(q, eq); 220 err = e->ops.elevator_init_fn(q);
230 if (err) { 221 if (err) {
231 kobject_put(&eq->kobj); 222 kobject_put(&q->elevator->kobj);
232 return err; 223 return err;
233 } 224 }
234 225
235 q->elevator = eq;
236 return 0; 226 return 0;
237} 227}
238EXPORT_SYMBOL(elevator_init); 228EXPORT_SYMBOL(elevator_init);
@@ -564,25 +554,6 @@ void elv_drain_elevator(struct request_queue *q)
564 } 554 }
565} 555}
566 556
567void elv_quiesce_start(struct request_queue *q)
568{
569 if (!q->elevator)
570 return;
571
572 spin_lock_irq(q->queue_lock);
573 queue_flag_set(QUEUE_FLAG_ELVSWITCH, q);
574 spin_unlock_irq(q->queue_lock);
575
576 blk_drain_queue(q, false);
577}
578
579void elv_quiesce_end(struct request_queue *q)
580{
581 spin_lock_irq(q->queue_lock);
582 queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q);
583 spin_unlock_irq(q->queue_lock);
584}
585
586void __elv_add_request(struct request_queue *q, struct request *rq, int where) 557void __elv_add_request(struct request_queue *q, struct request *rq, int where)
587{ 558{
588 trace_block_rq_insert(q, rq); 559 trace_block_rq_insert(q, rq);
@@ -692,12 +663,13 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq)
692 return NULL; 663 return NULL;
693} 664}
694 665
695int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) 666int elv_set_request(struct request_queue *q, struct request *rq,
667 struct bio *bio, gfp_t gfp_mask)
696{ 668{
697 struct elevator_queue *e = q->elevator; 669 struct elevator_queue *e = q->elevator;
698 670
699 if (e->type->ops.elevator_set_req_fn) 671 if (e->type->ops.elevator_set_req_fn)
700 return e->type->ops.elevator_set_req_fn(q, rq, gfp_mask); 672 return e->type->ops.elevator_set_req_fn(q, rq, bio, gfp_mask);
701 return 0; 673 return 0;
702} 674}
703 675
@@ -801,8 +773,9 @@ static struct kobj_type elv_ktype = {
801 .release = elevator_release, 773 .release = elevator_release,
802}; 774};
803 775
804int __elv_register_queue(struct request_queue *q, struct elevator_queue *e) 776int elv_register_queue(struct request_queue *q)
805{ 777{
778 struct elevator_queue *e = q->elevator;
806 int error; 779 int error;
807 780
808 error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched"); 781 error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
@@ -820,11 +793,6 @@ int __elv_register_queue(struct request_queue *q, struct elevator_queue *e)
820 } 793 }
821 return error; 794 return error;
822} 795}
823
824int elv_register_queue(struct request_queue *q)
825{
826 return __elv_register_queue(q, q->elevator);
827}
828EXPORT_SYMBOL(elv_register_queue); 796EXPORT_SYMBOL(elv_register_queue);
829 797
830void elv_unregister_queue(struct request_queue *q) 798void elv_unregister_queue(struct request_queue *q)
@@ -907,53 +875,60 @@ EXPORT_SYMBOL_GPL(elv_unregister);
907 */ 875 */
908static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) 876static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
909{ 877{
910 struct elevator_queue *old_elevator, *e; 878 struct elevator_queue *old = q->elevator;
879 bool registered = old->registered;
911 int err; 880 int err;
912 881
913 /* allocate new elevator */ 882 /*
914 e = elevator_alloc(q, new_e); 883 * Turn on BYPASS and drain all requests w/ elevator private data.
915 if (!e) 884 * Block layer doesn't call into a quiesced elevator - all requests
916 return -ENOMEM; 885 * are directly put on the dispatch list without elevator data
886 * using INSERT_BACK. All requests have SOFTBARRIER set and no
887 * merge happens either.
888 */
889 blk_queue_bypass_start(q);
890
891 /* unregister and clear all auxiliary data of the old elevator */
892 if (registered)
893 elv_unregister_queue(q);
894
895 spin_lock_irq(q->queue_lock);
896 ioc_clear_queue(q);
897 spin_unlock_irq(q->queue_lock);
917 898
918 err = elevator_init_queue(q, e); 899 /* allocate, init and register new elevator */
900 err = -ENOMEM;
901 q->elevator = elevator_alloc(q, new_e);
902 if (!q->elevator)
903 goto fail_init;
904
905 err = new_e->ops.elevator_init_fn(q);
919 if (err) { 906 if (err) {
920 kobject_put(&e->kobj); 907 kobject_put(&q->elevator->kobj);
921 return err; 908 goto fail_init;
922 } 909 }
923 910
924 /* turn on BYPASS and drain all requests w/ elevator private data */ 911 if (registered) {
925 elv_quiesce_start(q); 912 err = elv_register_queue(q);
926
927 /* unregister old queue, register new one and kill old elevator */
928 if (q->elevator->registered) {
929 elv_unregister_queue(q);
930 err = __elv_register_queue(q, e);
931 if (err) 913 if (err)
932 goto fail_register; 914 goto fail_register;
933 } 915 }
934 916
935 /* done, clear io_cq's, switch elevators and turn off BYPASS */ 917 /* done, kill the old one and finish */
936 spin_lock_irq(q->queue_lock); 918 elevator_exit(old);
937 ioc_clear_queue(q); 919 blk_queue_bypass_end(q);
938 old_elevator = q->elevator;
939 q->elevator = e;
940 spin_unlock_irq(q->queue_lock);
941
942 elevator_exit(old_elevator);
943 elv_quiesce_end(q);
944 920
945 blk_add_trace_msg(q, "elv switch: %s", e->type->elevator_name); 921 blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
946 922
947 return 0; 923 return 0;
948 924
949fail_register: 925fail_register:
950 /* 926 elevator_exit(q->elevator);
951 * switch failed, exit the new io scheduler and reattach the old 927fail_init:
952 * one again (along with re-adding the sysfs dir) 928 /* switch failed, restore and re-register old elevator */
953 */ 929 q->elevator = old;
954 elevator_exit(e);
955 elv_register_queue(q); 930 elv_register_queue(q);
956 elv_quiesce_end(q); 931 blk_queue_bypass_end(q);
957 932
958 return err; 933 return err;
959} 934}
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
index 413a0b1d788c..5d1bf70e33d5 100644
--- a/block/noop-iosched.c
+++ b/block/noop-iosched.c
@@ -59,15 +59,17 @@ noop_latter_request(struct request_queue *q, struct request *rq)
59 return list_entry(rq->queuelist.next, struct request, queuelist); 59 return list_entry(rq->queuelist.next, struct request, queuelist);
60} 60}
61 61
62static void *noop_init_queue(struct request_queue *q) 62static int noop_init_queue(struct request_queue *q)
63{ 63{
64 struct noop_data *nd; 64 struct noop_data *nd;
65 65
66 nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node); 66 nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node);
67 if (!nd) 67 if (!nd)
68 return NULL; 68 return -ENOMEM;
69
69 INIT_LIST_HEAD(&nd->queue); 70 INIT_LIST_HEAD(&nd->queue);
70 return nd; 71 q->elevator->elevator_data = nd;
72 return 0;
71} 73}
72 74
73static void noop_exit_queue(struct elevator_queue *e) 75static void noop_exit_queue(struct elevator_queue *e)