author     Linus Torvalds <torvalds@linux-foundation.org>  2012-05-30 11:52:42 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2012-05-30 11:52:42 -0400
commit     0d167518e045cc8bb63f0a8a0a85ad4fa4e0044f (patch)
tree       101a9b5d425d79f663e4f25f1e90b7a8cc6604f1 /block/blk-throttle.c
parent     2f83766d4b18774c856329a8fca4c9338dfeda39 (diff)
parent     ff26eaadf4d914e397872b99885d45756104e9ae (diff)
Merge branch 'for-3.5/core' of git://git.kernel.dk/linux-block
Merge block/IO core bits from Jens Axboe:
 "This is a bit bigger on the core side than usual, but that is purely
  because we decided to hold off on parts of Tejun's submission on 3.4 to
  give it a bit more time to simmer. As a consequence, it's seen a long
  cycle in for-next.

  It contains:

   - Bug fix from Dan, wrong locking type.

   - Relax splice gifting restriction from Eric.

   - A ton of updates from Tejun, primarily for blkcg. This improves the
     code a lot, making the API nicer and cleaner, and also includes fixes
     for how we handle and tie policies and re-activate on switches. The
     changes also include generic bug fixes.

   - A simple fix from Vivek, along with a fix for doing proper delayed
     allocation of the blkcg stats."

 Fix up annoying conflict just due to different merge resolution in
 Documentation/feature-removal-schedule.txt

 * 'for-3.5/core' of git://git.kernel.dk/linux-block: (92 commits)
   blkcg: tg_stats_alloc_lock is an irq lock
   vmsplice: relax alignement requirements for SPLICE_F_GIFT
   blkcg: use radix tree to index blkgs from blkcg
   blkcg: fix blkcg->css ref leak in __blkg_lookup_create()
   block: fix elvpriv allocation failure handling
   block: collapse blk_alloc_request() into get_request()
   blkcg: collapse blkcg_policy_ops into blkcg_policy
   blkcg: embed struct blkg_policy_data in policy specific data
   blkcg: mass rename of blkcg API
   blkcg: style cleanups for blk-cgroup.h
   blkcg: remove blkio_group->path[]
   blkcg: blkg_rwstat_read() was missing inline
   blkcg: shoot down blkgs if all policies are deactivated
   blkcg: drop stuff unused after per-queue policy activation update
   blkcg: implement per-queue policy activation
   blkcg: add request_queue->root_blkg
   blkcg: make request_queue bypassing on allocation
   blkcg: make sure blkg_lookup() returns %NULL if @q is bypassing
   blkcg: make blkg_conf_prep() take @pol and return with queue lock held
   blkcg: remove static policy ID enums
   ...
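The centerpiece of that blkcg rework, as it lands in blk-throttle.c below, is that the policy's per-group data (struct throtl_grp) now embeds the generic struct blkg_policy_data as its first member and converts between the two with container_of() — see the new pd_to_tg()/blkg_to_tg() helpers in the diff. The following is a minimal, self-contained user-space sketch of that embedding pattern only; the type and function names are illustrative stand-ins, not the kernel's.

#include <stddef.h>
#include <stdio.h>

/* Simplified stand-ins for the kernel's blkg_policy_data / throtl_grp pair. */
struct policy_data {                /* generic per-group part, owned by the core */
    int plid;
};

struct throttle_group {             /* policy-private data; pd must come first */
    struct policy_data pd;
    unsigned long bps[2];           /* READ/WRITE bytes-per-second limits */
};

/* User-space re-definition of the kernel's container_of() idea. */
#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

static struct throttle_group *pd_to_tg(struct policy_data *pd)
{
    return pd ? container_of(pd, struct throttle_group, pd) : NULL;
}

int main(void)
{
    struct throttle_group tg = { .pd = { .plid = 1 }, .bps = { 1048576, 524288 } };
    struct policy_data *pd = &tg.pd;    /* all the core ever hands around */

    printf("policy %d: read limit %lu B/s\n", pd->plid, pd_to_tg(pd)->bps[0]);
    return 0;
}

Because the generic part sits at a fixed offset inside the policy's own structure, the core can allocate one object per group and hand out only the generic pointer, while each policy recovers its private data without an extra allocation or lookup — which is what lets the diff below delete throtl_grp's own blkio_group, reference count and tg_list bookkeeping.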
Diffstat (limited to 'block/blk-throttle.c')
-rw-r--r--  block/blk-throttle.c  697
1 files changed, 325 insertions, 372 deletions
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index f2ddb94626bd..5b0659512047 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -21,6 +21,8 @@ static int throtl_quantum = 32;
21/* Throttling is performed over 100ms slice and after that slice is renewed */ 21/* Throttling is performed over 100ms slice and after that slice is renewed */
22static unsigned long throtl_slice = HZ/10; /* 100 ms */ 22static unsigned long throtl_slice = HZ/10; /* 100 ms */
23 23
24static struct blkcg_policy blkcg_policy_throtl;
25
24/* A workqueue to queue throttle related work */ 26/* A workqueue to queue throttle related work */
25static struct workqueue_struct *kthrotld_workqueue; 27static struct workqueue_struct *kthrotld_workqueue;
26static void throtl_schedule_delayed_work(struct throtl_data *td, 28static void throtl_schedule_delayed_work(struct throtl_data *td,
@@ -38,9 +40,17 @@ struct throtl_rb_root {
38 40
39#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) 41#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
40 42
43/* Per-cpu group stats */
44struct tg_stats_cpu {
45 /* total bytes transferred */
46 struct blkg_rwstat service_bytes;
47 /* total IOs serviced, post merge */
48 struct blkg_rwstat serviced;
49};
50
41struct throtl_grp { 51struct throtl_grp {
42 /* List of throtl groups on the request queue*/ 52 /* must be the first member */
43 struct hlist_node tg_node; 53 struct blkg_policy_data pd;
44 54
45 /* active throtl group service_tree member */ 55 /* active throtl group service_tree member */
46 struct rb_node rb_node; 56 struct rb_node rb_node;
@@ -52,8 +62,6 @@ struct throtl_grp {
52 */ 62 */
53 unsigned long disptime; 63 unsigned long disptime;
54 64
55 struct blkio_group blkg;
56 atomic_t ref;
57 unsigned int flags; 65 unsigned int flags;
58 66
59 /* Two lists for READ and WRITE */ 67 /* Two lists for READ and WRITE */
@@ -80,18 +88,18 @@ struct throtl_grp {
80 /* Some throttle limits got updated for the group */ 88 /* Some throttle limits got updated for the group */
81 int limits_changed; 89 int limits_changed;
82 90
83 struct rcu_head rcu_head; 91 /* Per cpu stats pointer */
92 struct tg_stats_cpu __percpu *stats_cpu;
93
94 /* List of tgs waiting for per cpu stats memory to be allocated */
95 struct list_head stats_alloc_node;
84}; 96};
85 97
86struct throtl_data 98struct throtl_data
87{ 99{
88 /* List of throtl groups */
89 struct hlist_head tg_list;
90
91 /* service tree for active throtl groups */ 100 /* service tree for active throtl groups */
92 struct throtl_rb_root tg_service_tree; 101 struct throtl_rb_root tg_service_tree;
93 102
94 struct throtl_grp *root_tg;
95 struct request_queue *queue; 103 struct request_queue *queue;
96 104
97 /* Total Number of queued bios on READ and WRITE lists */ 105 /* Total Number of queued bios on READ and WRITE lists */
@@ -108,6 +116,33 @@ struct throtl_data
108 int limits_changed; 116 int limits_changed;
109}; 117};
110 118
119/* list and work item to allocate percpu group stats */
120static DEFINE_SPINLOCK(tg_stats_alloc_lock);
121static LIST_HEAD(tg_stats_alloc_list);
122
123static void tg_stats_alloc_fn(struct work_struct *);
124static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);
125
126static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
127{
128 return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
129}
130
131static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
132{
133 return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
134}
135
136static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
137{
138 return pd_to_blkg(&tg->pd);
139}
140
141static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
142{
143 return blkg_to_tg(td->queue->root_blkg);
144}
145
111enum tg_state_flags { 146enum tg_state_flags {
112 THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */ 147 THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */
113}; 148};
@@ -128,244 +163,150 @@ static inline int throtl_tg_##name(const struct throtl_grp *tg) \
128 163
129THROTL_TG_FNS(on_rr); 164THROTL_TG_FNS(on_rr);
130 165
131#define throtl_log_tg(td, tg, fmt, args...) \ 166#define throtl_log_tg(td, tg, fmt, args...) do { \
132 blk_add_trace_msg((td)->queue, "throtl %s " fmt, \ 167 char __pbuf[128]; \
133 blkg_path(&(tg)->blkg), ##args); \ 168 \
169 blkg_path(tg_to_blkg(tg), __pbuf, sizeof(__pbuf)); \
170 blk_add_trace_msg((td)->queue, "throtl %s " fmt, __pbuf, ##args); \
171} while (0)
134 172
135#define throtl_log(td, fmt, args...) \ 173#define throtl_log(td, fmt, args...) \
136 blk_add_trace_msg((td)->queue, "throtl " fmt, ##args) 174 blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
137 175
138static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg)
139{
140 if (blkg)
141 return container_of(blkg, struct throtl_grp, blkg);
142
143 return NULL;
144}
145
146static inline unsigned int total_nr_queued(struct throtl_data *td) 176static inline unsigned int total_nr_queued(struct throtl_data *td)
147{ 177{
148 return td->nr_queued[0] + td->nr_queued[1]; 178 return td->nr_queued[0] + td->nr_queued[1];
149} 179}
150 180
151static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg) 181/*
152{ 182 * Worker for allocating per cpu stat for tgs. This is scheduled on the
153 atomic_inc(&tg->ref); 183 * system_nrt_wq once there are some groups on the alloc_list waiting for
154 return tg; 184 * allocation.
155} 185 */
156 186static void tg_stats_alloc_fn(struct work_struct *work)
157static void throtl_free_tg(struct rcu_head *head)
158{ 187{
159 struct throtl_grp *tg; 188 static struct tg_stats_cpu *stats_cpu; /* this fn is non-reentrant */
189 struct delayed_work *dwork = to_delayed_work(work);
190 bool empty = false;
191
192alloc_stats:
193 if (!stats_cpu) {
194 stats_cpu = alloc_percpu(struct tg_stats_cpu);
195 if (!stats_cpu) {
196 /* allocation failed, try again after some time */
197 queue_delayed_work(system_nrt_wq, dwork,
198 msecs_to_jiffies(10));
199 return;
200 }
201 }
160 202
161 tg = container_of(head, struct throtl_grp, rcu_head); 203 spin_lock_irq(&tg_stats_alloc_lock);
162 free_percpu(tg->blkg.stats_cpu);
163 kfree(tg);
164}
165 204
166static void throtl_put_tg(struct throtl_grp *tg) 205 if (!list_empty(&tg_stats_alloc_list)) {
167{ 206 struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list,
168 BUG_ON(atomic_read(&tg->ref) <= 0); 207 struct throtl_grp,
169 if (!atomic_dec_and_test(&tg->ref)) 208 stats_alloc_node);
170 return; 209 swap(tg->stats_cpu, stats_cpu);
210 list_del_init(&tg->stats_alloc_node);
211 }
171 212
172 /* 213 empty = list_empty(&tg_stats_alloc_list);
173 * A group is freed in rcu manner. But having an rcu lock does not 214 spin_unlock_irq(&tg_stats_alloc_lock);
174 * mean that one can access all the fields of blkg and assume these 215 if (!empty)
175 * are valid. For example, don't try to follow throtl_data and 216 goto alloc_stats;
176 * request queue links.
177 *
178 * Having a reference to blkg under an rcu allows acess to only
179 * values local to groups like group stats and group rate limits
180 */
181 call_rcu(&tg->rcu_head, throtl_free_tg);
182} 217}
183 218
184static void throtl_init_group(struct throtl_grp *tg) 219static void throtl_pd_init(struct blkcg_gq *blkg)
185{ 220{
186 INIT_HLIST_NODE(&tg->tg_node); 221 struct throtl_grp *tg = blkg_to_tg(blkg);
222 unsigned long flags;
223
187 RB_CLEAR_NODE(&tg->rb_node); 224 RB_CLEAR_NODE(&tg->rb_node);
188 bio_list_init(&tg->bio_lists[0]); 225 bio_list_init(&tg->bio_lists[0]);
189 bio_list_init(&tg->bio_lists[1]); 226 bio_list_init(&tg->bio_lists[1]);
190 tg->limits_changed = false; 227 tg->limits_changed = false;
191 228
192 /* Practically unlimited BW */ 229 tg->bps[READ] = -1;
193 tg->bps[0] = tg->bps[1] = -1; 230 tg->bps[WRITE] = -1;
194 tg->iops[0] = tg->iops[1] = -1; 231 tg->iops[READ] = -1;
232 tg->iops[WRITE] = -1;
195 233
196 /* 234 /*
197 * Take the initial reference that will be released on destroy 235 * Ugh... We need to perform per-cpu allocation for tg->stats_cpu
198 * This can be thought of a joint reference by cgroup and 236 * but percpu allocator can't be called from IO path. Queue tg on
199 * request queue which will be dropped by either request queue 237 * tg_stats_alloc_list and allocate from work item.
200 * exit or cgroup deletion path depending on who is exiting first.
201 */ 238 */
202 atomic_set(&tg->ref, 1); 239 spin_lock_irqsave(&tg_stats_alloc_lock, flags);
240 list_add(&tg->stats_alloc_node, &tg_stats_alloc_list);
241 queue_delayed_work(system_nrt_wq, &tg_stats_alloc_work, 0);
242 spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
203} 243}
204 244
205/* Should be called with rcu read lock held (needed for blkcg) */ 245static void throtl_pd_exit(struct blkcg_gq *blkg)
206static void
207throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg)
208{ 246{
209 hlist_add_head(&tg->tg_node, &td->tg_list); 247 struct throtl_grp *tg = blkg_to_tg(blkg);
210 td->nr_undestroyed_grps++; 248 unsigned long flags;
211}
212
213static void
214__throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
215{
216 struct backing_dev_info *bdi = &td->queue->backing_dev_info;
217 unsigned int major, minor;
218
219 if (!tg || tg->blkg.dev)
220 return;
221
222 /*
223 * Fill in device details for a group which might not have been
224 * filled at group creation time as queue was being instantiated
225 * and driver had not attached a device yet
226 */
227 if (bdi->dev && dev_name(bdi->dev)) {
228 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
229 tg->blkg.dev = MKDEV(major, minor);
230 }
231}
232
233/*
234 * Should be called with without queue lock held. Here queue lock will be
235 * taken rarely. It will be taken only once during life time of a group
236 * if need be
237 */
238static void
239throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
240{
241 if (!tg || tg->blkg.dev)
242 return;
243
244 spin_lock_irq(td->queue->queue_lock);
245 __throtl_tg_fill_dev_details(td, tg);
246 spin_unlock_irq(td->queue->queue_lock);
247}
248
249static void throtl_init_add_tg_lists(struct throtl_data *td,
250 struct throtl_grp *tg, struct blkio_cgroup *blkcg)
251{
252 __throtl_tg_fill_dev_details(td, tg);
253
254 /* Add group onto cgroup list */
255 blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td,
256 tg->blkg.dev, BLKIO_POLICY_THROTL);
257 249
258 tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev); 250 spin_lock_irqsave(&tg_stats_alloc_lock, flags);
259 tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev); 251 list_del_init(&tg->stats_alloc_node);
260 tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev); 252 spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
261 tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev);
262 253
263 throtl_add_group_to_td_list(td, tg); 254 free_percpu(tg->stats_cpu);
264} 255}
265 256
266/* Should be called without queue lock and outside of rcu period */ 257static void throtl_pd_reset_stats(struct blkcg_gq *blkg)
267static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td)
268{ 258{
269 struct throtl_grp *tg = NULL; 259 struct throtl_grp *tg = blkg_to_tg(blkg);
270 int ret; 260 int cpu;
271 261
272 tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node); 262 if (tg->stats_cpu == NULL)
273 if (!tg) 263 return;
274 return NULL;
275 264
276 ret = blkio_alloc_blkg_stats(&tg->blkg); 265 for_each_possible_cpu(cpu) {
266 struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
277 267
278 if (ret) { 268 blkg_rwstat_reset(&sc->service_bytes);
279 kfree(tg); 269 blkg_rwstat_reset(&sc->serviced);
280 return NULL;
281 } 270 }
282
283 throtl_init_group(tg);
284 return tg;
285} 271}
286 272
287static struct 273static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td,
288throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg) 274 struct blkcg *blkcg)
289{ 275{
290 struct throtl_grp *tg = NULL;
291 void *key = td;
292
293 /* 276 /*
294 * This is the common case when there are no blkio cgroups. 277 * This is the common case when there are no blkcgs. Avoid lookup
295 * Avoid lookup in this case 278 * in this case
296 */ 279 */
297 if (blkcg == &blkio_root_cgroup) 280 if (blkcg == &blkcg_root)
298 tg = td->root_tg; 281 return td_root_tg(td);
299 else
300 tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
301 282
302 __throtl_tg_fill_dev_details(td, tg); 283 return blkg_to_tg(blkg_lookup(blkcg, td->queue));
303 return tg;
304} 284}
305 285
306static struct throtl_grp * throtl_get_tg(struct throtl_data *td) 286static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
287 struct blkcg *blkcg)
307{ 288{
308 struct throtl_grp *tg = NULL, *__tg = NULL;
309 struct blkio_cgroup *blkcg;
310 struct request_queue *q = td->queue; 289 struct request_queue *q = td->queue;
311 290 struct throtl_grp *tg = NULL;
312 /* no throttling for dead queue */
313 if (unlikely(blk_queue_dead(q)))
314 return NULL;
315
316 rcu_read_lock();
317 blkcg = task_blkio_cgroup(current);
318 tg = throtl_find_tg(td, blkcg);
319 if (tg) {
320 rcu_read_unlock();
321 return tg;
322 }
323
324 /*
325 * Need to allocate a group. Allocation of group also needs allocation
326 * of per cpu stats which in-turn takes a mutex() and can block. Hence
327 * we need to drop rcu lock and queue_lock before we call alloc.
328 */
329 rcu_read_unlock();
330 spin_unlock_irq(q->queue_lock);
331
332 tg = throtl_alloc_tg(td);
333
334 /* Group allocated and queue is still alive. take the lock */
335 spin_lock_irq(q->queue_lock);
336
337 /* Make sure @q is still alive */
338 if (unlikely(blk_queue_dead(q))) {
339 kfree(tg);
340 return NULL;
341 }
342
343 /*
344 * Initialize the new group. After sleeping, read the blkcg again.
345 */
346 rcu_read_lock();
347 blkcg = task_blkio_cgroup(current);
348 291
349 /* 292 /*
350 * If some other thread already allocated the group while we were 293 * This is the common case when there are no blkcgs. Avoid lookup
351 * not holding queue lock, free up the group 294 * in this case
352 */ 295 */
353 __tg = throtl_find_tg(td, blkcg); 296 if (blkcg == &blkcg_root) {
354 297 tg = td_root_tg(td);
355 if (__tg) { 298 } else {
356 kfree(tg); 299 struct blkcg_gq *blkg;
357 rcu_read_unlock(); 300
358 return __tg; 301 blkg = blkg_lookup_create(blkcg, q);
359 } 302
360 303 /* if %NULL and @q is alive, fall back to root_tg */
361 /* Group allocation failed. Account the IO to root group */ 304 if (!IS_ERR(blkg))
362 if (!tg) { 305 tg = blkg_to_tg(blkg);
363 tg = td->root_tg; 306 else if (!blk_queue_dead(q))
364 return tg; 307 tg = td_root_tg(td);
365 } 308 }
366 309
367 throtl_init_add_tg_lists(td, tg, blkcg);
368 rcu_read_unlock();
369 return tg; 310 return tg;
370} 311}
371 312
@@ -734,16 +675,41 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
734 return 0; 675 return 0;
735} 676}
736 677
678static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes,
679 int rw)
680{
681 struct throtl_grp *tg = blkg_to_tg(blkg);
682 struct tg_stats_cpu *stats_cpu;
683 unsigned long flags;
684
685 /* If per cpu stats are not allocated yet, don't do any accounting. */
686 if (tg->stats_cpu == NULL)
687 return;
688
689 /*
690 * Disabling interrupts to provide mutual exclusion between two
691 * writes on same cpu. It probably is not needed for 64bit. Not
692 * optimizing that case yet.
693 */
694 local_irq_save(flags);
695
696 stats_cpu = this_cpu_ptr(tg->stats_cpu);
697
698 blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
699 blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);
700
701 local_irq_restore(flags);
702}
703
737static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) 704static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
738{ 705{
739 bool rw = bio_data_dir(bio); 706 bool rw = bio_data_dir(bio);
740 bool sync = rw_is_sync(bio->bi_rw);
741 707
742 /* Charge the bio to the group */ 708 /* Charge the bio to the group */
743 tg->bytes_disp[rw] += bio->bi_size; 709 tg->bytes_disp[rw] += bio->bi_size;
744 tg->io_disp[rw]++; 710 tg->io_disp[rw]++;
745 711
746 blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync); 712 throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw);
747} 713}
748 714
749static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, 715static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
@@ -753,7 +719,7 @@ static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
753 719
754 bio_list_add(&tg->bio_lists[rw], bio); 720 bio_list_add(&tg->bio_lists[rw], bio);
755 /* Take a bio reference on tg */ 721 /* Take a bio reference on tg */
756 throtl_ref_get_tg(tg); 722 blkg_get(tg_to_blkg(tg));
757 tg->nr_queued[rw]++; 723 tg->nr_queued[rw]++;
758 td->nr_queued[rw]++; 724 td->nr_queued[rw]++;
759 throtl_enqueue_tg(td, tg); 725 throtl_enqueue_tg(td, tg);
@@ -786,8 +752,8 @@ static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
786 752
787 bio = bio_list_pop(&tg->bio_lists[rw]); 753 bio = bio_list_pop(&tg->bio_lists[rw]);
788 tg->nr_queued[rw]--; 754 tg->nr_queued[rw]--;
789 /* Drop bio reference on tg */ 755 /* Drop bio reference on blkg */
790 throtl_put_tg(tg); 756 blkg_put(tg_to_blkg(tg));
791 757
792 BUG_ON(td->nr_queued[rw] <= 0); 758 BUG_ON(td->nr_queued[rw] <= 0);
793 td->nr_queued[rw]--; 759 td->nr_queued[rw]--;
@@ -865,8 +831,8 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
865 831
866static void throtl_process_limit_change(struct throtl_data *td) 832static void throtl_process_limit_change(struct throtl_data *td)
867{ 833{
868 struct throtl_grp *tg; 834 struct request_queue *q = td->queue;
869 struct hlist_node *pos, *n; 835 struct blkcg_gq *blkg, *n;
870 836
871 if (!td->limits_changed) 837 if (!td->limits_changed)
872 return; 838 return;
@@ -875,7 +841,9 @@ static void throtl_process_limit_change(struct throtl_data *td)
875 841
876 throtl_log(td, "limits changed"); 842 throtl_log(td, "limits changed");
877 843
878 hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { 844 list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
845 struct throtl_grp *tg = blkg_to_tg(blkg);
846
879 if (!tg->limits_changed) 847 if (!tg->limits_changed)
880 continue; 848 continue;
881 849
@@ -973,120 +941,159 @@ throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
973 } 941 }
974} 942}
975 943
976static void 944static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
977throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg) 945 struct blkg_policy_data *pd, int off)
978{ 946{
979 /* Something wrong if we are trying to remove same group twice */ 947 struct throtl_grp *tg = pd_to_tg(pd);
980 BUG_ON(hlist_unhashed(&tg->tg_node)); 948 struct blkg_rwstat rwstat = { }, tmp;
949 int i, cpu;
981 950
982 hlist_del_init(&tg->tg_node); 951 for_each_possible_cpu(cpu) {
952 struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
983 953
984 /* 954 tmp = blkg_rwstat_read((void *)sc + off);
985 * Put the reference taken at the time of creation so that when all 955 for (i = 0; i < BLKG_RWSTAT_NR; i++)
986 * queues are gone, group can be destroyed. 956 rwstat.cnt[i] += tmp.cnt[i];
987 */ 957 }
988 throtl_put_tg(tg); 958
989 td->nr_undestroyed_grps--; 959 return __blkg_prfill_rwstat(sf, pd, &rwstat);
990} 960}
991 961
992static void throtl_release_tgs(struct throtl_data *td) 962static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
963 struct seq_file *sf)
993{ 964{
994 struct hlist_node *pos, *n; 965 struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
995 struct throtl_grp *tg;
996 966
997 hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { 967 blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl,
998 /* 968 cft->private, true);
999 * If cgroup removal path got to blk_group first and removed 969 return 0;
1000 * it from cgroup list, then it will take care of destroying
1001 * cfqg also.
1002 */
1003 if (!blkiocg_del_blkio_group(&tg->blkg))
1004 throtl_destroy_tg(td, tg);
1005 }
1006} 970}
1007 971
1008/* 972static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
1009 * Blk cgroup controller notification saying that blkio_group object is being 973 int off)
1010 * delinked as associated cgroup object is going away. That also means that
1011 * no new IO will come in this group. So get rid of this group as soon as
1012 * any pending IO in the group is finished.
1013 *
1014 * This function is called under rcu_read_lock(). key is the rcu protected
1015 * pointer. That means "key" is a valid throtl_data pointer as long as we are
1016 * rcu read lock.
1017 *
1018 * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
1019 * it should not be NULL as even if queue was going away, cgroup deltion
1020 * path got to it first.
1021 */
1022void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg)
1023{ 974{
1024 unsigned long flags; 975 struct throtl_grp *tg = pd_to_tg(pd);
1025 struct throtl_data *td = key; 976 u64 v = *(u64 *)((void *)tg + off);
1026 977
1027 spin_lock_irqsave(td->queue->queue_lock, flags); 978 if (v == -1)
1028 throtl_destroy_tg(td, tg_of_blkg(blkg)); 979 return 0;
1029 spin_unlock_irqrestore(td->queue->queue_lock, flags); 980 return __blkg_prfill_u64(sf, pd, v);
1030} 981}
1031 982
1032static void throtl_update_blkio_group_common(struct throtl_data *td, 983static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
1033 struct throtl_grp *tg) 984 int off)
1034{ 985{
1035 xchg(&tg->limits_changed, true); 986 struct throtl_grp *tg = pd_to_tg(pd);
1036 xchg(&td->limits_changed, true); 987 unsigned int v = *(unsigned int *)((void *)tg + off);
1037 /* Schedule a work now to process the limit change */ 988
1038 throtl_schedule_delayed_work(td, 0); 989 if (v == -1)
990 return 0;
991 return __blkg_prfill_u64(sf, pd, v);
1039} 992}
1040 993
1041/* 994static int tg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft,
1042 * For all update functions, key should be a valid pointer because these 995 struct seq_file *sf)
1043 * update functions are called under blkcg_lock, that means, blkg is
1044 * valid and in turn key is valid. queue exit path can not race because
1045 * of blkcg_lock
1046 *
1047 * Can not take queue lock in update functions as queue lock under blkcg_lock
1048 * is not allowed. Under other paths we take blkcg_lock under queue_lock.
1049 */
1050static void throtl_update_blkio_group_read_bps(void *key,
1051 struct blkio_group *blkg, u64 read_bps)
1052{ 996{
1053 struct throtl_data *td = key; 997 blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_u64,
1054 struct throtl_grp *tg = tg_of_blkg(blkg); 998 &blkcg_policy_throtl, cft->private, false);
1055 999 return 0;
1056 tg->bps[READ] = read_bps;
1057 throtl_update_blkio_group_common(td, tg);
1058} 1000}
1059 1001
1060static void throtl_update_blkio_group_write_bps(void *key, 1002static int tg_print_conf_uint(struct cgroup *cgrp, struct cftype *cft,
1061 struct blkio_group *blkg, u64 write_bps) 1003 struct seq_file *sf)
1062{ 1004{
1063 struct throtl_data *td = key; 1005 blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_uint,
1064 struct throtl_grp *tg = tg_of_blkg(blkg); 1006 &blkcg_policy_throtl, cft->private, false);
1065 1007 return 0;
1066 tg->bps[WRITE] = write_bps;
1067 throtl_update_blkio_group_common(td, tg);
1068} 1008}
1069 1009
1070static void throtl_update_blkio_group_read_iops(void *key, 1010static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
1071 struct blkio_group *blkg, unsigned int read_iops) 1011 bool is_u64)
1072{ 1012{
1073 struct throtl_data *td = key; 1013 struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1074 struct throtl_grp *tg = tg_of_blkg(blkg); 1014 struct blkg_conf_ctx ctx;
1015 struct throtl_grp *tg;
1016 struct throtl_data *td;
1017 int ret;
1018
1019 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
1020 if (ret)
1021 return ret;
1022
1023 tg = blkg_to_tg(ctx.blkg);
1024 td = ctx.blkg->q->td;
1025
1026 if (!ctx.v)
1027 ctx.v = -1;
1028
1029 if (is_u64)
1030 *(u64 *)((void *)tg + cft->private) = ctx.v;
1031 else
1032 *(unsigned int *)((void *)tg + cft->private) = ctx.v;
1033
1034 /* XXX: we don't need the following deferred processing */
1035 xchg(&tg->limits_changed, true);
1036 xchg(&td->limits_changed, true);
1037 throtl_schedule_delayed_work(td, 0);
1075 1038
1076 tg->iops[READ] = read_iops; 1039 blkg_conf_finish(&ctx);
1077 throtl_update_blkio_group_common(td, tg); 1040 return 0;
1078} 1041}
1079 1042
1080static void throtl_update_blkio_group_write_iops(void *key, 1043static int tg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft,
1081 struct blkio_group *blkg, unsigned int write_iops) 1044 const char *buf)
1082{ 1045{
1083 struct throtl_data *td = key; 1046 return tg_set_conf(cgrp, cft, buf, true);
1084 struct throtl_grp *tg = tg_of_blkg(blkg); 1047}
1085 1048
1086 tg->iops[WRITE] = write_iops; 1049static int tg_set_conf_uint(struct cgroup *cgrp, struct cftype *cft,
1087 throtl_update_blkio_group_common(td, tg); 1050 const char *buf)
1051{
1052 return tg_set_conf(cgrp, cft, buf, false);
1088} 1053}
1089 1054
1055static struct cftype throtl_files[] = {
1056 {
1057 .name = "throttle.read_bps_device",
1058 .private = offsetof(struct throtl_grp, bps[READ]),
1059 .read_seq_string = tg_print_conf_u64,
1060 .write_string = tg_set_conf_u64,
1061 .max_write_len = 256,
1062 },
1063 {
1064 .name = "throttle.write_bps_device",
1065 .private = offsetof(struct throtl_grp, bps[WRITE]),
1066 .read_seq_string = tg_print_conf_u64,
1067 .write_string = tg_set_conf_u64,
1068 .max_write_len = 256,
1069 },
1070 {
1071 .name = "throttle.read_iops_device",
1072 .private = offsetof(struct throtl_grp, iops[READ]),
1073 .read_seq_string = tg_print_conf_uint,
1074 .write_string = tg_set_conf_uint,
1075 .max_write_len = 256,
1076 },
1077 {
1078 .name = "throttle.write_iops_device",
1079 .private = offsetof(struct throtl_grp, iops[WRITE]),
1080 .read_seq_string = tg_print_conf_uint,
1081 .write_string = tg_set_conf_uint,
1082 .max_write_len = 256,
1083 },
1084 {
1085 .name = "throttle.io_service_bytes",
1086 .private = offsetof(struct tg_stats_cpu, service_bytes),
1087 .read_seq_string = tg_print_cpu_rwstat,
1088 },
1089 {
1090 .name = "throttle.io_serviced",
1091 .private = offsetof(struct tg_stats_cpu, serviced),
1092 .read_seq_string = tg_print_cpu_rwstat,
1093 },
1094 { } /* terminate */
1095};
1096
1090static void throtl_shutdown_wq(struct request_queue *q) 1097static void throtl_shutdown_wq(struct request_queue *q)
1091{ 1098{
1092 struct throtl_data *td = q->td; 1099 struct throtl_data *td = q->td;
@@ -1094,19 +1101,13 @@ static void throtl_shutdown_wq(struct request_queue *q)
1094 cancel_delayed_work_sync(&td->throtl_work); 1101 cancel_delayed_work_sync(&td->throtl_work);
1095} 1102}
1096 1103
1097static struct blkio_policy_type blkio_policy_throtl = { 1104static struct blkcg_policy blkcg_policy_throtl = {
1098 .ops = { 1105 .pd_size = sizeof(struct throtl_grp),
1099 .blkio_unlink_group_fn = throtl_unlink_blkio_group, 1106 .cftypes = throtl_files,
1100 .blkio_update_group_read_bps_fn = 1107
1101 throtl_update_blkio_group_read_bps, 1108 .pd_init_fn = throtl_pd_init,
1102 .blkio_update_group_write_bps_fn = 1109 .pd_exit_fn = throtl_pd_exit,
1103 throtl_update_blkio_group_write_bps, 1110 .pd_reset_stats_fn = throtl_pd_reset_stats,
1104 .blkio_update_group_read_iops_fn =
1105 throtl_update_blkio_group_read_iops,
1106 .blkio_update_group_write_iops_fn =
1107 throtl_update_blkio_group_write_iops,
1108 },
1109 .plid = BLKIO_POLICY_THROTL,
1110}; 1111};
1111 1112
1112bool blk_throtl_bio(struct request_queue *q, struct bio *bio) 1113bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
@@ -1114,7 +1115,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
1114 struct throtl_data *td = q->td; 1115 struct throtl_data *td = q->td;
1115 struct throtl_grp *tg; 1116 struct throtl_grp *tg;
1116 bool rw = bio_data_dir(bio), update_disptime = true; 1117 bool rw = bio_data_dir(bio), update_disptime = true;
1117 struct blkio_cgroup *blkcg; 1118 struct blkcg *blkcg;
1118 bool throttled = false; 1119 bool throttled = false;
1119 1120
1120 if (bio->bi_rw & REQ_THROTTLED) { 1121 if (bio->bi_rw & REQ_THROTTLED) {
@@ -1122,33 +1123,31 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
1122 goto out; 1123 goto out;
1123 } 1124 }
1124 1125
1126 /* bio_associate_current() needs ioc, try creating */
1127 create_io_context(GFP_ATOMIC, q->node);
1128
1125 /* 1129 /*
1126 * A throtl_grp pointer retrieved under rcu can be used to access 1130 * A throtl_grp pointer retrieved under rcu can be used to access
1127 * basic fields like stats and io rates. If a group has no rules, 1131 * basic fields like stats and io rates. If a group has no rules,
1128 * just update the dispatch stats in lockless manner and return. 1132 * just update the dispatch stats in lockless manner and return.
1129 */ 1133 */
1130
1131 rcu_read_lock(); 1134 rcu_read_lock();
1132 blkcg = task_blkio_cgroup(current); 1135 blkcg = bio_blkcg(bio);
1133 tg = throtl_find_tg(td, blkcg); 1136 tg = throtl_lookup_tg(td, blkcg);
1134 if (tg) { 1137 if (tg) {
1135 throtl_tg_fill_dev_details(td, tg);
1136
1137 if (tg_no_rule_group(tg, rw)) { 1138 if (tg_no_rule_group(tg, rw)) {
1138 blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, 1139 throtl_update_dispatch_stats(tg_to_blkg(tg),
1139 rw, rw_is_sync(bio->bi_rw)); 1140 bio->bi_size, bio->bi_rw);
1140 rcu_read_unlock(); 1141 goto out_unlock_rcu;
1141 goto out;
1142 } 1142 }
1143 } 1143 }
1144 rcu_read_unlock();
1145 1144
1146 /* 1145 /*
1147 * Either group has not been allocated yet or it is not an unlimited 1146 * Either group has not been allocated yet or it is not an unlimited
1148 * IO group 1147 * IO group
1149 */ 1148 */
1150 spin_lock_irq(q->queue_lock); 1149 spin_lock_irq(q->queue_lock);
1151 tg = throtl_get_tg(td); 1150 tg = throtl_lookup_create_tg(td, blkcg);
1152 if (unlikely(!tg)) 1151 if (unlikely(!tg))
1153 goto out_unlock; 1152 goto out_unlock;
1154 1153
@@ -1189,6 +1188,7 @@ queue_bio:
1189 tg->io_disp[rw], tg->iops[rw], 1188 tg->io_disp[rw], tg->iops[rw],
1190 tg->nr_queued[READ], tg->nr_queued[WRITE]); 1189 tg->nr_queued[READ], tg->nr_queued[WRITE]);
1191 1190
1191 bio_associate_current(bio);
1192 throtl_add_bio_tg(q->td, tg, bio); 1192 throtl_add_bio_tg(q->td, tg, bio);
1193 throttled = true; 1193 throttled = true;
1194 1194
@@ -1199,6 +1199,8 @@ queue_bio:
1199 1199
1200out_unlock: 1200out_unlock:
1201 spin_unlock_irq(q->queue_lock); 1201 spin_unlock_irq(q->queue_lock);
1202out_unlock_rcu:
1203 rcu_read_unlock();
1202out: 1204out:
1203 return throttled; 1205 return throttled;
1204} 1206}
@@ -1241,79 +1243,31 @@ void blk_throtl_drain(struct request_queue *q)
1241int blk_throtl_init(struct request_queue *q) 1243int blk_throtl_init(struct request_queue *q)
1242{ 1244{
1243 struct throtl_data *td; 1245 struct throtl_data *td;
1244 struct throtl_grp *tg; 1246 int ret;
1245 1247
1246 td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); 1248 td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
1247 if (!td) 1249 if (!td)
1248 return -ENOMEM; 1250 return -ENOMEM;
1249 1251
1250 INIT_HLIST_HEAD(&td->tg_list);
1251 td->tg_service_tree = THROTL_RB_ROOT; 1252 td->tg_service_tree = THROTL_RB_ROOT;
1252 td->limits_changed = false; 1253 td->limits_changed = false;
1253 INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); 1254 INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
1254 1255
1255 /* alloc and Init root group. */ 1256 q->td = td;
1256 td->queue = q; 1257 td->queue = q;
1257 tg = throtl_alloc_tg(td);
1258 1258
1259 if (!tg) { 1259 /* activate policy */
1260 ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
1261 if (ret)
1260 kfree(td); 1262 kfree(td);
1261 return -ENOMEM; 1263 return ret;
1262 }
1263
1264 td->root_tg = tg;
1265
1266 rcu_read_lock();
1267 throtl_init_add_tg_lists(td, tg, &blkio_root_cgroup);
1268 rcu_read_unlock();
1269
1270 /* Attach throtl data to request queue */
1271 q->td = td;
1272 return 0;
1273} 1264}
1274 1265
1275void blk_throtl_exit(struct request_queue *q) 1266void blk_throtl_exit(struct request_queue *q)
1276{ 1267{
1277 struct throtl_data *td = q->td; 1268 BUG_ON(!q->td);
1278 bool wait = false;
1279
1280 BUG_ON(!td);
1281
1282 throtl_shutdown_wq(q);
1283
1284 spin_lock_irq(q->queue_lock);
1285 throtl_release_tgs(td);
1286
1287 /* If there are other groups */
1288 if (td->nr_undestroyed_grps > 0)
1289 wait = true;
1290
1291 spin_unlock_irq(q->queue_lock);
1292
1293 /*
1294 * Wait for tg->blkg->key accessors to exit their grace periods.
1295 * Do this wait only if there are other undestroyed groups out
1296 * there (other than root group). This can happen if cgroup deletion
1297 * path claimed the responsibility of cleaning up a group before
1298 * queue cleanup code get to the group.
1299 *
1300 * Do not call synchronize_rcu() unconditionally as there are drivers
1301 * which create/delete request queue hundreds of times during scan/boot
1302 * and synchronize_rcu() can take significant time and slow down boot.
1303 */
1304 if (wait)
1305 synchronize_rcu();
1306
1307 /*
1308 * Just being safe to make sure after previous flush if some body did
1309 * update limits through cgroup and another work got queued, cancel
1310 * it.
1311 */
1312 throtl_shutdown_wq(q); 1269 throtl_shutdown_wq(q);
1313} 1270 blkcg_deactivate_policy(q, &blkcg_policy_throtl);
1314
1315void blk_throtl_release(struct request_queue *q)
1316{
1317 kfree(q->td); 1271 kfree(q->td);
1318} 1272}
1319 1273
@@ -1323,8 +1277,7 @@ static int __init throtl_init(void)
1323 if (!kthrotld_workqueue) 1277 if (!kthrotld_workqueue)
1324 panic("Failed to create kthrotld\n"); 1278 panic("Failed to create kthrotld\n");
1325 1279
1326 blkio_policy_register(&blkio_policy_throtl); 1280 return blkcg_policy_register(&blkcg_policy_throtl);
1327 return 0;
1328} 1281}
1329 1282
1330module_init(throtl_init); 1283module_init(throtl_init);
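For orientation, the registration side of the conversion shown above: blk-throttle now describes itself to the blkcg core through a struct blkcg_policy (per-group pd_size, the throtl_files cftypes, and the pd_init_fn/pd_exit_fn/pd_reset_stats_fn hooks), throtl_init() simply returns blkcg_policy_register(&blkcg_policy_throtl), and blk_throtl_init() activates the policy per queue via blkcg_activate_policy(). Below is a rough, self-contained user-space sketch of that callback-table idea under the assumption of a fixed-size registry; the names are illustrative and are not the kernel API.

#include <stdio.h>
#include <stdlib.h>

/* Minimal user-space analogue of a blkcg_policy-style descriptor. */
struct policy {
    const char *name;
    size_t pd_size;                   /* size of per-group private data    */
    void (*pd_init_fn)(void *pd);     /* called when a group is created    */
    void (*pd_exit_fn)(void *pd);     /* called when a group is torn down  */
};

#define MAX_POLICIES 4
static const struct policy *registered[MAX_POLICIES];
static int nr_registered;

static int policy_register(const struct policy *pol)
{
    if (nr_registered >= MAX_POLICIES)
        return -1;
    registered[nr_registered++] = pol;
    return 0;
}

/* The core sizes and allocates the group, then runs the policy's init hook. */
static void *group_create(const struct policy *pol)
{
    void *pd = calloc(1, pol->pd_size);

    if (pd && pol->pd_init_fn)
        pol->pd_init_fn(pd);
    return pd;
}

/* A toy "throttle" policy plugged into the table above. */
struct demo_grp { unsigned long bps; };

static void demo_init(void *pd) { ((struct demo_grp *)pd)->bps = (unsigned long)-1; }
static void demo_exit(void *pd) { (void)pd; }

static const struct policy demo_policy = {
    .name       = "throttle-demo",
    .pd_size    = sizeof(struct demo_grp),
    .pd_init_fn = demo_init,
    .pd_exit_fn = demo_exit,
};

int main(void)
{
    struct demo_grp *g;

    policy_register(&demo_policy);
    g = group_create(registered[0]);
    if (!g)
        return 1;
    printf("%s: default bps = %lu (unlimited)\n", registered[0]->name, g->bps);

    registered[0]->pd_exit_fn(g);
    free(g);
    return 0;
}

The design choice the series makes is that the core owns the allocation and lifetime of each group and merely sizes it by pd_size, so individual policies no longer keep their own group lists or reference counts — which is exactly why tg_list, tg->ref and throtl_put_tg() disappear in the diff above.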