Diffstat (limited to 'block/blk-throttle.c')

 block/blk-throttle.c | 313
 1 file changed, 237 insertions(+), 76 deletions(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 252a81a306f7..a62be8d0dc1b 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -78,6 +78,8 @@ struct throtl_grp {
 
         /* Some throttle limits got updated for the group */
         int limits_changed;
+
+        struct rcu_head rcu_head;
 };
 
 struct throtl_data
@@ -88,7 +90,7 @@ struct throtl_data
         /* service tree for active throtl groups */
         struct throtl_rb_root tg_service_tree;
 
-        struct throtl_grp root_tg;
+        struct throtl_grp *root_tg;
         struct request_queue *queue;
 
         /* Total Number of queued bios on READ and WRITE lists */
@@ -151,56 +153,44 @@ static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg)
         return tg;
 }
 
-static void throtl_put_tg(struct throtl_grp *tg)
+static void throtl_free_tg(struct rcu_head *head)
 {
-        BUG_ON(atomic_read(&tg->ref) <= 0);
-        if (!atomic_dec_and_test(&tg->ref))
-                return;
+        struct throtl_grp *tg;
+
+        tg = container_of(head, struct throtl_grp, rcu_head);
+        free_percpu(tg->blkg.stats_cpu);
         kfree(tg);
 }
 
-static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td,
-                        struct blkio_cgroup *blkcg)
+static void throtl_put_tg(struct throtl_grp *tg)
 {
-        struct throtl_grp *tg = NULL;
-        void *key = td;
-        struct backing_dev_info *bdi = &td->queue->backing_dev_info;
-        unsigned int major, minor;
+        BUG_ON(atomic_read(&tg->ref) <= 0);
+        if (!atomic_dec_and_test(&tg->ref))
+                return;
 
         /*
-         * TODO: Speed up blkiocg_lookup_group() by maintaining a radix
-         * tree of blkg (instead of traversing through hash list all
-         * the time.
+         * A group is freed in rcu manner. But having an rcu lock does not
+         * mean that one can access all the fields of blkg and assume these
+         * are valid. For example, don't try to follow throtl_data and
+         * request queue links.
+         *
+         * Having a reference to blkg under an rcu allows acess to only
+         * values local to groups like group stats and group rate limits
         */
+        call_rcu(&tg->rcu_head, throtl_free_tg);
+}
 
-        /*
-         * This is the common case when there are no blkio cgroups.
-         * Avoid lookup in this case
-         */
-        if (blkcg == &blkio_root_cgroup)
-                tg = &td->root_tg;
-        else
-                tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
-
-        /* Fill in device details for root group */
-        if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
-                sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
-                tg->blkg.dev = MKDEV(major, minor);
-                goto done;
-        }
-
-        if (tg)
-                goto done;
-
-        tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
-        if (!tg)
-                goto done;
-
+static void throtl_init_group(struct throtl_grp *tg)
+{
         INIT_HLIST_NODE(&tg->tg_node);
         RB_CLEAR_NODE(&tg->rb_node);
         bio_list_init(&tg->bio_lists[0]);
         bio_list_init(&tg->bio_lists[1]);
-        td->limits_changed = false;
+        tg->limits_changed = false;
+
+        /* Practically unlimited BW */
+        tg->bps[0] = tg->bps[1] = -1;
+        tg->iops[0] = tg->iops[1] = -1;
 
         /*
          * Take the initial reference that will be released on destroy
@@ -209,33 +199,181 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td,
          * exit or cgroup deletion path depending on who is exiting first.
          */
         atomic_set(&tg->ref, 1);
+}
+
+/* Should be called with rcu read lock held (needed for blkcg) */
+static void
+throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg)
+{
+        hlist_add_head(&tg->tg_node, &td->tg_list);
+        td->nr_undestroyed_grps++;
+}
+
+static void
+__throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
+{
+        struct backing_dev_info *bdi = &td->queue->backing_dev_info;
+        unsigned int major, minor;
+
+        if (!tg || tg->blkg.dev)
+                return;
+
+        /*
+         * Fill in device details for a group which might not have been
+         * filled at group creation time as queue was being instantiated
+         * and driver had not attached a device yet
+         */
+        if (bdi->dev && dev_name(bdi->dev)) {
+                sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+                tg->blkg.dev = MKDEV(major, minor);
+        }
+}
+
+/*
+ * Should be called with without queue lock held. Here queue lock will be
+ * taken rarely. It will be taken only once during life time of a group
+ * if need be
+ */
+static void
+throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
+{
+        if (!tg || tg->blkg.dev)
+                return;
+
+        spin_lock_irq(td->queue->queue_lock);
+        __throtl_tg_fill_dev_details(td, tg);
+        spin_unlock_irq(td->queue->queue_lock);
+}
+
+static void throtl_init_add_tg_lists(struct throtl_data *td,
+                        struct throtl_grp *tg, struct blkio_cgroup *blkcg)
+{
+        __throtl_tg_fill_dev_details(td, tg);
 
         /* Add group onto cgroup list */
-        sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
         blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td,
-                                MKDEV(major, minor), BLKIO_POLICY_THROTL);
+                                tg->blkg.dev, BLKIO_POLICY_THROTL);
 
         tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
         tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev);
         tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev);
         tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev);
 
-        hlist_add_head(&tg->tg_node, &td->tg_list);
-        td->nr_undestroyed_grps++;
-done:
+        throtl_add_group_to_td_list(td, tg);
+}
+
+/* Should be called without queue lock and outside of rcu period */
+static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td)
+{
+        struct throtl_grp *tg = NULL;
+        int ret;
+
+        tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
+        if (!tg)
+                return NULL;
+
+        ret = blkio_alloc_blkg_stats(&tg->blkg);
+
+        if (ret) {
+                kfree(tg);
+                return NULL;
+        }
+
+        throtl_init_group(tg);
         return tg;
 }
 
-static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
+static struct
+throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg)
 {
         struct throtl_grp *tg = NULL;
+        void *key = td;
+
+        /*
+         * This is the common case when there are no blkio cgroups.
+         * Avoid lookup in this case
+         */
+        if (blkcg == &blkio_root_cgroup)
+                tg = td->root_tg;
+        else
+                tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
+
+        __throtl_tg_fill_dev_details(td, tg);
+        return tg;
+}
+
+/*
+ * This function returns with queue lock unlocked in case of error, like
+ * request queue is no more
+ */
+static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
+{
+        struct throtl_grp *tg = NULL, *__tg = NULL;
         struct blkio_cgroup *blkcg;
+        struct request_queue *q = td->queue;
 
         rcu_read_lock();
         blkcg = task_blkio_cgroup(current);
-        tg = throtl_find_alloc_tg(td, blkcg);
-        if (!tg)
-                tg = &td->root_tg;
+        tg = throtl_find_tg(td, blkcg);
+        if (tg) {
+                rcu_read_unlock();
+                return tg;
+        }
+
+        /*
+         * Need to allocate a group. Allocation of group also needs allocation
+         * of per cpu stats which in-turn takes a mutex() and can block. Hence
+         * we need to drop rcu lock and queue_lock before we call alloc
+         *
+         * Take the request queue reference to make sure queue does not
+         * go away once we return from allocation.
+         */
+        blk_get_queue(q);
+        rcu_read_unlock();
+        spin_unlock_irq(q->queue_lock);
+
+        tg = throtl_alloc_tg(td);
+        /*
+         * We might have slept in group allocation. Make sure queue is not
+         * dead
+         */
+        if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
+                blk_put_queue(q);
+                if (tg)
+                        kfree(tg);
+
+                return ERR_PTR(-ENODEV);
+        }
+        blk_put_queue(q);
+
+        /* Group allocated and queue is still alive. take the lock */
+        spin_lock_irq(q->queue_lock);
+
+        /*
+         * Initialize the new group. After sleeping, read the blkcg again.
+         */
+        rcu_read_lock();
+        blkcg = task_blkio_cgroup(current);
+
+        /*
+         * If some other thread already allocated the group while we were
+         * not holding queue lock, free up the group
+         */
+        __tg = throtl_find_tg(td, blkcg);
+
+        if (__tg) {
+                kfree(tg);
+                rcu_read_unlock();
+                return __tg;
+        }
+
+        /* Group allocation failed. Account the IO to root group */
+        if (!tg) {
+                tg = td->root_tg;
+                return tg;
+        }
+
+        throtl_init_add_tg_lists(td, tg, blkcg);
         rcu_read_unlock();
         return tg;
 }
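
The new throtl_get_tg() above follows a pattern worth calling out: because allocating the per-cpu stats can sleep, it drops both the rcu read lock and the queue lock, allocates, retakes the queue lock, and then rechecks whether another task installed the group in the meantime, discarding its own copy if it lost the race. The userspace sketch below is illustrative only and not part of the patch; the names get_group and cached are invented, and a pthread mutex stands in for the queue lock.

/*
 * Illustrative userspace analogue of the "drop lock, allocate, relock,
 * recheck" pattern used by throtl_get_tg().  Build with -lpthread.
 */
#include <pthread.h>
#include <stdlib.h>

struct group { int id; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct group *cached;          /* shared pointer, protected by 'lock' */

struct group *get_group(int id)
{
        struct group *g, *tmp;

        pthread_mutex_lock(&lock);
        if (cached) {                 /* fast path: group already exists */
                g = cached;
                pthread_mutex_unlock(&lock);
                return g;
        }
        pthread_mutex_unlock(&lock);  /* allocation may block: drop the lock */

        tmp = malloc(sizeof(*tmp));
        if (!tmp)
                return NULL;
        tmp->id = id;

        pthread_mutex_lock(&lock);
        if (cached) {                 /* lost the race: free our copy */
                free(tmp);
                g = cached;
        } else {
                cached = tmp;         /* won the race: install the group */
                g = tmp;
        }
        pthread_mutex_unlock(&lock);
        return g;
}

The freeing side in the patch is the mirror image of this: throtl_put_tg() defers the actual kfree() through call_rcu(), so a reader that picked up the group pointer under rcu_read_lock() never sees freed memory.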
@@ -544,6 +682,12 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
         return 0;
 }
 
+static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) {
+        if (tg->bps[rw] == -1 && tg->iops[rw] == -1)
+                return 1;
+        return 0;
+}
+
 /*
  * Returns whether one can dispatch a bio or not. Also returns approx number
  * of jiffies to wait before this bio is with-in IO rate and can be dispatched
@@ -608,10 +752,6 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
         tg->bytes_disp[rw] += bio->bi_size;
         tg->io_disp[rw]++;
 
-        /*
-         * TODO: This will take blkg->stats_lock. Figure out a way
-         * to avoid this cost.
-         */
         blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync);
 }
 
@@ -989,15 +1129,51 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop)
         struct throtl_grp *tg;
         struct bio *bio = *biop;
         bool rw = bio_data_dir(bio), update_disptime = true;
+        struct blkio_cgroup *blkcg;
 
         if (bio->bi_rw & REQ_THROTTLED) {
                 bio->bi_rw &= ~REQ_THROTTLED;
                 return 0;
         }
 
+        /*
+         * A throtl_grp pointer retrieved under rcu can be used to access
+         * basic fields like stats and io rates. If a group has no rules,
+         * just update the dispatch stats in lockless manner and return.
+         */
+
+        rcu_read_lock();
+        blkcg = task_blkio_cgroup(current);
+        tg = throtl_find_tg(td, blkcg);
+        if (tg) {
+                throtl_tg_fill_dev_details(td, tg);
+
+                if (tg_no_rule_group(tg, rw)) {
+                        blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size,
+                                rw, bio->bi_rw & REQ_SYNC);
+                        rcu_read_unlock();
+                        return 0;
+                }
+        }
+        rcu_read_unlock();
+
+        /*
+         * Either group has not been allocated yet or it is not an unlimited
+         * IO group
+         */
+
         spin_lock_irq(q->queue_lock);
         tg = throtl_get_tg(td);
 
+        if (IS_ERR(tg)) {
+                if (PTR_ERR(tg) == -ENODEV) {
+                        /*
+                         * Queue is gone. No queue lock held here.
+                         */
+                        return -ENODEV;
+                }
+        }
+
         if (tg->nr_queued[rw]) {
                 /*
                  * There is already another bio queued in same dir. No
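
The blk_throtl_bio() hunk above adds a lockless fast path: if the group found under RCU has neither a bps nor an iops rule for this direction, the bio is only accounted in the dispatch stats and the queue lock is never taken. The sketch below conveys the same idea using C11 atomics in place of the kernel's per-cpu counters; it is not from the patch, and names such as tg_no_rule and account_fast_path are invented for illustration.

/* Minimal model of the "no rules -> lockless accounting" fast path. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

struct tg_stats {
        atomic_ullong bytes[2];       /* indexed by direction: READ=0, WRITE=1 */
        atomic_ullong ios[2];
};

struct tg_limits {
        uint64_t bps[2];              /* (uint64_t)-1 means "unlimited" */
        unsigned int iops[2];         /* (unsigned int)-1 means "unlimited" */
};

static bool tg_no_rule(const struct tg_limits *l, int rw)
{
        return l->bps[rw] == (uint64_t)-1 && l->iops[rw] == (unsigned int)-1;
}

/* Returns true if the bio was fully handled on the lockless path. */
static bool account_fast_path(struct tg_stats *s, const struct tg_limits *l,
                              int rw, unsigned int bytes)
{
        if (!tg_no_rule(l, rw))
                return false;         /* limits exist: take the slow, locked path */

        atomic_fetch_add(&s->bytes[rw], bytes);
        atomic_fetch_add(&s->ios[rw], 1);
        return true;
}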
@@ -1060,39 +1236,24 @@ int blk_throtl_init(struct request_queue *q)
         INIT_HLIST_HEAD(&td->tg_list);
         td->tg_service_tree = THROTL_RB_ROOT;
         td->limits_changed = false;
+        INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
 
-        /* Init root group */
-        tg = &td->root_tg;
-        INIT_HLIST_NODE(&tg->tg_node);
-        RB_CLEAR_NODE(&tg->rb_node);
-        bio_list_init(&tg->bio_lists[0]);
-        bio_list_init(&tg->bio_lists[1]);
-
-        /* Practically unlimited BW */
-        tg->bps[0] = tg->bps[1] = -1;
-        tg->iops[0] = tg->iops[1] = -1;
-        td->limits_changed = false;
+        /* alloc and Init root group. */
+        td->queue = q;
+        tg = throtl_alloc_tg(td);
 
-        /*
-         * Set root group reference to 2. One reference will be dropped when
-         * all groups on tg_list are being deleted during queue exit. Other
-         * reference will remain there as we don't want to delete this group
-         * as it is statically allocated and gets destroyed when throtl_data
-         * goes away.
-         */
-        atomic_set(&tg->ref, 2);
-        hlist_add_head(&tg->tg_node, &td->tg_list);
-        td->nr_undestroyed_grps++;
+        if (!tg) {
+                kfree(td);
+                return -ENOMEM;
+        }
 
-        INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
+        td->root_tg = tg;
 
         rcu_read_lock();
-        blkiocg_add_blkio_group(&blkio_root_cgroup, &tg->blkg, (void *)td,
-                                        0, BLKIO_POLICY_THROTL);
+        throtl_init_add_tg_lists(td, tg, &blkio_root_cgroup);
         rcu_read_unlock();
 
         /* Attach throtl data to request queue */
-        td->queue = q;
         q->td = td;
         return 0;
 }
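
With the root group now allocated dynamically, the ordering in blk_throtl_init() matters: td->queue must be set before throtl_alloc_tg() (which dereferences it), and a failed group allocation has to unwind the already-allocated throtl_data. The self-contained sketch below shows that two-stage init with cleanup as a userspace analogue; throttle_init, throttle_data, and root_group are invented names, not kernel APIs.

/* Two-stage initialization with unwind on partial failure. */
#include <errno.h>
#include <stdlib.h>

struct root_group { long bps[2]; };

struct throttle_data {
        void *queue;                    /* owner pointer, set before group alloc */
        struct root_group *root;
};

int throttle_init(void *queue, struct throttle_data **out)
{
        struct throttle_data *td;

        td = calloc(1, sizeof(*td));
        if (!td)
                return -ENOMEM;

        td->queue = queue;              /* group allocation may need the owner */
        td->root = calloc(1, sizeof(*td->root));
        if (!td->root) {
                free(td);               /* unwind the partially built object */
                return -ENOMEM;
        }

        /* "unlimited" defaults, mirroring tg->bps[0] = tg->bps[1] = -1 */
        td->root->bps[0] = td->root->bps[1] = -1;

        *out = td;
        return 0;
}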