author     Vivek Goyal <vgoyal@redhat.com>        2011-05-19 15:38:23 -0400
committer  Jens Axboe <jaxboe@fusionio.com>       2011-05-20 14:34:52 -0400
commit     f469a7b4d5b1d1d053200a9015fd25d59c057f49
tree       2c68c0689e40955b186e350b15d44d0b260f4655 /block
parent     56edf7d75db5b14d628b46623c414ffbeed68d7f
blk-cgroup: Allow sleeping while dynamically allocating a group
Currently, all cfq_group and throtl_group allocations happen while we are holding ->queue_lock, where sleeping is not allowed.

Soon we will move to per-cpu stats and will also need to allocate the per-group stats. Since alloc_percpu() can sleep, it cannot be called from atomic context, so we need to drop ->queue_lock, allocate the group, retake the lock and continue processing.

In the throttling code, the queue DEAD flag is checked again after reacquiring the lock to make sure the driver did not call blk_cleanup_queue() in the meantime.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
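The locking dance described above (look up under the lock, drop the lock for the sleeping allocation, retake the lock, then re-check both for a racing allocator and for teardown) is easy to get subtly wrong, so here is a minimal userspace sketch of the same idea. It uses pthreads and invented names (struct cache, cache_get); none of this appears in the kernel sources, and allocation failure is simply reported rather than falling back to a root group as the patch does.

/*
 * Userspace sketch of the "drop lock, allocate, retake lock, re-check"
 * pattern. All names are illustrative and hypothetical; build with:
 *   cc -pthread sketch.c
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct cache {
        pthread_mutex_t lock;   /* stand-in for ->queue_lock */
        int dead;               /* stand-in for QUEUE_FLAG_DEAD */
        void *group;            /* lazily allocated per-group object */
};

/* Return the group, or NULL if the owner was torn down or alloc failed. */
static void *cache_get(struct cache *c)
{
        void *g, *newg;

        pthread_mutex_lock(&c->lock);
        g = c->group;
        pthread_mutex_unlock(&c->lock);
        if (g)
                return g;       /* fast path: already allocated */

        /* The allocation may sleep, so do it with no lock held. */
        newg = malloc(128);
        if (!newg)
                return NULL;    /* the patch falls back to the root group here */

        pthread_mutex_lock(&c->lock);
        if (c->dead) {          /* re-check: owner was torn down meanwhile */
                pthread_mutex_unlock(&c->lock);
                free(newg);
                return NULL;
        }
        if (c->group)           /* re-check: another thread won the race */
                free(newg);
        else
                c->group = newg;        /* we won: publish our allocation */
        g = c->group;
        pthread_mutex_unlock(&c->lock);
        return g;
}

int main(void)
{
        struct cache c = { .lock = PTHREAD_MUTEX_INITIALIZER };

        printf("group at %p\n", cache_get(&c));
        free(c.group);
        return 0;
}

The diffs below apply exactly this shape to throtl_get_tg() and cfq_get_cfqg(); the throttling side additionally pins the queue with blk_get_queue()/blk_put_queue() around the allocation and returns ERR_PTR(-ENODEV) if the queue died, while the CFQ side assumes the queue stays alive.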
Diffstat (limited to 'block')
 -rw-r--r--  block/blk-core.c     |   3
 -rw-r--r--  block/blk-throttle.c | 141
 -rw-r--r--  block/cfq-iosched.c  | 128
 3 files changed, 205 insertions(+), 67 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index 3fe00a14822a..9e8e297374b9 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1550,7 +1550,8 @@ static inline void __generic_make_request(struct bio *bio)
                         goto end_io;
                 }
 
-                blk_throtl_bio(q, &bio);
+                if (blk_throtl_bio(q, &bio))
+                        goto end_io;
 
                 /*
                  * If bio = NULL, bio has been throttled and will be submitted
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index fa9a900c1254..c201967b33cd 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -188,8 +188,40 @@ throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg)
         td->nr_undestroyed_grps++;
 }
 
-static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td,
-                        struct blkio_cgroup *blkcg)
+static void throtl_init_add_tg_lists(struct throtl_data *td,
+                        struct throtl_grp *tg, struct blkio_cgroup *blkcg)
+{
+        struct backing_dev_info *bdi = &td->queue->backing_dev_info;
+        unsigned int major, minor;
+
+        /* Add group onto cgroup list */
+        sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+        blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td,
+                                MKDEV(major, minor), BLKIO_POLICY_THROTL);
+
+        tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
+        tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev);
+        tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev);
+        tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev);
+
+        throtl_add_group_to_td_list(td, tg);
+}
+
+/* Should be called without queue lock and outside of rcu period */
+static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td)
+{
+        struct throtl_grp *tg = NULL;
+
+        tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
+        if (!tg)
+                return NULL;
+
+        throtl_init_group(tg);
+        return tg;
+}
+
+static struct
+throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg)
 {
         struct throtl_grp *tg = NULL;
         void *key = td;
@@ -197,12 +229,6 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td,
         unsigned int major, minor;
 
         /*
-         * TODO: Speed up blkiocg_lookup_group() by maintaining a radix
-         * tree of blkg (instead of traversing through hash list all
-         * the time.
-         */
-
-        /*
          * This is the common case when there are no blkio cgroups.
          * Avoid lookup in this case
          */
@@ -215,43 +241,83 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td,
         if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
                 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
                 tg->blkg.dev = MKDEV(major, minor);
-                goto done;
         }
 
-        if (tg)
-                goto done;
-
-        tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
-        if (!tg)
-                goto done;
-
-        throtl_init_group(tg);
-
-        /* Add group onto cgroup list */
-        sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
-        blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td,
-                                MKDEV(major, minor), BLKIO_POLICY_THROTL);
-
-        tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
-        tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev);
-        tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev);
-        tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev);
-
-        throtl_add_group_to_td_list(td, tg);
-done:
         return tg;
 }
 
+/*
+ * This function returns with queue lock unlocked in case of error, like
+ * request queue is no more
+ */
 static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
 {
-        struct throtl_grp *tg = NULL;
+        struct throtl_grp *tg = NULL, *__tg = NULL;
         struct blkio_cgroup *blkcg;
+        struct request_queue *q = td->queue;
 
         rcu_read_lock();
         blkcg = task_blkio_cgroup(current);
-        tg = throtl_find_alloc_tg(td, blkcg);
-        if (!tg)
+        tg = throtl_find_tg(td, blkcg);
+        if (tg) {
+                rcu_read_unlock();
+                return tg;
+        }
+
+        /*
+         * Need to allocate a group. Allocation of group also needs allocation
+         * of per cpu stats which in-turn takes a mutex() and can block. Hence
+         * we need to drop rcu lock and queue_lock before we call alloc
+         *
+         * Take the request queue reference to make sure queue does not
+         * go away once we return from allocation.
+         */
+        blk_get_queue(q);
+        rcu_read_unlock();
+        spin_unlock_irq(q->queue_lock);
+
+        tg = throtl_alloc_tg(td);
+        /*
+         * We might have slept in group allocation. Make sure queue is not
+         * dead
+         */
+        if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
+                blk_put_queue(q);
+                if (tg)
+                        kfree(tg);
+
+                return ERR_PTR(-ENODEV);
+        }
+        blk_put_queue(q);
+
+        /* Group allocated and queue is still alive. take the lock */
+        spin_lock_irq(q->queue_lock);
+
+        /*
+         * Initialize the new group. After sleeping, read the blkcg again.
+         */
+        rcu_read_lock();
+        blkcg = task_blkio_cgroup(current);
+
+        /*
+         * If some other thread already allocated the group while we were
+         * not holding queue lock, free up the group
+         */
+        __tg = throtl_find_tg(td, blkcg);
+
+        if (__tg) {
+                kfree(tg);
+                rcu_read_unlock();
+                return __tg;
+        }
+
+        /* Group allocation failed. Account the IO to root group */
+        if (!tg) {
                 tg = &td->root_tg;
+                return tg;
+        }
+
+        throtl_init_add_tg_lists(td, tg, blkcg);
         rcu_read_unlock();
         return tg;
 }
@@ -1014,6 +1080,15 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop)
         spin_lock_irq(q->queue_lock);
         tg = throtl_get_tg(td);
 
+        if (IS_ERR(tg)) {
+                if (PTR_ERR(tg) == -ENODEV) {
+                        /*
+                         * Queue is gone. No queue lock held here.
+                         */
+                        return -ENODEV;
+                }
+        }
+
         if (tg->nr_queued[rw]) {
                 /*
                  * There is already another bio queued in same dir. No
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index e2e6719832e1..606020fe93f3 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1016,28 +1016,47 @@ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
         cfqg->needs_update = true;
 }
 
-static struct cfq_group * cfq_find_alloc_cfqg(struct cfq_data *cfqd,
-                        struct blkio_cgroup *blkcg)
+static void cfq_init_add_cfqg_lists(struct cfq_data *cfqd,
+                        struct cfq_group *cfqg, struct blkio_cgroup *blkcg)
 {
-        struct cfq_group *cfqg = NULL;
-        void *key = cfqd;
-        int i, j;
-        struct cfq_rb_root *st;
         struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
         unsigned int major, minor;
 
-        cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
-        if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
+        /*
+         * Add group onto cgroup list. It might happen that bdi->dev is
+         * not initialized yet. Initialize this new group without major
+         * and minor info and this info will be filled in once a new thread
+         * comes for IO.
+         */
+        if (bdi->dev) {
                 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
-                cfqg->blkg.dev = MKDEV(major, minor);
-                goto done;
-        }
-        if (cfqg)
-                goto done;
+                cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
+                                        (void *)cfqd, MKDEV(major, minor));
+        } else
+                cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
+                                        (void *)cfqd, 0);
+
+        cfqd->nr_blkcg_linked_grps++;
+        cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+
+        /* Add group on cfqd list */
+        hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+}
+
+/*
+ * Should be called from sleepable context. No request queue lock as per
+ * cpu stats are allocated dynamically and alloc_percpu needs to be called
+ * from sleepable context.
+ */
+static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd)
+{
+        struct cfq_group *cfqg = NULL;
+        int i, j;
+        struct cfq_rb_root *st;
 
         cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
         if (!cfqg)
-                goto done;
+                return NULL;
 
         for_each_cfqg_st(cfqg, i, j, st)
                 *st = CFQ_RB_ROOT;
@@ -1050,28 +1069,31 @@ static struct cfq_group * cfq_find_alloc_cfqg(struct cfq_data *cfqd,
          * or cgroup deletion path depending on who is exiting first.
          */
         cfqg->ref = 1;
+        return cfqg;
+}
+
+static struct cfq_group *
+cfq_find_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg)
+{
+        struct cfq_group *cfqg = NULL;
+        void *key = cfqd;
+        struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
+        unsigned int major, minor;
 
         /*
-         * Add group onto cgroup list. It might happen that bdi->dev is
-         * not initialized yet. Initialize this new group without major
-         * and minor info and this info will be filled in once a new thread
-         * comes for IO. See code above.
+         * This is the common case when there are no blkio cgroups.
+         * Avoid lookup in this case
          */
-        if (bdi->dev) {
-                sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
-                cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
-                                        MKDEV(major, minor));
-        } else
-                cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
-                                        0);
-
-        cfqd->nr_blkcg_linked_grps++;
-        cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
+        if (blkcg == &blkio_root_cgroup)
+                cfqg = &cfqd->root_group;
+        else
+                cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
 
-        /* Add group on cfqd list */
-        hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+        if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
+                sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+                cfqg->blkg.dev = MKDEV(major, minor);
+        }
 
-done:
         return cfqg;
 }
 
@@ -1082,13 +1104,53 @@ done:
 static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd)
 {
         struct blkio_cgroup *blkcg;
-        struct cfq_group *cfqg = NULL;
+        struct cfq_group *cfqg = NULL, *__cfqg = NULL;
+        struct request_queue *q = cfqd->queue;
+
+        rcu_read_lock();
+        blkcg = task_blkio_cgroup(current);
+        cfqg = cfq_find_cfqg(cfqd, blkcg);
+        if (cfqg) {
+                rcu_read_unlock();
+                return cfqg;
+        }
+
+        /*
+         * Need to allocate a group. Allocation of group also needs allocation
+         * of per cpu stats which in-turn takes a mutex() and can block. Hence
+         * we need to drop rcu lock and queue_lock before we call alloc.
+         *
+         * Not taking any queue reference here and assuming that queue is
+         * around by the time we return. CFQ queue allocation code does
+         * the same. It might be racy though.
+         */
+
+        rcu_read_unlock();
+        spin_unlock_irq(q->queue_lock);
+
+        cfqg = cfq_alloc_cfqg(cfqd);
+
+        spin_lock_irq(q->queue_lock);
 
         rcu_read_lock();
         blkcg = task_blkio_cgroup(current);
-        cfqg = cfq_find_alloc_cfqg(cfqd, blkcg);
+
+        /*
+         * If some other thread already allocated the group while we were
+         * not holding queue lock, free up the group
+         */
+        __cfqg = cfq_find_cfqg(cfqd, blkcg);
+
+        if (__cfqg) {
+                kfree(cfqg);
+                rcu_read_unlock();
+                return __cfqg;
+        }
+
         if (!cfqg)
                 cfqg = &cfqd->root_group;
+
+        cfq_init_add_cfqg_lists(cfqd, cfqg, blkcg);
         rcu_read_unlock();
         return cfqg;
 }