diff options
author | Vivek Goyal <vgoyal@redhat.com> | 2011-05-19 15:38:23 -0400 |
---|---|---|
committer | Jens Axboe <jaxboe@fusionio.com> | 2011-05-20 14:34:52 -0400 |
commit | f469a7b4d5b1d1d053200a9015fd25d59c057f49 (patch) | |
tree | 2c68c0689e40955b186e350b15d44d0b260f4655 /block | |
parent | 56edf7d75db5b14d628b46623c414ffbeed68d7f (diff) |
blk-cgroup: Allow sleeping while dynamically allocating a group
Currently, all the cfq_group or throtl_group allocations happen while
we are holding ->queue_lock and sleeping is not allowed.
Soon, we will move to per-cpu stats and will also need to allocate the
per-group stats. Because alloc_percpu() can sleep, it cannot be called
from atomic context, so we need to drop ->queue_lock, allocate the
group, retake the lock and continue processing.
In the throttling code, I check the queue DEAD flag again to make sure
that the driver did not call blk_cleanup_queue() in the meantime.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
Diffstat (limited to 'block')
-rw-r--r-- | block/blk-core.c | 3 | ||||
-rw-r--r-- | block/blk-throttle.c | 141 | ||||
-rw-r--r-- | block/cfq-iosched.c | 128 |
3 files changed, 205 insertions, 67 deletions
diff --git a/block/blk-core.c b/block/blk-core.c index 3fe00a14822..9e8e297374b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c | |||
@@ -1550,7 +1550,8 @@ static inline void __generic_make_request(struct bio *bio) | |||
1550 | goto end_io; | 1550 | goto end_io; |
1551 | } | 1551 | } |
1552 | 1552 | ||
1553 | blk_throtl_bio(q, &bio); | 1553 | if (blk_throtl_bio(q, &bio)) |
1554 | goto end_io; | ||
1554 | 1555 | ||
1555 | /* | 1556 | /* |
1556 | * If bio = NULL, bio has been throttled and will be submitted | 1557 | * If bio = NULL, bio has been throttled and will be submitted |
diff --git a/block/blk-throttle.c b/block/blk-throttle.c index fa9a900c125..c201967b33c 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c | |||
@@ -188,8 +188,40 @@ throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg) | |||
188 | td->nr_undestroyed_grps++; | 188 | td->nr_undestroyed_grps++; |
189 | } | 189 | } |
190 | 190 | ||
191 | static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td, | 191 | static void throtl_init_add_tg_lists(struct throtl_data *td, |
192 | struct blkio_cgroup *blkcg) | 192 | struct throtl_grp *tg, struct blkio_cgroup *blkcg) |
193 | { | ||
194 | struct backing_dev_info *bdi = &td->queue->backing_dev_info; | ||
195 | unsigned int major, minor; | ||
196 | |||
197 | /* Add group onto cgroup list */ | ||
198 | sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); | ||
199 | blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td, | ||
200 | MKDEV(major, minor), BLKIO_POLICY_THROTL); | ||
201 | |||
202 | tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev); | ||
203 | tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev); | ||
204 | tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev); | ||
205 | tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev); | ||
206 | |||
207 | throtl_add_group_to_td_list(td, tg); | ||
208 | } | ||
209 | |||
210 | /* Should be called without queue lock and outside of rcu period */ | ||
211 | static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td) | ||
212 | { | ||
213 | struct throtl_grp *tg = NULL; | ||
214 | |||
215 | tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node); | ||
216 | if (!tg) | ||
217 | return NULL; | ||
218 | |||
219 | throtl_init_group(tg); | ||
220 | return tg; | ||
221 | } | ||
222 | |||
223 | static struct | ||
224 | throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg) | ||
193 | { | 225 | { |
194 | struct throtl_grp *tg = NULL; | 226 | struct throtl_grp *tg = NULL; |
195 | void *key = td; | 227 | void *key = td; |
@@ -197,12 +229,6 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td, | |||
197 | unsigned int major, minor; | 229 | unsigned int major, minor; |
198 | 230 | ||
199 | /* | 231 | /* |
200 | * TODO: Speed up blkiocg_lookup_group() by maintaining a radix | ||
201 | * tree of blkg (instead of traversing through hash list all | ||
202 | * the time. | ||
203 | */ | ||
204 | |||
205 | /* | ||
206 | * This is the common case when there are no blkio cgroups. | 232 | * This is the common case when there are no blkio cgroups. |
207 | * Avoid lookup in this case | 233 | * Avoid lookup in this case |
208 | */ | 234 | */ |
@@ -215,43 +241,83 @@ static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td, | |||
215 | if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { | 241 | if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { |
216 | sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); | 242 | sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); |
217 | tg->blkg.dev = MKDEV(major, minor); | 243 | tg->blkg.dev = MKDEV(major, minor); |
218 | goto done; | ||
219 | } | 244 | } |
220 | 245 | ||
221 | if (tg) | ||
222 | goto done; | ||
223 | |||
224 | tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node); | ||
225 | if (!tg) | ||
226 | goto done; | ||
227 | |||
228 | throtl_init_group(tg); | ||
229 | |||
230 | /* Add group onto cgroup list */ | ||
231 | sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); | ||
232 | blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td, | ||
233 | MKDEV(major, minor), BLKIO_POLICY_THROTL); | ||
234 | |||
235 | tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev); | ||
236 | tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev); | ||
237 | tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev); | ||
238 | tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev); | ||
239 | |||
240 | throtl_add_group_to_td_list(td, tg); | ||
241 | done: | ||
242 | return tg; | 246 | return tg; |
243 | } | 247 | } |
244 | 248 | ||
249 | /* | ||
250 | * This function returns with queue lock unlocked in case of error, like | ||
251 | * request queue is no more | ||
252 | */ | ||
245 | static struct throtl_grp * throtl_get_tg(struct throtl_data *td) | 253 | static struct throtl_grp * throtl_get_tg(struct throtl_data *td) |
246 | { | 254 | { |
247 | struct throtl_grp *tg = NULL; | 255 | struct throtl_grp *tg = NULL, *__tg = NULL; |
248 | struct blkio_cgroup *blkcg; | 256 | struct blkio_cgroup *blkcg; |
257 | struct request_queue *q = td->queue; | ||
249 | 258 | ||
250 | rcu_read_lock(); | 259 | rcu_read_lock(); |
251 | blkcg = task_blkio_cgroup(current); | 260 | blkcg = task_blkio_cgroup(current); |
252 | tg = throtl_find_alloc_tg(td, blkcg); | 261 | tg = throtl_find_tg(td, blkcg); |
253 | if (!tg) | 262 | if (tg) { |
263 | rcu_read_unlock(); | ||
264 | return tg; | ||
265 | } | ||
266 | |||
267 | /* | ||
268 | * Need to allocate a group. Allocation of group also needs allocation | ||
269 | * of per cpu stats which in-turn takes a mutex() and can block. Hence | ||
270 | * we need to drop rcu lock and queue_lock before we call alloc | ||
271 | * | ||
272 | * Take the request queue reference to make sure queue does not | ||
273 | * go away once we return from allocation. | ||
274 | */ | ||
275 | blk_get_queue(q); | ||
276 | rcu_read_unlock(); | ||
277 | spin_unlock_irq(q->queue_lock); | ||
278 | |||
279 | tg = throtl_alloc_tg(td); | ||
280 | /* | ||
281 | * We might have slept in group allocation. Make sure queue is not | ||
282 | * dead | ||
283 | */ | ||
284 | if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { | ||
285 | blk_put_queue(q); | ||
286 | if (tg) | ||
287 | kfree(tg); | ||
288 | |||
289 | return ERR_PTR(-ENODEV); | ||
290 | } | ||
291 | blk_put_queue(q); | ||
292 | |||
293 | /* Group allocated and queue is still alive. take the lock */ | ||
294 | spin_lock_irq(q->queue_lock); | ||
295 | |||
296 | /* | ||
297 | * Initialize the new group. After sleeping, read the blkcg again. | ||
298 | */ | ||
299 | rcu_read_lock(); | ||
300 | blkcg = task_blkio_cgroup(current); | ||
301 | |||
302 | /* | ||
303 | * If some other thread already allocated the group while we were | ||
304 | * not holding queue lock, free up the group | ||
305 | */ | ||
306 | __tg = throtl_find_tg(td, blkcg); | ||
307 | |||
308 | if (__tg) { | ||
309 | kfree(tg); | ||
310 | rcu_read_unlock(); | ||
311 | return __tg; | ||
312 | } | ||
313 | |||
314 | /* Group allocation failed. Account the IO to root group */ | ||
315 | if (!tg) { | ||
254 | tg = &td->root_tg; | 316 | tg = &td->root_tg; |
317 | return tg; | ||
318 | } | ||
319 | |||
320 | throtl_init_add_tg_lists(td, tg, blkcg); | ||
255 | rcu_read_unlock(); | 321 | rcu_read_unlock(); |
256 | return tg; | 322 | return tg; |
257 | } | 323 | } |
@@ -1014,6 +1080,15 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop) | |||
1014 | spin_lock_irq(q->queue_lock); | 1080 | spin_lock_irq(q->queue_lock); |
1015 | tg = throtl_get_tg(td); | 1081 | tg = throtl_get_tg(td); |
1016 | 1082 | ||
1083 | if (IS_ERR(tg)) { | ||
1084 | if (PTR_ERR(tg) == -ENODEV) { | ||
1085 | /* | ||
1086 | * Queue is gone. No queue lock held here. | ||
1087 | */ | ||
1088 | return -ENODEV; | ||
1089 | } | ||
1090 | } | ||
1091 | |||
1017 | if (tg->nr_queued[rw]) { | 1092 | if (tg->nr_queued[rw]) { |
1018 | /* | 1093 | /* |
1019 | * There is already another bio queued in same dir. No | 1094 | * There is already another bio queued in same dir. No |
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index e2e6719832e..606020fe93f 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c | |||
@@ -1016,28 +1016,47 @@ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, | |||
1016 | cfqg->needs_update = true; | 1016 | cfqg->needs_update = true; |
1017 | } | 1017 | } |
1018 | 1018 | ||
1019 | static struct cfq_group * cfq_find_alloc_cfqg(struct cfq_data *cfqd, | 1019 | static void cfq_init_add_cfqg_lists(struct cfq_data *cfqd, |
1020 | struct blkio_cgroup *blkcg) | 1020 | struct cfq_group *cfqg, struct blkio_cgroup *blkcg) |
1021 | { | 1021 | { |
1022 | struct cfq_group *cfqg = NULL; | ||
1023 | void *key = cfqd; | ||
1024 | int i, j; | ||
1025 | struct cfq_rb_root *st; | ||
1026 | struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; | 1022 | struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; |
1027 | unsigned int major, minor; | 1023 | unsigned int major, minor; |
1028 | 1024 | ||
1029 | cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); | 1025 | /* |
1030 | if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { | 1026 | * Add group onto cgroup list. It might happen that bdi->dev is |
1027 | * not initialized yet. Initialize this new group without major | ||
1028 | * and minor info and this info will be filled in once a new thread | ||
1029 | * comes for IO. | ||
1030 | */ | ||
1031 | if (bdi->dev) { | ||
1031 | sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); | 1032 | sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); |
1032 | cfqg->blkg.dev = MKDEV(major, minor); | 1033 | cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, |
1033 | goto done; | 1034 | (void *)cfqd, MKDEV(major, minor)); |
1034 | } | 1035 | } else |
1035 | if (cfqg) | 1036 | cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, |
1036 | goto done; | 1037 | (void *)cfqd, 0); |
1038 | |||
1039 | cfqd->nr_blkcg_linked_grps++; | ||
1040 | cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); | ||
1041 | |||
1042 | /* Add group on cfqd list */ | ||
1043 | hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); | ||
1044 | } | ||
1045 | |||
1046 | /* | ||
1047 | * Should be called from sleepable context. No request queue lock as per | ||
1048 | * cpu stats are allocated dynamically and alloc_percpu needs to be called | ||
1049 | * from sleepable context. | ||
1050 | */ | ||
1051 | static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd) | ||
1052 | { | ||
1053 | struct cfq_group *cfqg = NULL; | ||
1054 | int i, j; | ||
1055 | struct cfq_rb_root *st; | ||
1037 | 1056 | ||
1038 | cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); | 1057 | cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); |
1039 | if (!cfqg) | 1058 | if (!cfqg) |
1040 | goto done; | 1059 | return NULL; |
1041 | 1060 | ||
1042 | for_each_cfqg_st(cfqg, i, j, st) | 1061 | for_each_cfqg_st(cfqg, i, j, st) |
1043 | *st = CFQ_RB_ROOT; | 1062 | *st = CFQ_RB_ROOT; |
@@ -1050,28 +1069,31 @@ static struct cfq_group * cfq_find_alloc_cfqg(struct cfq_data *cfqd, | |||
1050 | * or cgroup deletion path depending on who is exiting first. | 1069 | * or cgroup deletion path depending on who is exiting first. |
1051 | */ | 1070 | */ |
1052 | cfqg->ref = 1; | 1071 | cfqg->ref = 1; |
1072 | return cfqg; | ||
1073 | } | ||
1074 | |||
1075 | static struct cfq_group * | ||
1076 | cfq_find_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg) | ||
1077 | { | ||
1078 | struct cfq_group *cfqg = NULL; | ||
1079 | void *key = cfqd; | ||
1080 | struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; | ||
1081 | unsigned int major, minor; | ||
1053 | 1082 | ||
1054 | /* | 1083 | /* |
1055 | * Add group onto cgroup list. It might happen that bdi->dev is | 1084 | * This is the common case when there are no blkio cgroups. |
1056 | * not initialized yet. Initialize this new group without major | 1085 | * Avoid lookup in this case |
1057 | * and minor info and this info will be filled in once a new thread | ||
1058 | * comes for IO. See code above. | ||
1059 | */ | 1086 | */ |
1060 | if (bdi->dev) { | 1087 | if (blkcg == &blkio_root_cgroup) |
1061 | sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); | 1088 | cfqg = &cfqd->root_group; |
1062 | cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, | 1089 | else |
1063 | MKDEV(major, minor)); | 1090 | cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); |
1064 | } else | ||
1065 | cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, | ||
1066 | 0); | ||
1067 | |||
1068 | cfqd->nr_blkcg_linked_grps++; | ||
1069 | cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); | ||
1070 | 1091 | ||
1071 | /* Add group on cfqd list */ | 1092 | if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { |
1072 | hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); | 1093 | sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); |
1094 | cfqg->blkg.dev = MKDEV(major, minor); | ||
1095 | } | ||
1073 | 1096 | ||
1074 | done: | ||
1075 | return cfqg; | 1097 | return cfqg; |
1076 | } | 1098 | } |
1077 | 1099 | ||
@@ -1082,13 +1104,53 @@ done: | |||
1082 | static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd) | 1104 | static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd) |
1083 | { | 1105 | { |
1084 | struct blkio_cgroup *blkcg; | 1106 | struct blkio_cgroup *blkcg; |
1085 | struct cfq_group *cfqg = NULL; | 1107 | struct cfq_group *cfqg = NULL, *__cfqg = NULL; |
1108 | struct request_queue *q = cfqd->queue; | ||
1109 | |||
1110 | rcu_read_lock(); | ||
1111 | blkcg = task_blkio_cgroup(current); | ||
1112 | cfqg = cfq_find_cfqg(cfqd, blkcg); | ||
1113 | if (cfqg) { | ||
1114 | rcu_read_unlock(); | ||
1115 | return cfqg; | ||
1116 | } | ||
1117 | |||
1118 | /* | ||
1119 | * Need to allocate a group. Allocation of group also needs allocation | ||
1120 | * of per cpu stats which in-turn takes a mutex() and can block. Hence | ||
1121 | * we need to drop rcu lock and queue_lock before we call alloc. | ||
1122 | * | ||
1123 | * Not taking any queue reference here and assuming that queue is | ||
1124 | * around by the time we return. CFQ queue allocation code does | ||
1125 | * the same. It might be racy though. | ||
1126 | */ | ||
1127 | |||
1128 | rcu_read_unlock(); | ||
1129 | spin_unlock_irq(q->queue_lock); | ||
1130 | |||
1131 | cfqg = cfq_alloc_cfqg(cfqd); | ||
1132 | |||
1133 | spin_lock_irq(q->queue_lock); | ||
1086 | 1134 | ||
1087 | rcu_read_lock(); | 1135 | rcu_read_lock(); |
1088 | blkcg = task_blkio_cgroup(current); | 1136 | blkcg = task_blkio_cgroup(current); |
1089 | cfqg = cfq_find_alloc_cfqg(cfqd, blkcg); | 1137 | |
1138 | /* | ||
1139 | * If some other thread already allocated the group while we were | ||
1140 | * not holding queue lock, free up the group | ||
1141 | */ | ||
1142 | __cfqg = cfq_find_cfqg(cfqd, blkcg); | ||
1143 | |||
1144 | if (__cfqg) { | ||
1145 | kfree(cfqg); | ||
1146 | rcu_read_unlock(); | ||
1147 | return __cfqg; | ||
1148 | } | ||
1149 | |||
1090 | if (!cfqg) | 1150 | if (!cfqg) |
1091 | cfqg = &cfqd->root_group; | 1151 | cfqg = &cfqd->root_group; |
1152 | |||
1153 | cfq_init_add_cfqg_lists(cfqd, cfqg, blkcg); | ||
1092 | rcu_read_unlock(); | 1154 | rcu_read_unlock(); |
1093 | return cfqg; | 1155 | return cfqg; |
1094 | } | 1156 | } |