author		Paolo Valente <paolo.valente@linaro.org>	2017-06-05 04:11:15 -0400
committer	Jens Axboe <axboe@fb.com>	2017-06-08 11:51:10 -0400
commit		8f9bebc33dd718283183582fc4a762e178552fb8 (patch)
tree		e3fe5ffbc42b51a54a5a9457e006bae360862a77 /block
parent		85d0331aedff4646b2a2b14561c8be3678ffcee2 (diff)
block, bfq: access and cache blkg data only when safe
In blk-cgroup, operations on blkg objects are protected with the
request_queue lock. This is no longer the lock that protects
I/O-scheduler operations in blk-mq; the latter are now
protected with a finer-grained per-scheduler-instance lock. As a
consequence, although blkg lookups are also rcu-protected, blk-mq I/O
schedulers may see inconsistent data when they access blkg and
blkg-related objects. BFQ does access these objects, and does incur
this problem, in the following case.
The blkg_lookup performed in bfq_get_queue, being protected (only)
through rcu, may happen to return the address of a copy of the
original blkg. If this is the case, then the blkg_get performed in
bfq_get_queue, to pin down the blkg, is useless: it does not prevent
blk-cgroup code from destroying both the original blkg and all objects
directly or indirectly referred to by the copy of the blkg. BFQ
accesses these objects, which typically causes a crash due to a
NULL-pointer dereference or a memory-protection violation.
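
For illustration, the unsafe pattern boils down to the following
sketch (not a literal excerpt from the BFQ code; blkg, bio and q here
stand for the objects involved in the lookup):

	struct blkcg_gq *blkg;

	rcu_read_lock();
	blkg = blkg_lookup(bio_blkcg(bio), q); /* may return a copy */
	blkg_get(blkg);	/* pins the copy, not the original blkg */
	rcu_read_unlock();
	/*
	 * blk-cgroup is still free to destroy the original blkg and
	 * everything the copy refers to; a later dereference through
	 * blkg may then hit freed memory.
	 */
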
Some additional protection mechanism should be added to blk-cgroup to
address this issue. In the meantime, this commit provides a quick
temporary fix for BFQ: cache (when safe) blkg data that might
disappear right after a blkg_lookup.
In particular, this commit exploits the following facts to achieve its
goal without introducing further locks. Destroy operations on a blkg
invoke, as a first step, hooks of the scheduler associated with the
blkg; for BFQ, these hooks are executed with bfqd->lock held. As a
consequence, for any blkg associated with a request queue that an
instance of BFQ is attached to, we are guaranteed that such a blkg is
not destroyed, and that all the pointers it contains are consistent,
while that instance is holding its bfqd->lock. A blkg_lookup performed
with bfqd->lock held then returns a fully consistent blkg, which
remains consistent as long as this lock is held. In more detail, this holds
even if the returned blkg is a copy of the original one.
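
A sketch of the resulting safe pattern (illustrative, not a literal
excerpt; path_buf is a hypothetical destination buffer, and the
scheduler lock is taken with spin_lock_irq as BFQ does elsewhere):

	struct blkcg_gq *blkg;
	char path_buf[128];	/* hypothetical destination buffer */

	spin_lock_irq(&bfqd->lock);
	rcu_read_lock();
	blkg = blkg_lookup(bio_blkcg(bio), bfqd->queue);
	/*
	 * The destroy hooks that would tear this blkg down run under
	 * bfqd->lock, so the blkg (copy or not) stays consistent here;
	 * cache now whatever will be needed after the lock is dropped.
	 */
	blkg_path(blkg, path_buf, sizeof(path_buf));
	rcu_read_unlock();
	spin_unlock_irq(&bfqd->lock);
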
Finally, the object describing a group inside BFQ also needs to be
protected from destruction on the blkg_free of the original blkg
(which invokes bfq_pd_free). This commit adds private refcounting for
this object, to let it disappear only after no bfq_queue refers to it
any longer.
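
Condensed from the block/bfq-cgroup.c hunk below, the private
refcounting amounts to the following (a plain int is used because, per
the goal stated above, the commit relies on existing serialization
rather than introducing further locks):

	static void bfqg_get(struct bfq_group *bfqg)
	{
		bfqg->ref++;
	}

	void bfqg_put(struct bfq_group *bfqg)
	{
		bfqg->ref--;
		if (bfqg->ref == 0)
			kfree(bfqg);	/* last reference dropped */
	}

bfq_pd_alloc takes the initial reference and bfq_pd_free drops it, so
the bfq_group can outlive its blkg as long as some bfq_queue still
holds a reference through the new bfqg_and_blkg_get/bfqg_and_blkg_put
pair.
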
This commit also removes or updates some stale comments on locking
issues related to blk-cgroup operations.
Reported-by: Tomas Konir <tomas.konir@gmail.com>
Reported-by: Lee Tibbert <lee.tibbert@gmail.com>
Reported-by: Marco Piazza <mpiazza@gmail.com>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Tested-by: Tomas Konir <tomas.konir@gmail.com>
Tested-by: Lee Tibbert <lee.tibbert@gmail.com>
Tested-by: Marco Piazza <mpiazza@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
Diffstat (limited to 'block')
-rw-r--r--	block/bfq-cgroup.c	116
-rw-r--r--	block/bfq-iosched.c	2
-rw-r--r--	block/bfq-iosched.h	23
3 files changed, 105 insertions(+), 36 deletions(-)
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index c8a32fb345cf..78b2e0db4fb2 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -52,7 +52,7 @@ BFQG_FLAG_FNS(idling)
 BFQG_FLAG_FNS(empty)
 #undef BFQG_FLAG_FNS
 
-/* This should be called with the queue_lock held. */
+/* This should be called with the scheduler lock held. */
 static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
 {
 	unsigned long long now;
@@ -67,7 +67,7 @@ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
 	bfqg_stats_clear_waiting(stats);
 }
 
-/* This should be called with the queue_lock held. */
+/* This should be called with the scheduler lock held. */
 static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
 						 struct bfq_group *curr_bfqg)
 {
@@ -81,7 +81,7 @@ static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
 	bfqg_stats_mark_waiting(stats);
 }
 
-/* This should be called with the queue_lock held. */
+/* This should be called with the scheduler lock held. */
 static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
 {
 	unsigned long long now;
@@ -203,12 +203,30 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
 
 static void bfqg_get(struct bfq_group *bfqg)
 {
-	return blkg_get(bfqg_to_blkg(bfqg));
+	bfqg->ref++;
 }
 
 void bfqg_put(struct bfq_group *bfqg)
 {
-	return blkg_put(bfqg_to_blkg(bfqg));
+	bfqg->ref--;
+
+	if (bfqg->ref == 0)
+		kfree(bfqg);
+}
+
+static void bfqg_and_blkg_get(struct bfq_group *bfqg)
+{
+	/* see comments in bfq_bic_update_cgroup for why refcounting bfqg */
+	bfqg_get(bfqg);
+
+	blkg_get(bfqg_to_blkg(bfqg));
+}
+
+void bfqg_and_blkg_put(struct bfq_group *bfqg)
+{
+	bfqg_put(bfqg);
+
+	blkg_put(bfqg_to_blkg(bfqg));
 }
 
 void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
@@ -312,7 +330,11 @@ void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg)
 	if (bfqq) {
 		bfqq->ioprio = bfqq->new_ioprio;
 		bfqq->ioprio_class = bfqq->new_ioprio_class;
-		bfqg_get(bfqg);
+		/*
+		 * Make sure that bfqg and its associated blkg do not
+		 * disappear before entity.
+		 */
+		bfqg_and_blkg_get(bfqg);
 	}
 	entity->parent = bfqg->my_entity; /* NULL for root group */
 	entity->sched_data = &bfqg->sched_data;
@@ -399,6 +421,8 @@ struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
 		return NULL;
 	}
 
+	/* see comments in bfq_bic_update_cgroup for why refcounting */
+	bfqg_get(bfqg);
 	return &bfqg->pd;
 }
 
@@ -426,7 +450,7 @@ void bfq_pd_free(struct blkg_policy_data *pd)
 	struct bfq_group *bfqg = pd_to_bfqg(pd);
 
 	bfqg_stats_exit(&bfqg->stats);
-	return kfree(bfqg);
+	bfqg_put(bfqg);
 }
 
 void bfq_pd_reset_stats(struct blkg_policy_data *pd)
@@ -496,9 +520,10 @@ struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
  * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
  * it on the new one. Avoid putting the entity on the old group idle tree.
  *
- * Must be called under the queue lock; the cgroup owning @bfqg must
- * not disappear (by now this just means that we are called under
- * rcu_read_lock()).
+ * Must be called under the scheduler lock, to make sure that the blkg
+ * owning @bfqg does not disappear (see comments in
+ * bfq_bic_update_cgroup on guaranteeing the consistency of blkg
+ * objects).
  */
 void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 		   struct bfq_group *bfqg)
@@ -519,16 +544,12 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 		bfq_deactivate_bfqq(bfqd, bfqq, false, false);
 	else if (entity->on_st)
 		bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
-	bfqg_put(bfqq_group(bfqq));
+	bfqg_and_blkg_put(bfqq_group(bfqq));
 
-	/*
-	 * Here we use a reference to bfqg. We don't need a refcounter
-	 * as the cgroup reference will not be dropped, so that its
-	 * destroy() callback will not be invoked.
-	 */
 	entity->parent = bfqg->my_entity;
 	entity->sched_data = &bfqg->sched_data;
-	bfqg_get(bfqg);
+	/* pin down bfqg and its associated blkg */
+	bfqg_and_blkg_get(bfqg);
 
 	if (bfq_bfqq_busy(bfqq)) {
 		bfq_pos_tree_add_move(bfqd, bfqq);
@@ -545,8 +566,9 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
  * @bic: the bic to move.
  * @blkcg: the blk-cgroup to move to.
 *
- * Move bic to blkcg, assuming that bfqd->queue is locked; the caller
- * has to make sure that the reference to cgroup is valid across the call.
+ * Move bic to blkcg, assuming that bfqd->lock is held; which makes
+ * sure that the reference to cgroup is valid across the call (see
+ * comments in bfq_bic_update_cgroup on this issue)
 *
 * NOTE: an alternative approach might have been to store the current
 * cgroup in bfqq and getting a reference to it, reducing the lookup
@@ -604,6 +626,57 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
 		goto out;
 
 	bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio));
+	/*
+	 * Update blkg_path for bfq_log_* functions. We cache this
+	 * path, and update it here, for the following
+	 * reasons. Operations on blkg objects in blk-cgroup are
+	 * protected with the request_queue lock, and not with the
+	 * lock that protects the instances of this scheduler
+	 * (bfqd->lock). This exposes BFQ to the following sort of
+	 * race.
+	 *
+	 * The blkg_lookup performed in bfq_get_queue, protected
+	 * through rcu, may happen to return the address of a copy of
+	 * the original blkg. If this is the case, then the
+	 * bfqg_and_blkg_get performed in bfq_get_queue, to pin down
+	 * the blkg, is useless: it does not prevent blk-cgroup code
+	 * from destroying both the original blkg and all objects
+	 * directly or indirectly referred by the copy of the
+	 * blkg.
+	 *
+	 * On the bright side, destroy operations on a blkg invoke, as
+	 * a first step, hooks of the scheduler associated with the
+	 * blkg. And these hooks are executed with bfqd->lock held for
+	 * BFQ. As a consequence, for any blkg associated with the
+	 * request queue this instance of the scheduler is attached
+	 * to, we are guaranteed that such a blkg is not destroyed, and
+	 * that all the pointers it contains are consistent, while we
+	 * are holding bfqd->lock. A blkg_lookup performed with
+	 * bfqd->lock held then returns a fully consistent blkg, which
+	 * remains consistent until this lock is held.
+	 *
+	 * Thanks to the last fact, and to the fact that: (1) bfqg has
+	 * been obtained through a blkg_lookup in the above
+	 * assignment, and (2) bfqd->lock is being held, here we can
+	 * safely use the policy data for the involved blkg (i.e., the
+	 * field bfqg->pd) to get to the blkg associated with bfqg,
+	 * and then we can safely use any field of blkg. After we
+	 * release bfqd->lock, even just getting blkg through this
+	 * bfqg may cause dangling references to be traversed, as
+	 * bfqg->pd may not exist any more.
+	 *
+	 * In view of the above facts, here we cache, in the bfqg, any
+	 * blkg data we may need for this bic, and for its associated
+	 * bfq_queue. As of now, we need to cache only the path of the
+	 * blkg, which is used in the bfq_log_* functions.
+	 *
+	 * Finally, note that bfqg itself needs to be protected from
+	 * destruction on the blkg_free of the original blkg (which
+	 * invokes bfq_pd_free). We use an additional private
+	 * refcounter for bfqg, to let it disappear only after no
+	 * bfq_queue refers to it any longer.
+	 */
+	blkg_path(bfqg_to_blkg(bfqg), bfqg->blkg_path, sizeof(bfqg->blkg_path));
 	bic->blkcg_serial_nr = serial_nr;
 out:
 	rcu_read_unlock();
@@ -640,8 +713,6 @@ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
  * @bfqd: the device data structure with the root group.
  * @bfqg: the group to move from.
  * @st: the service tree with the entities.
- *
- * Needs queue_lock to be taken and reference to be valid over the call.
  */
 static void bfq_reparent_active_entities(struct bfq_data *bfqd,
 					 struct bfq_group *bfqg,
@@ -692,8 +763,7 @@ void bfq_pd_offline(struct blkg_policy_data *pd)
 		/*
 		 * The idle tree may still contain bfq_queues belonging
 		 * to exited task because they never migrated to a different
-		 * cgroup from the one being destroyed now. No one else
-		 * can access them so it's safe to act without any lock.
+		 * cgroup from the one being destroyed now.
 		 */
 		bfq_flush_idle_tree(st);
 
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 08ce45096350..ed93da2462ab 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -3665,7 +3665,7 @@ void bfq_put_queue(struct bfq_queue *bfqq)
 
 	kmem_cache_free(bfq_pool, bfqq);
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
-	bfqg_put(bfqg);
+	bfqg_and_blkg_put(bfqg);
 #endif
 }
 
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index ae783c06dfd9..5c3bf9861492 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -759,6 +759,12 @@ struct bfq_group {
 	/* must be the first member */
 	struct blkg_policy_data pd;
 
+	/* cached path for this blkg (see comments in bfq_bic_update_cgroup) */
+	char blkg_path[128];
+
+	/* reference counter (see comments in bfq_bic_update_cgroup) */
+	int ref;
+
 	struct bfq_entity entity;
 	struct bfq_sched_data sched_data;
 
@@ -838,7 +844,7 @@ struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
 struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
 struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
 struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node);
-void bfqg_put(struct bfq_group *bfqg);
+void bfqg_and_blkg_put(struct bfq_group *bfqg);
 
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
 extern struct cftype bfq_blkcg_legacy_files[];
@@ -910,20 +916,13 @@ void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq);
 struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
 
 #define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	do {			\
-	char __pbuf[128];						\
-									\
-	blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
-	blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid, \
+	blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid,\
 			  bfq_bfqq_sync((bfqq)) ? 'S' : 'A',		\
-			  __pbuf, ##args);				\
+			  bfqq_group(bfqq)->blkg_path, ##args);		\
 } while (0)
 
-#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)	do {			\
-	char __pbuf[128];						\
-									\
-	blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf));		\
-	blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args);	\
-} while (0)
+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)	\
+	blk_add_trace_msg((bfqd)->queue, "%s " fmt, (bfqg)->blkg_path, ##args)
 
 #else /* CONFIG_BFQ_GROUP_IOSCHED */
 