diff options
| author | Tejun Heo <tj@kernel.org> | 2012-04-19 19:29:24 -0400 |
|---|---|---|
| committer | Jens Axboe <axboe@kernel.dk> | 2012-04-20 04:06:40 -0400 |
| commit | a637120e49021d197e9578cba545bbaa459cbb51 (patch) |
| tree | 0d502a8fcc55c89eb4d79a7578e46a9273d1f2c8 | |
| parent | 496fb7806d616185a46865a4f3a20ef19dc6c7e3 (diff) | |
blkcg: use radix tree to index blkgs from blkcg
blkg lookup is currently performed by traversing linked list anchored
at blkcg->blkg_list. This is very unscalable and with blk-throttle
enabled and enough request queues on the system, this can get very
ugly quickly (blk-throttle performs look up on every bio submission).
This patch makes blkcg use radix tree to index blkgs combined with
simple last-looked-up hint. This is mostly identical to how icqs are
indexed from ioc.
Note that because __blkg_lookup() may be invoked without holding queue
lock, hint is only updated from __blkg_lookup_create(). Due to cfq's
cfqq caching, this makes hint updates overly lazy. This will be
improved with scheduled blkcg aware request allocation.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
| -rw-r--r-- | block/blk-cgroup.c | 52 |
| -rw-r--r-- | block/blk-cgroup.h | 6 |
2 files changed, 50 insertions(+), 8 deletions(-)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 30a7a9c58b38..02cf6335e9bd 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c | |||
| @@ -142,11 +142,21 @@ static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, | |||
| 142 | struct request_queue *q) | 142 | struct request_queue *q) |
| 143 | { | 143 | { |
| 144 | struct blkcg_gq *blkg; | 144 | struct blkcg_gq *blkg; |
| 145 | struct hlist_node *n; | ||
| 146 | 145 | ||
| 147 | hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) | 146 | blkg = rcu_dereference(blkcg->blkg_hint); |
| 148 | if (blkg->q == q) | 147 | if (blkg && blkg->q == q) |
| 149 | return blkg; | 148 | return blkg; |
| 149 | |||
| 150 | /* | ||
| 151 | * Hint didn't match. Look up from the radix tree. Note that we | ||
| 152 | * may not be holding queue_lock and thus are not sure whether | ||
| 153 | * @blkg from blkg_tree has already been removed or not, so we | ||
| 154 | * can't update hint to the lookup result. Leave it to the caller. | ||
| 155 | */ | ||
| 156 | blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); | ||
| 157 | if (blkg && blkg->q == q) | ||
| 158 | return blkg; | ||
| 159 | |||
| 150 | return NULL; | 160 | return NULL; |
| 151 | } | 161 | } |
| 152 | 162 | ||
| @@ -179,9 +189,12 @@ static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, | |||
| 179 | WARN_ON_ONCE(!rcu_read_lock_held()); | 189 | WARN_ON_ONCE(!rcu_read_lock_held()); |
| 180 | lockdep_assert_held(q->queue_lock); | 190 | lockdep_assert_held(q->queue_lock); |
| 181 | 191 | ||
| 192 | /* lookup and update hint on success, see __blkg_lookup() for details */ | ||
| 182 | blkg = __blkg_lookup(blkcg, q); | 193 | blkg = __blkg_lookup(blkcg, q); |
| 183 | if (blkg) | 194 | if (blkg) { |
| 195 | rcu_assign_pointer(blkcg->blkg_hint, blkg); | ||
| 184 | return blkg; | 196 | return blkg; |
| 197 | } | ||
| 185 | 198 | ||
| 186 | /* blkg holds a reference to blkcg */ | 199 | /* blkg holds a reference to blkcg */ |
| 187 | if (!css_tryget(&blkcg->css)) | 200 | if (!css_tryget(&blkcg->css)) |
| @@ -194,12 +207,24 @@ static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, | |||
| 194 | goto err_put; | 207 | goto err_put; |
| 195 | 208 | ||
| 196 | /* insert */ | 209 | /* insert */ |
| 210 | ret = radix_tree_preload(GFP_ATOMIC); | ||
| 211 | if (ret) | ||
| 212 | goto err_free; | ||
| 213 | |||
| 197 | spin_lock(&blkcg->lock); | 214 | spin_lock(&blkcg->lock); |
| 198 | hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); | 215 | ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); |
| 199 | list_add(&blkg->q_node, &q->blkg_list); | 216 | if (likely(!ret)) { |
| 217 | hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); | ||
| 218 | list_add(&blkg->q_node, &q->blkg_list); | ||
| 219 | } | ||
| 200 | spin_unlock(&blkcg->lock); | 220 | spin_unlock(&blkcg->lock); |
| 201 | return blkg; | ||
| 202 | 221 | ||
| 222 | radix_tree_preload_end(); | ||
| 223 | |||
| 224 | if (!ret) | ||
| 225 | return blkg; | ||
| 226 | err_free: | ||
| 227 | blkg_free(blkg); | ||
| 203 | err_put: | 228 | err_put: |
| 204 | css_put(&blkcg->css); | 229 | css_put(&blkcg->css); |
| 205 | return ERR_PTR(ret); | 230 | return ERR_PTR(ret); |
| @@ -229,10 +254,20 @@ static void blkg_destroy(struct blkcg_gq *blkg) | |||
| 229 | /* Something wrong if we are trying to remove same group twice */ | 254 | /* Something wrong if we are trying to remove same group twice */ |
| 230 | WARN_ON_ONCE(list_empty(&blkg->q_node)); | 255 | WARN_ON_ONCE(list_empty(&blkg->q_node)); |
| 231 | WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node)); | 256 | WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node)); |
| 257 | |||
| 258 | radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); | ||
| 232 | list_del_init(&blkg->q_node); | 259 | list_del_init(&blkg->q_node); |
| 233 | hlist_del_init_rcu(&blkg->blkcg_node); | 260 | hlist_del_init_rcu(&blkg->blkcg_node); |
| 234 | 261 | ||
| 235 | /* | 262 | /* |
| 263 | * Both setting lookup hint to and clearing it from @blkg are done | ||
| 264 | * under queue_lock. If it's not pointing to @blkg now, it never | ||
| 265 | * will. Hint assignment itself can race safely. | ||
| 266 | */ | ||
| 267 | if (rcu_dereference_raw(blkcg->blkg_hint) == blkg) | ||
| 268 | rcu_assign_pointer(blkcg->blkg_hint, NULL); | ||
| 269 | |||
| 270 | /* | ||
| 236 | * Put the reference taken at the time of creation so that when all | 271 | * Put the reference taken at the time of creation so that when all |
| 237 | * queues are gone, group can be destroyed. | 272 | * queues are gone, group can be destroyed. |
| 238 | */ | 273 | */ |
| @@ -593,6 +628,7 @@ static struct cgroup_subsys_state *blkcg_create(struct cgroup *cgroup) | |||
| 593 | blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */ | 628 | blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */ |
| 594 | done: | 629 | done: |
| 595 | spin_lock_init(&blkcg->lock); | 630 | spin_lock_init(&blkcg->lock); |
| 631 | INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC); | ||
| 596 | INIT_HLIST_HEAD(&blkcg->blkg_list); | 632 | INIT_HLIST_HEAD(&blkcg->blkg_list); |
| 597 | 633 | ||
| 598 | return &blkcg->css; | 634 | return &blkcg->css; |
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 44cb9086ed42..8ac457ce7783 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h | |||
| @@ -16,6 +16,7 @@ | |||
| 16 | #include <linux/cgroup.h> | 16 | #include <linux/cgroup.h> |
| 17 | #include <linux/u64_stats_sync.h> | 17 | #include <linux/u64_stats_sync.h> |
| 18 | #include <linux/seq_file.h> | 18 | #include <linux/seq_file.h> |
| 19 | #include <linux/radix-tree.h> | ||
| 19 | 20 | ||
| 20 | /* Max limits for throttle policy */ | 21 | /* Max limits for throttle policy */ |
| 21 | #define THROTL_IOPS_MAX UINT_MAX | 22 | #define THROTL_IOPS_MAX UINT_MAX |
| @@ -37,9 +38,14 @@ enum blkg_rwstat_type { | |||
| 37 | BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR, | 38 | BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR, |
| 38 | }; | 39 | }; |
| 39 | 40 | ||
| 41 | struct blkcg_gq; | ||
| 42 | |||
| 40 | struct blkcg { | 43 | struct blkcg { |
| 41 | struct cgroup_subsys_state css; | 44 | struct cgroup_subsys_state css; |
| 42 | spinlock_t lock; | 45 | spinlock_t lock; |
| 46 | |||
| 47 | struct radix_tree_root blkg_tree; | ||
| 48 | struct blkcg_gq *blkg_hint; | ||
| 43 | struct hlist_head blkg_list; | 49 | struct hlist_head blkg_list; |
| 44 | 50 | ||
| 45 | /* for policies to test whether associated blkcg has changed */ | 51 | /* for policies to test whether associated blkcg has changed */ |
