author     Linus Torvalds <torvalds@linux-foundation.org>   2012-05-30 11:52:42 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2012-05-30 11:52:42 -0400
commit     0d167518e045cc8bb63f0a8a0a85ad4fa4e0044f
tree       101a9b5d425d79f663e4f25f1e90b7a8cc6604f1 /block/blk-throttle.c
parent     2f83766d4b18774c856329a8fca4c9338dfeda39
parent     ff26eaadf4d914e397872b99885d45756104e9ae
Merge branch 'for-3.5/core' of git://git.kernel.dk/linux-block
Merge block/IO core bits from Jens Axboe:
"This is a bit bigger on the core side than usual, but that is purely
because we decided to hold off on parts of Tejun's submission on 3.4
to give it a bit more time to simmer. As a consequence, it's seen a
long cycle in for-next.
It contains:
- Bug fix from Dan, wrong locking type.
- Relax splice gifting restriction from Eric.
- A ton of updates from Tejun, primarily for blkcg. This improves
the code a lot, making the API nicer and cleaner, and also includes
fixes for how we handle and tie policies and re-activate on
switches. The changes also include generic bug fixes.
- A simple fix from Vivek, along with a fix for doing proper delayed
allocation of the blkcg stats."
Fix up annoying conflict just due to different merge resolution in
Documentation/feature-removal-schedule.txt
* 'for-3.5/core' of git://git.kernel.dk/linux-block: (92 commits)
blkcg: tg_stats_alloc_lock is an irq lock
vmsplice: relax alignement requirements for SPLICE_F_GIFT
blkcg: use radix tree to index blkgs from blkcg
blkcg: fix blkcg->css ref leak in __blkg_lookup_create()
block: fix elvpriv allocation failure handling
block: collapse blk_alloc_request() into get_request()
blkcg: collapse blkcg_policy_ops into blkcg_policy
blkcg: embed struct blkg_policy_data in policy specific data
blkcg: mass rename of blkcg API
blkcg: style cleanups for blk-cgroup.h
blkcg: remove blkio_group->path[]
blkcg: blkg_rwstat_read() was missing inline
blkcg: shoot down blkgs if all policies are deactivated
blkcg: drop stuff unused after per-queue policy activation update
blkcg: implement per-queue policy activation
blkcg: add request_queue->root_blkg
blkcg: make request_queue bypassing on allocation
blkcg: make sure blkg_lookup() returns %NULL if @q is bypassing
blkcg: make blkg_conf_prep() take @pol and return with queue lock held
blkcg: remove static policy ID enums
...
Diffstat (limited to 'block/blk-throttle.c')
-rw-r--r--  block/blk-throttle.c | 697
1 file changed, 325 insertions, 372 deletions
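The bulk of the diff below is blk-throttle.c being converted from the old blkio_policy_type callbacks to the new blkcg policy interface introduced by this series: the per-group data (struct throtl_grp) now embeds struct blkg_policy_data as its first member, the policy supplies pd_init/pd_exit/pd_reset_stats hooks plus a pd_size, and registration goes through blkcg_policy_register(). As a rough sketch of that shape — every name outside the blkcg_* API here is invented for illustration, and this is not code from the series:

```c
/* Hypothetical minimal blkcg policy following the pattern the throttle
 * conversion below adopts; the "example" names are made up for illustration. */
#include <linux/module.h>
#include <linux/blkdev.h>
#include "blk-cgroup.h"		/* struct blkcg_policy, blkg_policy_data, blkcg_gq */

struct example_grp {
	struct blkg_policy_data pd;	/* must be the first member */
	u64 nr_seen;			/* policy-private per-group state */
};

static struct blkcg_policy blkcg_policy_example;

static inline struct example_grp *blkg_to_eg(struct blkcg_gq *blkg)
{
	struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_example);

	return pd ? container_of(pd, struct example_grp, pd) : NULL;
}

/* called by the blkcg core when a blkg is set up for this policy */
static void example_pd_init(struct blkcg_gq *blkg)
{
	blkg_to_eg(blkg)->nr_seen = 0;
}

static struct blkcg_policy blkcg_policy_example = {
	.pd_size	= sizeof(struct example_grp),	/* core allocates this much per blkg */
	.pd_init_fn	= example_pd_init,
};

static int __init example_init(void)
{
	return blkcg_policy_register(&blkcg_policy_example);
}
module_init(example_init);
```

A per-queue consumer of such a policy would additionally call blkcg_activate_policy() and blkcg_deactivate_policy() from its init/exit paths, which is exactly what the reworked blk_throtl_init() and blk_throtl_exit() do in the diff.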
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index f2ddb94626bd..5b0659512047 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
| @@ -21,6 +21,8 @@ static int throtl_quantum = 32; | |||
| 21 | /* Throttling is performed over 100ms slice and after that slice is renewed */ | 21 | /* Throttling is performed over 100ms slice and after that slice is renewed */ |
| 22 | static unsigned long throtl_slice = HZ/10; /* 100 ms */ | 22 | static unsigned long throtl_slice = HZ/10; /* 100 ms */ |
| 23 | 23 | ||
| 24 | static struct blkcg_policy blkcg_policy_throtl; | ||
| 25 | |||
| 24 | /* A workqueue to queue throttle related work */ | 26 | /* A workqueue to queue throttle related work */ |
| 25 | static struct workqueue_struct *kthrotld_workqueue; | 27 | static struct workqueue_struct *kthrotld_workqueue; |
| 26 | static void throtl_schedule_delayed_work(struct throtl_data *td, | 28 | static void throtl_schedule_delayed_work(struct throtl_data *td, |
| @@ -38,9 +40,17 @@ struct throtl_rb_root { | |||
| 38 | 40 | ||
| 39 | #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) | 41 | #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) |
| 40 | 42 | ||
| 43 | /* Per-cpu group stats */ | ||
| 44 | struct tg_stats_cpu { | ||
| 45 | /* total bytes transferred */ | ||
| 46 | struct blkg_rwstat service_bytes; | ||
| 47 | /* total IOs serviced, post merge */ | ||
| 48 | struct blkg_rwstat serviced; | ||
| 49 | }; | ||
| 50 | |||
| 41 | struct throtl_grp { | 51 | struct throtl_grp { |
| 42 | /* List of throtl groups on the request queue*/ | 52 | /* must be the first member */ |
| 43 | struct hlist_node tg_node; | 53 | struct blkg_policy_data pd; |
| 44 | 54 | ||
| 45 | /* active throtl group service_tree member */ | 55 | /* active throtl group service_tree member */ |
| 46 | struct rb_node rb_node; | 56 | struct rb_node rb_node; |
| @@ -52,8 +62,6 @@ struct throtl_grp { | |||
| 52 | */ | 62 | */ |
| 53 | unsigned long disptime; | 63 | unsigned long disptime; |
| 54 | 64 | ||
| 55 | struct blkio_group blkg; | ||
| 56 | atomic_t ref; | ||
| 57 | unsigned int flags; | 65 | unsigned int flags; |
| 58 | 66 | ||
| 59 | /* Two lists for READ and WRITE */ | 67 | /* Two lists for READ and WRITE */ |
| @@ -80,18 +88,18 @@ struct throtl_grp { | |||
| 80 | /* Some throttle limits got updated for the group */ | 88 | /* Some throttle limits got updated for the group */ |
| 81 | int limits_changed; | 89 | int limits_changed; |
| 82 | 90 | ||
| 83 | struct rcu_head rcu_head; | 91 | /* Per cpu stats pointer */ |
| 92 | struct tg_stats_cpu __percpu *stats_cpu; | ||
| 93 | |||
| 94 | /* List of tgs waiting for per cpu stats memory to be allocated */ | ||
| 95 | struct list_head stats_alloc_node; | ||
| 84 | }; | 96 | }; |
| 85 | 97 | ||
| 86 | struct throtl_data | 98 | struct throtl_data |
| 87 | { | 99 | { |
| 88 | /* List of throtl groups */ | ||
| 89 | struct hlist_head tg_list; | ||
| 90 | |||
| 91 | /* service tree for active throtl groups */ | 100 | /* service tree for active throtl groups */ |
| 92 | struct throtl_rb_root tg_service_tree; | 101 | struct throtl_rb_root tg_service_tree; |
| 93 | 102 | ||
| 94 | struct throtl_grp *root_tg; | ||
| 95 | struct request_queue *queue; | 103 | struct request_queue *queue; |
| 96 | 104 | ||
| 97 | /* Total Number of queued bios on READ and WRITE lists */ | 105 | /* Total Number of queued bios on READ and WRITE lists */ |
| @@ -108,6 +116,33 @@ struct throtl_data | |||
| 108 | int limits_changed; | 116 | int limits_changed; |
| 109 | }; | 117 | }; |
| 110 | 118 | ||
| 119 | /* list and work item to allocate percpu group stats */ | ||
| 120 | static DEFINE_SPINLOCK(tg_stats_alloc_lock); | ||
| 121 | static LIST_HEAD(tg_stats_alloc_list); | ||
| 122 | |||
| 123 | static void tg_stats_alloc_fn(struct work_struct *); | ||
| 124 | static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn); | ||
| 125 | |||
| 126 | static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) | ||
| 127 | { | ||
| 128 | return pd ? container_of(pd, struct throtl_grp, pd) : NULL; | ||
| 129 | } | ||
| 130 | |||
| 131 | static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg) | ||
| 132 | { | ||
| 133 | return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl)); | ||
| 134 | } | ||
| 135 | |||
| 136 | static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg) | ||
| 137 | { | ||
| 138 | return pd_to_blkg(&tg->pd); | ||
| 139 | } | ||
| 140 | |||
| 141 | static inline struct throtl_grp *td_root_tg(struct throtl_data *td) | ||
| 142 | { | ||
| 143 | return blkg_to_tg(td->queue->root_blkg); | ||
| 144 | } | ||
| 145 | |||
| 111 | enum tg_state_flags { | 146 | enum tg_state_flags { |
| 112 | THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */ | 147 | THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */ |
| 113 | }; | 148 | }; |
| @@ -128,244 +163,150 @@ static inline int throtl_tg_##name(const struct throtl_grp *tg) \ | |||
| 128 | 163 | ||
| 129 | THROTL_TG_FNS(on_rr); | 164 | THROTL_TG_FNS(on_rr); |
| 130 | 165 | ||
| 131 | #define throtl_log_tg(td, tg, fmt, args...) \ | 166 | #define throtl_log_tg(td, tg, fmt, args...) do { \ |
| 132 | blk_add_trace_msg((td)->queue, "throtl %s " fmt, \ | 167 | char __pbuf[128]; \ |
| 133 | blkg_path(&(tg)->blkg), ##args); \ | 168 | \ |
| 169 | blkg_path(tg_to_blkg(tg), __pbuf, sizeof(__pbuf)); \ | ||
| 170 | blk_add_trace_msg((td)->queue, "throtl %s " fmt, __pbuf, ##args); \ | ||
| 171 | } while (0) | ||
| 134 | 172 | ||
| 135 | #define throtl_log(td, fmt, args...) \ | 173 | #define throtl_log(td, fmt, args...) \ |
| 136 | blk_add_trace_msg((td)->queue, "throtl " fmt, ##args) | 174 | blk_add_trace_msg((td)->queue, "throtl " fmt, ##args) |
| 137 | 175 | ||
| 138 | static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg) | ||
| 139 | { | ||
| 140 | if (blkg) | ||
| 141 | return container_of(blkg, struct throtl_grp, blkg); | ||
| 142 | |||
| 143 | return NULL; | ||
| 144 | } | ||
| 145 | |||
| 146 | static inline unsigned int total_nr_queued(struct throtl_data *td) | 176 | static inline unsigned int total_nr_queued(struct throtl_data *td) |
| 147 | { | 177 | { |
| 148 | return td->nr_queued[0] + td->nr_queued[1]; | 178 | return td->nr_queued[0] + td->nr_queued[1]; |
| 149 | } | 179 | } |
| 150 | 180 | ||
| 151 | static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg) | 181 | /* |
| 152 | { | 182 | * Worker for allocating per cpu stat for tgs. This is scheduled on the |
| 153 | atomic_inc(&tg->ref); | 183 | * system_nrt_wq once there are some groups on the alloc_list waiting for |
| 154 | return tg; | 184 | * allocation. |
| 155 | } | 185 | */ |
| 156 | 186 | static void tg_stats_alloc_fn(struct work_struct *work) | |
| 157 | static void throtl_free_tg(struct rcu_head *head) | ||
| 158 | { | 187 | { |
| 159 | struct throtl_grp *tg; | 188 | static struct tg_stats_cpu *stats_cpu; /* this fn is non-reentrant */ |
| 189 | struct delayed_work *dwork = to_delayed_work(work); | ||
| 190 | bool empty = false; | ||
| 191 | |||
| 192 | alloc_stats: | ||
| 193 | if (!stats_cpu) { | ||
| 194 | stats_cpu = alloc_percpu(struct tg_stats_cpu); | ||
| 195 | if (!stats_cpu) { | ||
| 196 | /* allocation failed, try again after some time */ | ||
| 197 | queue_delayed_work(system_nrt_wq, dwork, | ||
| 198 | msecs_to_jiffies(10)); | ||
| 199 | return; | ||
| 200 | } | ||
| 201 | } | ||
| 160 | 202 | ||
| 161 | tg = container_of(head, struct throtl_grp, rcu_head); | 203 | spin_lock_irq(&tg_stats_alloc_lock); |
| 162 | free_percpu(tg->blkg.stats_cpu); | ||
| 163 | kfree(tg); | ||
| 164 | } | ||
| 165 | 204 | ||
| 166 | static void throtl_put_tg(struct throtl_grp *tg) | 205 | if (!list_empty(&tg_stats_alloc_list)) { |
| 167 | { | 206 | struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list, |
| 168 | BUG_ON(atomic_read(&tg->ref) <= 0); | 207 | struct throtl_grp, |
| 169 | if (!atomic_dec_and_test(&tg->ref)) | 208 | stats_alloc_node); |
| 170 | return; | 209 | swap(tg->stats_cpu, stats_cpu); |
| 210 | list_del_init(&tg->stats_alloc_node); | ||
| 211 | } | ||
| 171 | 212 | ||
| 172 | /* | 213 | empty = list_empty(&tg_stats_alloc_list); |
| 173 | * A group is freed in rcu manner. But having an rcu lock does not | 214 | spin_unlock_irq(&tg_stats_alloc_lock); |
| 174 | * mean that one can access all the fields of blkg and assume these | 215 | if (!empty) |
| 175 | * are valid. For example, don't try to follow throtl_data and | 216 | goto alloc_stats; |
| 176 | * request queue links. | ||
| 177 | * | ||
| 178 | * Having a reference to blkg under an rcu allows acess to only | ||
| 179 | * values local to groups like group stats and group rate limits | ||
| 180 | */ | ||
| 181 | call_rcu(&tg->rcu_head, throtl_free_tg); | ||
| 182 | } | 217 | } |
| 183 | 218 | ||
| 184 | static void throtl_init_group(struct throtl_grp *tg) | 219 | static void throtl_pd_init(struct blkcg_gq *blkg) |
| 185 | { | 220 | { |
| 186 | INIT_HLIST_NODE(&tg->tg_node); | 221 | struct throtl_grp *tg = blkg_to_tg(blkg); |
| 222 | unsigned long flags; | ||
| 223 | |||
| 187 | RB_CLEAR_NODE(&tg->rb_node); | 224 | RB_CLEAR_NODE(&tg->rb_node); |
| 188 | bio_list_init(&tg->bio_lists[0]); | 225 | bio_list_init(&tg->bio_lists[0]); |
| 189 | bio_list_init(&tg->bio_lists[1]); | 226 | bio_list_init(&tg->bio_lists[1]); |
| 190 | tg->limits_changed = false; | 227 | tg->limits_changed = false; |
| 191 | 228 | ||
| 192 | /* Practically unlimited BW */ | 229 | tg->bps[READ] = -1; |
| 193 | tg->bps[0] = tg->bps[1] = -1; | 230 | tg->bps[WRITE] = -1; |
| 194 | tg->iops[0] = tg->iops[1] = -1; | 231 | tg->iops[READ] = -1; |
| 232 | tg->iops[WRITE] = -1; | ||
| 195 | 233 | ||
| 196 | /* | 234 | /* |
| 197 | * Take the initial reference that will be released on destroy | 235 | * Ugh... We need to perform per-cpu allocation for tg->stats_cpu |
| 198 | * This can be thought of a joint reference by cgroup and | 236 | * but percpu allocator can't be called from IO path. Queue tg on |
| 199 | * request queue which will be dropped by either request queue | 237 | * tg_stats_alloc_list and allocate from work item. |
| 200 | * exit or cgroup deletion path depending on who is exiting first. | ||
| 201 | */ | 238 | */ |
| 202 | atomic_set(&tg->ref, 1); | 239 | spin_lock_irqsave(&tg_stats_alloc_lock, flags); |
| 240 | list_add(&tg->stats_alloc_node, &tg_stats_alloc_list); | ||
| 241 | queue_delayed_work(system_nrt_wq, &tg_stats_alloc_work, 0); | ||
| 242 | spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); | ||
| 203 | } | 243 | } |
| 204 | 244 | ||
| 205 | /* Should be called with rcu read lock held (needed for blkcg) */ | 245 | static void throtl_pd_exit(struct blkcg_gq *blkg) |
| 206 | static void | ||
| 207 | throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg) | ||
| 208 | { | 246 | { |
| 209 | hlist_add_head(&tg->tg_node, &td->tg_list); | 247 | struct throtl_grp *tg = blkg_to_tg(blkg); |
| 210 | td->nr_undestroyed_grps++; | 248 | unsigned long flags; |
| 211 | } | ||
| 212 | |||
| 213 | static void | ||
| 214 | __throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg) | ||
| 215 | { | ||
| 216 | struct backing_dev_info *bdi = &td->queue->backing_dev_info; | ||
| 217 | unsigned int major, minor; | ||
| 218 | |||
| 219 | if (!tg || tg->blkg.dev) | ||
| 220 | return; | ||
| 221 | |||
| 222 | /* | ||
| 223 | * Fill in device details for a group which might not have been | ||
| 224 | * filled at group creation time as queue was being instantiated | ||
| 225 | * and driver had not attached a device yet | ||
| 226 | */ | ||
| 227 | if (bdi->dev && dev_name(bdi->dev)) { | ||
| 228 | sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); | ||
| 229 | tg->blkg.dev = MKDEV(major, minor); | ||
| 230 | } | ||
| 231 | } | ||
| 232 | |||
| 233 | /* | ||
| 234 | * Should be called with without queue lock held. Here queue lock will be | ||
| 235 | * taken rarely. It will be taken only once during life time of a group | ||
| 236 | * if need be | ||
| 237 | */ | ||
| 238 | static void | ||
| 239 | throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg) | ||
| 240 | { | ||
| 241 | if (!tg || tg->blkg.dev) | ||
| 242 | return; | ||
| 243 | |||
| 244 | spin_lock_irq(td->queue->queue_lock); | ||
| 245 | __throtl_tg_fill_dev_details(td, tg); | ||
| 246 | spin_unlock_irq(td->queue->queue_lock); | ||
| 247 | } | ||
| 248 | |||
| 249 | static void throtl_init_add_tg_lists(struct throtl_data *td, | ||
| 250 | struct throtl_grp *tg, struct blkio_cgroup *blkcg) | ||
| 251 | { | ||
| 252 | __throtl_tg_fill_dev_details(td, tg); | ||
| 253 | |||
| 254 | /* Add group onto cgroup list */ | ||
| 255 | blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td, | ||
| 256 | tg->blkg.dev, BLKIO_POLICY_THROTL); | ||
| 257 | 249 | ||
| 258 | tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev); | 250 | spin_lock_irqsave(&tg_stats_alloc_lock, flags); |
| 259 | tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev); | 251 | list_del_init(&tg->stats_alloc_node); |
| 260 | tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev); | 252 | spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); |
| 261 | tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev); | ||
| 262 | 253 | ||
| 263 | throtl_add_group_to_td_list(td, tg); | 254 | free_percpu(tg->stats_cpu); |
| 264 | } | 255 | } |
| 265 | 256 | ||
| 266 | /* Should be called without queue lock and outside of rcu period */ | 257 | static void throtl_pd_reset_stats(struct blkcg_gq *blkg) |
| 267 | static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td) | ||
| 268 | { | 258 | { |
| 269 | struct throtl_grp *tg = NULL; | 259 | struct throtl_grp *tg = blkg_to_tg(blkg); |
| 270 | int ret; | 260 | int cpu; |
| 271 | 261 | ||
| 272 | tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node); | 262 | if (tg->stats_cpu == NULL) |
| 273 | if (!tg) | 263 | return; |
| 274 | return NULL; | ||
| 275 | 264 | ||
| 276 | ret = blkio_alloc_blkg_stats(&tg->blkg); | 265 | for_each_possible_cpu(cpu) { |
| 266 | struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); | ||
| 277 | 267 | ||
| 278 | if (ret) { | 268 | blkg_rwstat_reset(&sc->service_bytes); |
| 279 | kfree(tg); | 269 | blkg_rwstat_reset(&sc->serviced); |
| 280 | return NULL; | ||
| 281 | } | 270 | } |
| 282 | |||
| 283 | throtl_init_group(tg); | ||
| 284 | return tg; | ||
| 285 | } | 271 | } |
| 286 | 272 | ||
| 287 | static struct | 273 | static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td, |
| 288 | throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg) | 274 | struct blkcg *blkcg) |
| 289 | { | 275 | { |
| 290 | struct throtl_grp *tg = NULL; | ||
| 291 | void *key = td; | ||
| 292 | |||
| 293 | /* | 276 | /* |
| 294 | * This is the common case when there are no blkio cgroups. | 277 | * This is the common case when there are no blkcgs. Avoid lookup |
| 295 | * Avoid lookup in this case | 278 | * in this case |
| 296 | */ | 279 | */ |
| 297 | if (blkcg == &blkio_root_cgroup) | 280 | if (blkcg == &blkcg_root) |
| 298 | tg = td->root_tg; | 281 | return td_root_tg(td); |
| 299 | else | ||
| 300 | tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key)); | ||
| 301 | 282 | ||
| 302 | __throtl_tg_fill_dev_details(td, tg); | 283 | return blkg_to_tg(blkg_lookup(blkcg, td->queue)); |
| 303 | return tg; | ||
| 304 | } | 284 | } |
| 305 | 285 | ||
| 306 | static struct throtl_grp * throtl_get_tg(struct throtl_data *td) | 286 | static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td, |
| 287 | struct blkcg *blkcg) | ||
| 307 | { | 288 | { |
| 308 | struct throtl_grp *tg = NULL, *__tg = NULL; | ||
| 309 | struct blkio_cgroup *blkcg; | ||
| 310 | struct request_queue *q = td->queue; | 289 | struct request_queue *q = td->queue; |
| 311 | 290 | struct throtl_grp *tg = NULL; | |
| 312 | /* no throttling for dead queue */ | ||
| 313 | if (unlikely(blk_queue_dead(q))) | ||
| 314 | return NULL; | ||
| 315 | |||
| 316 | rcu_read_lock(); | ||
| 317 | blkcg = task_blkio_cgroup(current); | ||
| 318 | tg = throtl_find_tg(td, blkcg); | ||
| 319 | if (tg) { | ||
| 320 | rcu_read_unlock(); | ||
| 321 | return tg; | ||
| 322 | } | ||
| 323 | |||
| 324 | /* | ||
| 325 | * Need to allocate a group. Allocation of group also needs allocation | ||
| 326 | * of per cpu stats which in-turn takes a mutex() and can block. Hence | ||
| 327 | * we need to drop rcu lock and queue_lock before we call alloc. | ||
| 328 | */ | ||
| 329 | rcu_read_unlock(); | ||
| 330 | spin_unlock_irq(q->queue_lock); | ||
| 331 | |||
| 332 | tg = throtl_alloc_tg(td); | ||
| 333 | |||
| 334 | /* Group allocated and queue is still alive. take the lock */ | ||
| 335 | spin_lock_irq(q->queue_lock); | ||
| 336 | |||
| 337 | /* Make sure @q is still alive */ | ||
| 338 | if (unlikely(blk_queue_dead(q))) { | ||
| 339 | kfree(tg); | ||
| 340 | return NULL; | ||
| 341 | } | ||
| 342 | |||
| 343 | /* | ||
| 344 | * Initialize the new group. After sleeping, read the blkcg again. | ||
| 345 | */ | ||
| 346 | rcu_read_lock(); | ||
| 347 | blkcg = task_blkio_cgroup(current); | ||
| 348 | 291 | ||
| 349 | /* | 292 | /* |
| 350 | * If some other thread already allocated the group while we were | 293 | * This is the common case when there are no blkcgs. Avoid lookup |
| 351 | * not holding queue lock, free up the group | 294 | * in this case |
| 352 | */ | 295 | */ |
| 353 | __tg = throtl_find_tg(td, blkcg); | 296 | if (blkcg == &blkcg_root) { |
| 354 | 297 | tg = td_root_tg(td); | |
| 355 | if (__tg) { | 298 | } else { |
| 356 | kfree(tg); | 299 | struct blkcg_gq *blkg; |
| 357 | rcu_read_unlock(); | 300 | |
| 358 | return __tg; | 301 | blkg = blkg_lookup_create(blkcg, q); |
| 359 | } | 302 | |
| 360 | 303 | /* if %NULL and @q is alive, fall back to root_tg */ | |
| 361 | /* Group allocation failed. Account the IO to root group */ | 304 | if (!IS_ERR(blkg)) |
| 362 | if (!tg) { | 305 | tg = blkg_to_tg(blkg); |
| 363 | tg = td->root_tg; | 306 | else if (!blk_queue_dead(q)) |
| 364 | return tg; | 307 | tg = td_root_tg(td); |
| 365 | } | 308 | } |
| 366 | 309 | ||
| 367 | throtl_init_add_tg_lists(td, tg, blkcg); | ||
| 368 | rcu_read_unlock(); | ||
| 369 | return tg; | 310 | return tg; |
| 370 | } | 311 | } |
| 371 | 312 | ||
| @@ -734,16 +675,41 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, | |||
| 734 | return 0; | 675 | return 0; |
| 735 | } | 676 | } |
| 736 | 677 | ||
| 678 | static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes, | ||
| 679 | int rw) | ||
| 680 | { | ||
| 681 | struct throtl_grp *tg = blkg_to_tg(blkg); | ||
| 682 | struct tg_stats_cpu *stats_cpu; | ||
| 683 | unsigned long flags; | ||
| 684 | |||
| 685 | /* If per cpu stats are not allocated yet, don't do any accounting. */ | ||
| 686 | if (tg->stats_cpu == NULL) | ||
| 687 | return; | ||
| 688 | |||
| 689 | /* | ||
| 690 | * Disabling interrupts to provide mutual exclusion between two | ||
| 691 | * writes on same cpu. It probably is not needed for 64bit. Not | ||
| 692 | * optimizing that case yet. | ||
| 693 | */ | ||
| 694 | local_irq_save(flags); | ||
| 695 | |||
| 696 | stats_cpu = this_cpu_ptr(tg->stats_cpu); | ||
| 697 | |||
| 698 | blkg_rwstat_add(&stats_cpu->serviced, rw, 1); | ||
| 699 | blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes); | ||
| 700 | |||
| 701 | local_irq_restore(flags); | ||
| 702 | } | ||
| 703 | |||
| 737 | static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) | 704 | static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) |
| 738 | { | 705 | { |
| 739 | bool rw = bio_data_dir(bio); | 706 | bool rw = bio_data_dir(bio); |
| 740 | bool sync = rw_is_sync(bio->bi_rw); | ||
| 741 | 707 | ||
| 742 | /* Charge the bio to the group */ | 708 | /* Charge the bio to the group */ |
| 743 | tg->bytes_disp[rw] += bio->bi_size; | 709 | tg->bytes_disp[rw] += bio->bi_size; |
| 744 | tg->io_disp[rw]++; | 710 | tg->io_disp[rw]++; |
| 745 | 711 | ||
| 746 | blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync); | 712 | throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw); |
| 747 | } | 713 | } |
| 748 | 714 | ||
| 749 | static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, | 715 | static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, |
| @@ -753,7 +719,7 @@ static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, | |||
| 753 | 719 | ||
| 754 | bio_list_add(&tg->bio_lists[rw], bio); | 720 | bio_list_add(&tg->bio_lists[rw], bio); |
| 755 | /* Take a bio reference on tg */ | 721 | /* Take a bio reference on tg */ |
| 756 | throtl_ref_get_tg(tg); | 722 | blkg_get(tg_to_blkg(tg)); |
| 757 | tg->nr_queued[rw]++; | 723 | tg->nr_queued[rw]++; |
| 758 | td->nr_queued[rw]++; | 724 | td->nr_queued[rw]++; |
| 759 | throtl_enqueue_tg(td, tg); | 725 | throtl_enqueue_tg(td, tg); |
| @@ -786,8 +752,8 @@ static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg, | |||
| 786 | 752 | ||
| 787 | bio = bio_list_pop(&tg->bio_lists[rw]); | 753 | bio = bio_list_pop(&tg->bio_lists[rw]); |
| 788 | tg->nr_queued[rw]--; | 754 | tg->nr_queued[rw]--; |
| 789 | /* Drop bio reference on tg */ | 755 | /* Drop bio reference on blkg */ |
| 790 | throtl_put_tg(tg); | 756 | blkg_put(tg_to_blkg(tg)); |
| 791 | 757 | ||
| 792 | BUG_ON(td->nr_queued[rw] <= 0); | 758 | BUG_ON(td->nr_queued[rw] <= 0); |
| 793 | td->nr_queued[rw]--; | 759 | td->nr_queued[rw]--; |
| @@ -865,8 +831,8 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl) | |||
| 865 | 831 | ||
| 866 | static void throtl_process_limit_change(struct throtl_data *td) | 832 | static void throtl_process_limit_change(struct throtl_data *td) |
| 867 | { | 833 | { |
| 868 | struct throtl_grp *tg; | 834 | struct request_queue *q = td->queue; |
| 869 | struct hlist_node *pos, *n; | 835 | struct blkcg_gq *blkg, *n; |
| 870 | 836 | ||
| 871 | if (!td->limits_changed) | 837 | if (!td->limits_changed) |
| 872 | return; | 838 | return; |
| @@ -875,7 +841,9 @@ static void throtl_process_limit_change(struct throtl_data *td) | |||
| 875 | 841 | ||
| 876 | throtl_log(td, "limits changed"); | 842 | throtl_log(td, "limits changed"); |
| 877 | 843 | ||
| 878 | hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { | 844 | list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { |
| 845 | struct throtl_grp *tg = blkg_to_tg(blkg); | ||
| 846 | |||
| 879 | if (!tg->limits_changed) | 847 | if (!tg->limits_changed) |
| 880 | continue; | 848 | continue; |
| 881 | 849 | ||
| @@ -973,120 +941,159 @@ throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay) | |||
| 973 | } | 941 | } |
| 974 | } | 942 | } |
| 975 | 943 | ||
| 976 | static void | 944 | static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, |
| 977 | throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg) | 945 | struct blkg_policy_data *pd, int off) |
| 978 | { | 946 | { |
| 979 | /* Something wrong if we are trying to remove same group twice */ | 947 | struct throtl_grp *tg = pd_to_tg(pd); |
| 980 | BUG_ON(hlist_unhashed(&tg->tg_node)); | 948 | struct blkg_rwstat rwstat = { }, tmp; |
| 949 | int i, cpu; | ||
| 981 | 950 | ||
| 982 | hlist_del_init(&tg->tg_node); | 951 | for_each_possible_cpu(cpu) { |
| 952 | struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); | ||
| 983 | 953 | ||
| 984 | /* | 954 | tmp = blkg_rwstat_read((void *)sc + off); |
| 985 | * Put the reference taken at the time of creation so that when all | 955 | for (i = 0; i < BLKG_RWSTAT_NR; i++) |
| 986 | * queues are gone, group can be destroyed. | 956 | rwstat.cnt[i] += tmp.cnt[i]; |
| 987 | */ | 957 | } |
| 988 | throtl_put_tg(tg); | 958 | |
| 989 | td->nr_undestroyed_grps--; | 959 | return __blkg_prfill_rwstat(sf, pd, &rwstat); |
| 990 | } | 960 | } |
| 991 | 961 | ||
| 992 | static void throtl_release_tgs(struct throtl_data *td) | 962 | static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft, |
| 963 | struct seq_file *sf) | ||
| 993 | { | 964 | { |
| 994 | struct hlist_node *pos, *n; | 965 | struct blkcg *blkcg = cgroup_to_blkcg(cgrp); |
| 995 | struct throtl_grp *tg; | ||
| 996 | 966 | ||
| 997 | hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { | 967 | blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl, |
| 998 | /* | 968 | cft->private, true); |
| 999 | * If cgroup removal path got to blk_group first and removed | 969 | return 0; |
| 1000 | * it from cgroup list, then it will take care of destroying | ||
| 1001 | * cfqg also. | ||
| 1002 | */ | ||
| 1003 | if (!blkiocg_del_blkio_group(&tg->blkg)) | ||
| 1004 | throtl_destroy_tg(td, tg); | ||
| 1005 | } | ||
| 1006 | } | 970 | } |
| 1007 | 971 | ||
| 1008 | /* | 972 | static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, |
| 1009 | * Blk cgroup controller notification saying that blkio_group object is being | 973 | int off) |
| 1010 | * delinked as associated cgroup object is going away. That also means that | ||
| 1011 | * no new IO will come in this group. So get rid of this group as soon as | ||
| 1012 | * any pending IO in the group is finished. | ||
| 1013 | * | ||
| 1014 | * This function is called under rcu_read_lock(). key is the rcu protected | ||
| 1015 | * pointer. That means "key" is a valid throtl_data pointer as long as we are | ||
| 1016 | * rcu read lock. | ||
| 1017 | * | ||
| 1018 | * "key" was fetched from blkio_group under blkio_cgroup->lock. That means | ||
| 1019 | * it should not be NULL as even if queue was going away, cgroup deltion | ||
| 1020 | * path got to it first. | ||
| 1021 | */ | ||
| 1022 | void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg) | ||
| 1023 | { | 974 | { |
| 1024 | unsigned long flags; | 975 | struct throtl_grp *tg = pd_to_tg(pd); |
| 1025 | struct throtl_data *td = key; | 976 | u64 v = *(u64 *)((void *)tg + off); |
| 1026 | 977 | ||
| 1027 | spin_lock_irqsave(td->queue->queue_lock, flags); | 978 | if (v == -1) |
| 1028 | throtl_destroy_tg(td, tg_of_blkg(blkg)); | 979 | return 0; |
| 1029 | spin_unlock_irqrestore(td->queue->queue_lock, flags); | 980 | return __blkg_prfill_u64(sf, pd, v); |
| 1030 | } | 981 | } |
| 1031 | 982 | ||
| 1032 | static void throtl_update_blkio_group_common(struct throtl_data *td, | 983 | static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd, |
| 1033 | struct throtl_grp *tg) | 984 | int off) |
| 1034 | { | 985 | { |
| 1035 | xchg(&tg->limits_changed, true); | 986 | struct throtl_grp *tg = pd_to_tg(pd); |
| 1036 | xchg(&td->limits_changed, true); | 987 | unsigned int v = *(unsigned int *)((void *)tg + off); |
| 1037 | /* Schedule a work now to process the limit change */ | 988 | |
| 1038 | throtl_schedule_delayed_work(td, 0); | 989 | if (v == -1) |
| 990 | return 0; | ||
| 991 | return __blkg_prfill_u64(sf, pd, v); | ||
| 1039 | } | 992 | } |
| 1040 | 993 | ||
| 1041 | /* | 994 | static int tg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft, |
| 1042 | * For all update functions, key should be a valid pointer because these | 995 | struct seq_file *sf) |
| 1043 | * update functions are called under blkcg_lock, that means, blkg is | ||
| 1044 | * valid and in turn key is valid. queue exit path can not race because | ||
| 1045 | * of blkcg_lock | ||
| 1046 | * | ||
| 1047 | * Can not take queue lock in update functions as queue lock under blkcg_lock | ||
| 1048 | * is not allowed. Under other paths we take blkcg_lock under queue_lock. | ||
| 1049 | */ | ||
| 1050 | static void throtl_update_blkio_group_read_bps(void *key, | ||
| 1051 | struct blkio_group *blkg, u64 read_bps) | ||
| 1052 | { | 996 | { |
| 1053 | struct throtl_data *td = key; | 997 | blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_u64, |
| 1054 | struct throtl_grp *tg = tg_of_blkg(blkg); | 998 | &blkcg_policy_throtl, cft->private, false); |
| 1055 | 999 | return 0; | |
| 1056 | tg->bps[READ] = read_bps; | ||
| 1057 | throtl_update_blkio_group_common(td, tg); | ||
| 1058 | } | 1000 | } |
| 1059 | 1001 | ||
| 1060 | static void throtl_update_blkio_group_write_bps(void *key, | 1002 | static int tg_print_conf_uint(struct cgroup *cgrp, struct cftype *cft, |
| 1061 | struct blkio_group *blkg, u64 write_bps) | 1003 | struct seq_file *sf) |
| 1062 | { | 1004 | { |
| 1063 | struct throtl_data *td = key; | 1005 | blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_uint, |
| 1064 | struct throtl_grp *tg = tg_of_blkg(blkg); | 1006 | &blkcg_policy_throtl, cft->private, false); |
| 1065 | 1007 | return 0; | |
| 1066 | tg->bps[WRITE] = write_bps; | ||
| 1067 | throtl_update_blkio_group_common(td, tg); | ||
| 1068 | } | 1008 | } |
| 1069 | 1009 | ||
| 1070 | static void throtl_update_blkio_group_read_iops(void *key, | 1010 | static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf, |
| 1071 | struct blkio_group *blkg, unsigned int read_iops) | 1011 | bool is_u64) |
| 1072 | { | 1012 | { |
| 1073 | struct throtl_data *td = key; | 1013 | struct blkcg *blkcg = cgroup_to_blkcg(cgrp); |
| 1074 | struct throtl_grp *tg = tg_of_blkg(blkg); | 1014 | struct blkg_conf_ctx ctx; |
| 1015 | struct throtl_grp *tg; | ||
| 1016 | struct throtl_data *td; | ||
| 1017 | int ret; | ||
| 1018 | |||
| 1019 | ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); | ||
| 1020 | if (ret) | ||
| 1021 | return ret; | ||
| 1022 | |||
| 1023 | tg = blkg_to_tg(ctx.blkg); | ||
| 1024 | td = ctx.blkg->q->td; | ||
| 1025 | |||
| 1026 | if (!ctx.v) | ||
| 1027 | ctx.v = -1; | ||
| 1028 | |||
| 1029 | if (is_u64) | ||
| 1030 | *(u64 *)((void *)tg + cft->private) = ctx.v; | ||
| 1031 | else | ||
| 1032 | *(unsigned int *)((void *)tg + cft->private) = ctx.v; | ||
| 1033 | |||
| 1034 | /* XXX: we don't need the following deferred processing */ | ||
| 1035 | xchg(&tg->limits_changed, true); | ||
| 1036 | xchg(&td->limits_changed, true); | ||
| 1037 | throtl_schedule_delayed_work(td, 0); | ||
| 1075 | 1038 | ||
| 1076 | tg->iops[READ] = read_iops; | 1039 | blkg_conf_finish(&ctx); |
| 1077 | throtl_update_blkio_group_common(td, tg); | 1040 | return 0; |
| 1078 | } | 1041 | } |
| 1079 | 1042 | ||
| 1080 | static void throtl_update_blkio_group_write_iops(void *key, | 1043 | static int tg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft, |
| 1081 | struct blkio_group *blkg, unsigned int write_iops) | 1044 | const char *buf) |
| 1082 | { | 1045 | { |
| 1083 | struct throtl_data *td = key; | 1046 | return tg_set_conf(cgrp, cft, buf, true); |
| 1084 | struct throtl_grp *tg = tg_of_blkg(blkg); | 1047 | } |
| 1085 | 1048 | ||
| 1086 | tg->iops[WRITE] = write_iops; | 1049 | static int tg_set_conf_uint(struct cgroup *cgrp, struct cftype *cft, |
| 1087 | throtl_update_blkio_group_common(td, tg); | 1050 | const char *buf) |
| 1051 | { | ||
| 1052 | return tg_set_conf(cgrp, cft, buf, false); | ||
| 1088 | } | 1053 | } |
| 1089 | 1054 | ||
| 1055 | static struct cftype throtl_files[] = { | ||
| 1056 | { | ||
| 1057 | .name = "throttle.read_bps_device", | ||
| 1058 | .private = offsetof(struct throtl_grp, bps[READ]), | ||
| 1059 | .read_seq_string = tg_print_conf_u64, | ||
| 1060 | .write_string = tg_set_conf_u64, | ||
| 1061 | .max_write_len = 256, | ||
| 1062 | }, | ||
| 1063 | { | ||
| 1064 | .name = "throttle.write_bps_device", | ||
| 1065 | .private = offsetof(struct throtl_grp, bps[WRITE]), | ||
| 1066 | .read_seq_string = tg_print_conf_u64, | ||
| 1067 | .write_string = tg_set_conf_u64, | ||
| 1068 | .max_write_len = 256, | ||
| 1069 | }, | ||
| 1070 | { | ||
| 1071 | .name = "throttle.read_iops_device", | ||
| 1072 | .private = offsetof(struct throtl_grp, iops[READ]), | ||
| 1073 | .read_seq_string = tg_print_conf_uint, | ||
| 1074 | .write_string = tg_set_conf_uint, | ||
| 1075 | .max_write_len = 256, | ||
| 1076 | }, | ||
| 1077 | { | ||
| 1078 | .name = "throttle.write_iops_device", | ||
| 1079 | .private = offsetof(struct throtl_grp, iops[WRITE]), | ||
| 1080 | .read_seq_string = tg_print_conf_uint, | ||
| 1081 | .write_string = tg_set_conf_uint, | ||
| 1082 | .max_write_len = 256, | ||
| 1083 | }, | ||
| 1084 | { | ||
| 1085 | .name = "throttle.io_service_bytes", | ||
| 1086 | .private = offsetof(struct tg_stats_cpu, service_bytes), | ||
| 1087 | .read_seq_string = tg_print_cpu_rwstat, | ||
| 1088 | }, | ||
| 1089 | { | ||
| 1090 | .name = "throttle.io_serviced", | ||
| 1091 | .private = offsetof(struct tg_stats_cpu, serviced), | ||
| 1092 | .read_seq_string = tg_print_cpu_rwstat, | ||
| 1093 | }, | ||
| 1094 | { } /* terminate */ | ||
| 1095 | }; | ||
| 1096 | |||
| 1090 | static void throtl_shutdown_wq(struct request_queue *q) | 1097 | static void throtl_shutdown_wq(struct request_queue *q) |
| 1091 | { | 1098 | { |
| 1092 | struct throtl_data *td = q->td; | 1099 | struct throtl_data *td = q->td; |
| @@ -1094,19 +1101,13 @@ static void throtl_shutdown_wq(struct request_queue *q) | |||
| 1094 | cancel_delayed_work_sync(&td->throtl_work); | 1101 | cancel_delayed_work_sync(&td->throtl_work); |
| 1095 | } | 1102 | } |
| 1096 | 1103 | ||
| 1097 | static struct blkio_policy_type blkio_policy_throtl = { | 1104 | static struct blkcg_policy blkcg_policy_throtl = { |
| 1098 | .ops = { | 1105 | .pd_size = sizeof(struct throtl_grp), |
| 1099 | .blkio_unlink_group_fn = throtl_unlink_blkio_group, | 1106 | .cftypes = throtl_files, |
| 1100 | .blkio_update_group_read_bps_fn = | 1107 | |
| 1101 | throtl_update_blkio_group_read_bps, | 1108 | .pd_init_fn = throtl_pd_init, |
| 1102 | .blkio_update_group_write_bps_fn = | 1109 | .pd_exit_fn = throtl_pd_exit, |
| 1103 | throtl_update_blkio_group_write_bps, | 1110 | .pd_reset_stats_fn = throtl_pd_reset_stats, |
| 1104 | .blkio_update_group_read_iops_fn = | ||
| 1105 | throtl_update_blkio_group_read_iops, | ||
| 1106 | .blkio_update_group_write_iops_fn = | ||
| 1107 | throtl_update_blkio_group_write_iops, | ||
| 1108 | }, | ||
| 1109 | .plid = BLKIO_POLICY_THROTL, | ||
| 1110 | }; | 1111 | }; |
| 1111 | 1112 | ||
| 1112 | bool blk_throtl_bio(struct request_queue *q, struct bio *bio) | 1113 | bool blk_throtl_bio(struct request_queue *q, struct bio *bio) |
| @@ -1114,7 +1115,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) | |||
| 1114 | struct throtl_data *td = q->td; | 1115 | struct throtl_data *td = q->td; |
| 1115 | struct throtl_grp *tg; | 1116 | struct throtl_grp *tg; |
| 1116 | bool rw = bio_data_dir(bio), update_disptime = true; | 1117 | bool rw = bio_data_dir(bio), update_disptime = true; |
| 1117 | struct blkio_cgroup *blkcg; | 1118 | struct blkcg *blkcg; |
| 1118 | bool throttled = false; | 1119 | bool throttled = false; |
| 1119 | 1120 | ||
| 1120 | if (bio->bi_rw & REQ_THROTTLED) { | 1121 | if (bio->bi_rw & REQ_THROTTLED) { |
| @@ -1122,33 +1123,31 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) | |||
| 1122 | goto out; | 1123 | goto out; |
| 1123 | } | 1124 | } |
| 1124 | 1125 | ||
| 1126 | /* bio_associate_current() needs ioc, try creating */ | ||
| 1127 | create_io_context(GFP_ATOMIC, q->node); | ||
| 1128 | |||
| 1125 | /* | 1129 | /* |
| 1126 | * A throtl_grp pointer retrieved under rcu can be used to access | 1130 | * A throtl_grp pointer retrieved under rcu can be used to access |
| 1127 | * basic fields like stats and io rates. If a group has no rules, | 1131 | * basic fields like stats and io rates. If a group has no rules, |
| 1128 | * just update the dispatch stats in lockless manner and return. | 1132 | * just update the dispatch stats in lockless manner and return. |
| 1129 | */ | 1133 | */ |
| 1130 | |||
| 1131 | rcu_read_lock(); | 1134 | rcu_read_lock(); |
| 1132 | blkcg = task_blkio_cgroup(current); | 1135 | blkcg = bio_blkcg(bio); |
| 1133 | tg = throtl_find_tg(td, blkcg); | 1136 | tg = throtl_lookup_tg(td, blkcg); |
| 1134 | if (tg) { | 1137 | if (tg) { |
| 1135 | throtl_tg_fill_dev_details(td, tg); | ||
| 1136 | |||
| 1137 | if (tg_no_rule_group(tg, rw)) { | 1138 | if (tg_no_rule_group(tg, rw)) { |
| 1138 | blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, | 1139 | throtl_update_dispatch_stats(tg_to_blkg(tg), |
| 1139 | rw, rw_is_sync(bio->bi_rw)); | 1140 | bio->bi_size, bio->bi_rw); |
| 1140 | rcu_read_unlock(); | 1141 | goto out_unlock_rcu; |
| 1141 | goto out; | ||
| 1142 | } | 1142 | } |
| 1143 | } | 1143 | } |
| 1144 | rcu_read_unlock(); | ||
| 1145 | 1144 | ||
| 1146 | /* | 1145 | /* |
| 1147 | * Either group has not been allocated yet or it is not an unlimited | 1146 | * Either group has not been allocated yet or it is not an unlimited |
| 1148 | * IO group | 1147 | * IO group |
| 1149 | */ | 1148 | */ |
| 1150 | spin_lock_irq(q->queue_lock); | 1149 | spin_lock_irq(q->queue_lock); |
| 1151 | tg = throtl_get_tg(td); | 1150 | tg = throtl_lookup_create_tg(td, blkcg); |
| 1152 | if (unlikely(!tg)) | 1151 | if (unlikely(!tg)) |
| 1153 | goto out_unlock; | 1152 | goto out_unlock; |
| 1154 | 1153 | ||
| @@ -1189,6 +1188,7 @@ queue_bio: | |||
| 1189 | tg->io_disp[rw], tg->iops[rw], | 1188 | tg->io_disp[rw], tg->iops[rw], |
| 1190 | tg->nr_queued[READ], tg->nr_queued[WRITE]); | 1189 | tg->nr_queued[READ], tg->nr_queued[WRITE]); |
| 1191 | 1190 | ||
| 1191 | bio_associate_current(bio); | ||
| 1192 | throtl_add_bio_tg(q->td, tg, bio); | 1192 | throtl_add_bio_tg(q->td, tg, bio); |
| 1193 | throttled = true; | 1193 | throttled = true; |
| 1194 | 1194 | ||
| @@ -1199,6 +1199,8 @@ queue_bio: | |||
| 1199 | 1199 | ||
| 1200 | out_unlock: | 1200 | out_unlock: |
| 1201 | spin_unlock_irq(q->queue_lock); | 1201 | spin_unlock_irq(q->queue_lock); |
| 1202 | out_unlock_rcu: | ||
| 1203 | rcu_read_unlock(); | ||
| 1202 | out: | 1204 | out: |
| 1203 | return throttled; | 1205 | return throttled; |
| 1204 | } | 1206 | } |
| @@ -1241,79 +1243,31 @@ void blk_throtl_drain(struct request_queue *q) | |||
| 1241 | int blk_throtl_init(struct request_queue *q) | 1243 | int blk_throtl_init(struct request_queue *q) |
| 1242 | { | 1244 | { |
| 1243 | struct throtl_data *td; | 1245 | struct throtl_data *td; |
| 1244 | struct throtl_grp *tg; | 1246 | int ret; |
| 1245 | 1247 | ||
| 1246 | td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); | 1248 | td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); |
| 1247 | if (!td) | 1249 | if (!td) |
| 1248 | return -ENOMEM; | 1250 | return -ENOMEM; |
| 1249 | 1251 | ||
| 1250 | INIT_HLIST_HEAD(&td->tg_list); | ||
| 1251 | td->tg_service_tree = THROTL_RB_ROOT; | 1252 | td->tg_service_tree = THROTL_RB_ROOT; |
| 1252 | td->limits_changed = false; | 1253 | td->limits_changed = false; |
| 1253 | INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); | 1254 | INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); |
| 1254 | 1255 | ||
| 1255 | /* alloc and Init root group. */ | 1256 | q->td = td; |
| 1256 | td->queue = q; | 1257 | td->queue = q; |
| 1257 | tg = throtl_alloc_tg(td); | ||
| 1258 | 1258 | ||
| 1259 | if (!tg) { | 1259 | /* activate policy */ |
| 1260 | ret = blkcg_activate_policy(q, &blkcg_policy_throtl); | ||
| 1261 | if (ret) | ||
| 1260 | kfree(td); | 1262 | kfree(td); |
| 1261 | return -ENOMEM; | 1263 | return ret; |
| 1262 | } | ||
| 1263 | |||
| 1264 | td->root_tg = tg; | ||
| 1265 | |||
| 1266 | rcu_read_lock(); | ||
| 1267 | throtl_init_add_tg_lists(td, tg, &blkio_root_cgroup); | ||
| 1268 | rcu_read_unlock(); | ||
| 1269 | |||
| 1270 | /* Attach throtl data to request queue */ | ||
| 1271 | q->td = td; | ||
| 1272 | return 0; | ||
| 1273 | } | 1264 | } |
| 1274 | 1265 | ||
| 1275 | void blk_throtl_exit(struct request_queue *q) | 1266 | void blk_throtl_exit(struct request_queue *q) |
| 1276 | { | 1267 | { |
| 1277 | struct throtl_data *td = q->td; | 1268 | BUG_ON(!q->td); |
| 1278 | bool wait = false; | ||
| 1279 | |||
| 1280 | BUG_ON(!td); | ||
| 1281 | |||
| 1282 | throtl_shutdown_wq(q); | ||
| 1283 | |||
| 1284 | spin_lock_irq(q->queue_lock); | ||
| 1285 | throtl_release_tgs(td); | ||
| 1286 | |||
| 1287 | /* If there are other groups */ | ||
| 1288 | if (td->nr_undestroyed_grps > 0) | ||
| 1289 | wait = true; | ||
| 1290 | |||
| 1291 | spin_unlock_irq(q->queue_lock); | ||
| 1292 | |||
| 1293 | /* | ||
| 1294 | * Wait for tg->blkg->key accessors to exit their grace periods. | ||
| 1295 | * Do this wait only if there are other undestroyed groups out | ||
| 1296 | * there (other than root group). This can happen if cgroup deletion | ||
| 1297 | * path claimed the responsibility of cleaning up a group before | ||
| 1298 | * queue cleanup code get to the group. | ||
| 1299 | * | ||
| 1300 | * Do not call synchronize_rcu() unconditionally as there are drivers | ||
| 1301 | * which create/delete request queue hundreds of times during scan/boot | ||
| 1302 | * and synchronize_rcu() can take significant time and slow down boot. | ||
| 1303 | */ | ||
| 1304 | if (wait) | ||
| 1305 | synchronize_rcu(); | ||
| 1306 | |||
| 1307 | /* | ||
| 1308 | * Just being safe to make sure after previous flush if some body did | ||
| 1309 | * update limits through cgroup and another work got queued, cancel | ||
| 1310 | * it. | ||
| 1311 | */ | ||
| 1312 | throtl_shutdown_wq(q); | 1269 | throtl_shutdown_wq(q); |
| 1313 | } | 1270 | blkcg_deactivate_policy(q, &blkcg_policy_throtl); |
| 1314 | |||
| 1315 | void blk_throtl_release(struct request_queue *q) | ||
| 1316 | { | ||
| 1317 | kfree(q->td); | 1271 | kfree(q->td); |
| 1318 | } | 1272 | } |
| 1319 | 1273 | ||
| @@ -1323,8 +1277,7 @@ static int __init throtl_init(void) | |||
| 1323 | if (!kthrotld_workqueue) | 1277 | if (!kthrotld_workqueue) |
| 1324 | panic("Failed to create kthrotld\n"); | 1278 | panic("Failed to create kthrotld\n"); |
| 1325 | 1279 | ||
| 1326 | blkio_policy_register(&blkio_policy_throtl); | 1280 | return blkcg_policy_register(&blkcg_policy_throtl); |
| 1327 | return 0; | ||
| 1328 | } | 1281 | } |
| 1329 | 1282 | ||
| 1330 | module_init(throtl_init); | 1283 | module_init(throtl_init); |
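The pd_to_tg()/blkg_to_tg() helpers added above are the usual first-member-embedding idiom: the blkcg core allocates pd_size bytes per group, hands the policy a struct blkg_policy_data pointer, and the policy recovers its own structure with container_of(). A tiny standalone userspace re-implementation (stand-in structs only, not the kernel code) shows the mechanics:

```c
/* Userspace illustration of the container_of idiom behind pd_to_tg();
 * the structs are trimmed-down stand-ins, not the kernel definitions. */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct blkg_policy_data { int dummy; };		/* stand-in */

struct throtl_grp {				/* trimmed-down stand-in */
	struct blkg_policy_data pd;		/* must be the first member */
	unsigned long long bps[2];		/* READ/WRITE bytes-per-second limits */
};

static struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
{
	return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
}

int main(void)
{
	struct throtl_grp tg = { .bps = { 1048576, 2097152 } };
	struct blkg_policy_data *pd = &tg.pd;	/* what the blkcg core passes around */

	printf("read bps limit: %llu\n", pd_to_tg(pd)->bps[0]);
	return 0;
}
```

Because pd sits at offset zero, the recovered throtl_grp pointer is numerically the same as pd itself; that is what lets the core allocate a single pd_size blob per group while the policy treats it as its own type.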
