author     Linus Torvalds <torvalds@linux-foundation.org>  2012-05-30 11:52:42 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2012-05-30 11:52:42 -0400
commit     0d167518e045cc8bb63f0a8a0a85ad4fa4e0044f
tree       101a9b5d425d79f663e4f25f1e90b7a8cc6604f1
parent     2f83766d4b18774c856329a8fca4c9338dfeda39
parent     ff26eaadf4d914e397872b99885d45756104e9ae
Merge branch 'for-3.5/core' of git://git.kernel.dk/linux-block
Merge block/IO core bits from Jens Axboe:
"This is a bit bigger on the core side than usual, but that is purely
because we decided to hold off on parts of Tejun's submission on 3.4
to give it a bit more time to simmer. As a consequence, it's seen a
long cycle in for-next.
It contains:
- Bug fix from Dan, wrong locking type.
- Relax splice gifting restriction from Eric.
- A ton of updates from Tejun, primarily for blkcg. This improves
the code a lot, making the API nicer and cleaner, and also includes
fixes for how we handle and tie policies and re-activate on
switches. The changes also include generic bug fixes.
- A simple fix from Vivek, along with a fix for doing proper delayed
allocation of the blkcg stats."
Fix up annoying conflict just due to different merge resolution in
Documentation/feature-removal-schedule.txt
* 'for-3.5/core' of git://git.kernel.dk/linux-block: (92 commits)
blkcg: tg_stats_alloc_lock is an irq lock
vmsplice: relax alignement requirements for SPLICE_F_GIFT
blkcg: use radix tree to index blkgs from blkcg
blkcg: fix blkcg->css ref leak in __blkg_lookup_create()
block: fix elvpriv allocation failure handling
block: collapse blk_alloc_request() into get_request()
blkcg: collapse blkcg_policy_ops into blkcg_policy
blkcg: embed struct blkg_policy_data in policy specific data
blkcg: mass rename of blkcg API
blkcg: style cleanups for blk-cgroup.h
blkcg: remove blkio_group->path[]
blkcg: blkg_rwstat_read() was missing inline
blkcg: shoot down blkgs if all policies are deactivated
blkcg: drop stuff unused after per-queue policy activation update
blkcg: implement per-queue policy activation
blkcg: add request_queue->root_blkg
blkcg: make request_queue bypassing on allocation
blkcg: make sure blkg_lookup() returns %NULL if @q is bypassing
blkcg: make blkg_conf_prep() take @pol and return with queue lock held
blkcg: remove static policy ID enums
...
-rw-r--r--  block/Kconfig.iosched      |    4
-rw-r--r--  block/blk-cgroup.c         | 2100
-rw-r--r--  block/blk-cgroup.h         |  647
-rw-r--r--  block/blk-core.c           |  281
-rw-r--r--  block/blk-ioc.c            |  126
-rw-r--r--  block/blk-sysfs.c          |    6
-rw-r--r--  block/blk-throttle.c       |  697
-rw-r--r--  block/blk.h                |   32
-rw-r--r--  block/cfq-iosched.c        | 1072
-rw-r--r--  block/cfq.h                |  115
-rw-r--r--  block/deadline-iosched.c   |    8
-rw-r--r--  block/elevator.c           |  121
-rw-r--r--  block/noop-iosched.c       |    8
-rw-r--r--  fs/bio.c                   |   61
-rw-r--r--  fs/ioprio.c                |    2
-rw-r--r--  fs/splice.c                |    4
-rw-r--r--  include/linux/bio.h        |    8
-rw-r--r--  include/linux/blk_types.h  |   10
-rw-r--r--  include/linux/blkdev.h     |   20
-rw-r--r--  include/linux/elevator.h   |    8
-rw-r--r--  include/linux/iocontext.h  |   39
-rw-r--r--  include/linux/ioprio.h     |   22
-rw-r--r--  init/Kconfig               |    2
-rw-r--r--  kernel/fork.c              |    5
24 files changed, 2446 insertions(+), 2952 deletions(-)
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 3199b76f795d..421bef9c4c48 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -23,8 +23,6 @@ config IOSCHED_DEADLINE
 
 config IOSCHED_CFQ
 	tristate "CFQ I/O scheduler"
-	# If BLK_CGROUP is a module, CFQ has to be built as module.
-	depends on (BLK_CGROUP=m && m) || !BLK_CGROUP || BLK_CGROUP=y
 	default y
 	---help---
 	  The CFQ I/O scheduler tries to distribute bandwidth equally
@@ -34,8 +32,6 @@ config IOSCHED_CFQ
 
 	  This is the default I/O scheduler.
 
-	  Note: If BLK_CGROUP=m, then CFQ can be built only as module.
-
 config CFQ_GROUP_IOSCHED
 	bool "CFQ Group Scheduling support"
 	depends on IOSCHED_CFQ && BLK_CGROUP
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 126c341955de..02cf6335e9bd 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -11,1570 +11,612 @@
11 | * Nauman Rafique <nauman@google.com> | 11 | * Nauman Rafique <nauman@google.com> |
12 | */ | 12 | */ |
13 | #include <linux/ioprio.h> | 13 | #include <linux/ioprio.h> |
14 | #include <linux/seq_file.h> | ||
15 | #include <linux/kdev_t.h> | 14 | #include <linux/kdev_t.h> |
16 | #include <linux/module.h> | 15 | #include <linux/module.h> |
17 | #include <linux/err.h> | 16 | #include <linux/err.h> |
18 | #include <linux/blkdev.h> | 17 | #include <linux/blkdev.h> |
19 | #include <linux/slab.h> | 18 | #include <linux/slab.h> |
20 | #include "blk-cgroup.h" | ||
21 | #include <linux/genhd.h> | 19 | #include <linux/genhd.h> |
20 | #include <linux/delay.h> | ||
21 | #include <linux/atomic.h> | ||
22 | #include "blk-cgroup.h" | ||
23 | #include "blk.h" | ||
22 | 24 | ||
23 | #define MAX_KEY_LEN 100 | 25 | #define MAX_KEY_LEN 100 |
24 | 26 | ||
25 | static DEFINE_SPINLOCK(blkio_list_lock); | 27 | static DEFINE_MUTEX(blkcg_pol_mutex); |
26 | static LIST_HEAD(blkio_list); | ||
27 | 28 | ||
28 | struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT }; | 29 | struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT }; |
29 | EXPORT_SYMBOL_GPL(blkio_root_cgroup); | 30 | EXPORT_SYMBOL_GPL(blkcg_root); |
30 | 31 | ||
31 | /* for encoding cft->private value on file */ | 32 | static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; |
32 | #define BLKIOFILE_PRIVATE(x, val) (((x) << 16) | (val)) | ||
33 | /* What policy owns the file, proportional or throttle */ | ||
34 | #define BLKIOFILE_POLICY(val) (((val) >> 16) & 0xffff) | ||
35 | #define BLKIOFILE_ATTR(val) ((val) & 0xffff) | ||
36 | 33 | ||
37 | static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg, | 34 | struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) |
38 | struct blkio_policy_node *pn) | ||
39 | { | 35 | { |
40 | list_add(&pn->node, &blkcg->policy_list); | 36 | return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), |
37 | struct blkcg, css); | ||
41 | } | 38 | } |
39 | EXPORT_SYMBOL_GPL(cgroup_to_blkcg); | ||
42 | 40 | ||
43 | static inline bool cftype_blkg_same_policy(struct cftype *cft, | 41 | static struct blkcg *task_blkcg(struct task_struct *tsk) |
44 | struct blkio_group *blkg) | ||
45 | { | 42 | { |
46 | enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); | 43 | return container_of(task_subsys_state(tsk, blkio_subsys_id), |
47 | 44 | struct blkcg, css); | |
48 | if (blkg->plid == plid) | ||
49 | return 1; | ||
50 | |||
51 | return 0; | ||
52 | } | 45 | } |
53 | 46 | ||
54 | /* Determines if policy node matches cgroup file being accessed */ | 47 | struct blkcg *bio_blkcg(struct bio *bio) |
55 | static inline bool pn_matches_cftype(struct cftype *cft, | ||
56 | struct blkio_policy_node *pn) | ||
57 | { | 48 | { |
58 | enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); | 49 | if (bio && bio->bi_css) |
59 | int fileid = BLKIOFILE_ATTR(cft->private); | 50 | return container_of(bio->bi_css, struct blkcg, css); |
60 | 51 | return task_blkcg(current); | |
61 | return (plid == pn->plid && fileid == pn->fileid); | ||
62 | } | 52 | } |
53 | EXPORT_SYMBOL_GPL(bio_blkcg); | ||
63 | 54 | ||
64 | /* Must be called with blkcg->lock held */ | 55 | static bool blkcg_policy_enabled(struct request_queue *q, |
65 | static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) | 56 | const struct blkcg_policy *pol) |
66 | { | 57 | { |
67 | list_del(&pn->node); | 58 | return pol && test_bit(pol->plid, q->blkcg_pols); |
68 | } | 59 | } |
69 | 60 | ||
70 | /* Must be called with blkcg->lock held */ | 61 | /** |
71 | static struct blkio_policy_node * | 62 | * blkg_free - free a blkg |
72 | blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev, | 63 | * @blkg: blkg to free |
73 | enum blkio_policy_id plid, int fileid) | 64 | * |
65 | * Free @blkg which may be partially allocated. | ||
66 | */ | ||
67 | static void blkg_free(struct blkcg_gq *blkg) | ||
74 | { | 68 | { |
75 | struct blkio_policy_node *pn; | 69 | int i; |
76 | |||
77 | list_for_each_entry(pn, &blkcg->policy_list, node) { | ||
78 | if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid) | ||
79 | return pn; | ||
80 | } | ||
81 | 70 | ||
82 | return NULL; | 71 | if (!blkg) |
83 | } | 72 | return; |
84 | 73 | ||
85 | struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) | 74 | for (i = 0; i < BLKCG_MAX_POLS; i++) { |
86 | { | 75 | struct blkcg_policy *pol = blkcg_policy[i]; |
87 | return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), | 76 | struct blkg_policy_data *pd = blkg->pd[i]; |
88 | struct blkio_cgroup, css); | ||
89 | } | ||
90 | EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); | ||
91 | 77 | ||
92 | struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk) | 78 | if (!pd) |
93 | { | 79 | continue; |
94 | return container_of(task_subsys_state(tsk, blkio_subsys_id), | ||
95 | struct blkio_cgroup, css); | ||
96 | } | ||
97 | EXPORT_SYMBOL_GPL(task_blkio_cgroup); | ||
98 | 80 | ||
99 | static inline void | 81 | if (pol && pol->pd_exit_fn) |
100 | blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight) | 82 | pol->pd_exit_fn(blkg); |
101 | { | ||
102 | struct blkio_policy_type *blkiop; | ||
103 | 83 | ||
104 | list_for_each_entry(blkiop, &blkio_list, list) { | 84 | kfree(pd); |
105 | /* If this policy does not own the blkg, do not send updates */ | ||
106 | if (blkiop->plid != blkg->plid) | ||
107 | continue; | ||
108 | if (blkiop->ops.blkio_update_group_weight_fn) | ||
109 | blkiop->ops.blkio_update_group_weight_fn(blkg->key, | ||
110 | blkg, weight); | ||
111 | } | 85 | } |
86 | |||
87 | kfree(blkg); | ||
112 | } | 88 | } |
113 | 89 | ||
114 | static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps, | 90 | /** |
115 | int fileid) | 91 | * blkg_alloc - allocate a blkg |
92 | * @blkcg: block cgroup the new blkg is associated with | ||
93 | * @q: request_queue the new blkg is associated with | ||
94 | * | ||
95 | * Allocate a new blkg assocating @blkcg and @q. | ||
96 | */ | ||
97 | static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q) | ||
116 | { | 98 | { |
117 | struct blkio_policy_type *blkiop; | 99 | struct blkcg_gq *blkg; |
118 | 100 | int i; | |
119 | list_for_each_entry(blkiop, &blkio_list, list) { | ||
120 | |||
121 | /* If this policy does not own the blkg, do not send updates */ | ||
122 | if (blkiop->plid != blkg->plid) | ||
123 | continue; | ||
124 | |||
125 | if (fileid == BLKIO_THROTL_read_bps_device | ||
126 | && blkiop->ops.blkio_update_group_read_bps_fn) | ||
127 | blkiop->ops.blkio_update_group_read_bps_fn(blkg->key, | ||
128 | blkg, bps); | ||
129 | 101 | ||
130 | if (fileid == BLKIO_THROTL_write_bps_device | 102 | /* alloc and init base part */ |
131 | && blkiop->ops.blkio_update_group_write_bps_fn) | 103 | blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node); |
132 | blkiop->ops.blkio_update_group_write_bps_fn(blkg->key, | 104 | if (!blkg) |
133 | blkg, bps); | 105 | return NULL; |
134 | } | ||
135 | } | ||
136 | 106 | ||
137 | static inline void blkio_update_group_iops(struct blkio_group *blkg, | 107 | blkg->q = q; |
138 | unsigned int iops, int fileid) | 108 | INIT_LIST_HEAD(&blkg->q_node); |
139 | { | 109 | blkg->blkcg = blkcg; |
140 | struct blkio_policy_type *blkiop; | 110 | blkg->refcnt = 1; |
141 | 111 | ||
142 | list_for_each_entry(blkiop, &blkio_list, list) { | 112 | for (i = 0; i < BLKCG_MAX_POLS; i++) { |
113 | struct blkcg_policy *pol = blkcg_policy[i]; | ||
114 | struct blkg_policy_data *pd; | ||
143 | 115 | ||
144 | /* If this policy does not own the blkg, do not send updates */ | 116 | if (!blkcg_policy_enabled(q, pol)) |
145 | if (blkiop->plid != blkg->plid) | ||
146 | continue; | 117 | continue; |
147 | 118 | ||
148 | if (fileid == BLKIO_THROTL_read_iops_device | 119 | /* alloc per-policy data and attach it to blkg */ |
149 | && blkiop->ops.blkio_update_group_read_iops_fn) | 120 | pd = kzalloc_node(pol->pd_size, GFP_ATOMIC, q->node); |
150 | blkiop->ops.blkio_update_group_read_iops_fn(blkg->key, | 121 | if (!pd) { |
151 | blkg, iops); | 122 | blkg_free(blkg); |
123 | return NULL; | ||
124 | } | ||
152 | 125 | ||
153 | if (fileid == BLKIO_THROTL_write_iops_device | 126 | blkg->pd[i] = pd; |
154 | && blkiop->ops.blkio_update_group_write_iops_fn) | 127 | pd->blkg = blkg; |
155 | blkiop->ops.blkio_update_group_write_iops_fn(blkg->key, | ||
156 | blkg,iops); | ||
157 | } | 128 | } |
158 | } | ||
159 | 129 | ||
160 | /* | 130 | /* invoke per-policy init */ |
161 | * Add to the appropriate stat variable depending on the request type. | 131 | for (i = 0; i < BLKCG_MAX_POLS; i++) { |
162 | * This should be called with the blkg->stats_lock held. | 132 | struct blkcg_policy *pol = blkcg_policy[i]; |
163 | */ | ||
164 | static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction, | ||
165 | bool sync) | ||
166 | { | ||
167 | if (direction) | ||
168 | stat[BLKIO_STAT_WRITE] += add; | ||
169 | else | ||
170 | stat[BLKIO_STAT_READ] += add; | ||
171 | if (sync) | ||
172 | stat[BLKIO_STAT_SYNC] += add; | ||
173 | else | ||
174 | stat[BLKIO_STAT_ASYNC] += add; | ||
175 | } | ||
176 | 133 | ||
177 | /* | 134 | if (blkcg_policy_enabled(blkg->q, pol)) |
178 | * Decrements the appropriate stat variable if non-zero depending on the | 135 | pol->pd_init_fn(blkg); |
179 | * request type. Panics on value being zero. | ||
180 | * This should be called with the blkg->stats_lock held. | ||
181 | */ | ||
182 | static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync) | ||
183 | { | ||
184 | if (direction) { | ||
185 | BUG_ON(stat[BLKIO_STAT_WRITE] == 0); | ||
186 | stat[BLKIO_STAT_WRITE]--; | ||
187 | } else { | ||
188 | BUG_ON(stat[BLKIO_STAT_READ] == 0); | ||
189 | stat[BLKIO_STAT_READ]--; | ||
190 | } | ||
191 | if (sync) { | ||
192 | BUG_ON(stat[BLKIO_STAT_SYNC] == 0); | ||
193 | stat[BLKIO_STAT_SYNC]--; | ||
194 | } else { | ||
195 | BUG_ON(stat[BLKIO_STAT_ASYNC] == 0); | ||
196 | stat[BLKIO_STAT_ASYNC]--; | ||
197 | } | 136 | } |
198 | } | ||
199 | 137 | ||
200 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 138 | return blkg; |
201 | /* This should be called with the blkg->stats_lock held. */ | ||
202 | static void blkio_set_start_group_wait_time(struct blkio_group *blkg, | ||
203 | struct blkio_group *curr_blkg) | ||
204 | { | ||
205 | if (blkio_blkg_waiting(&blkg->stats)) | ||
206 | return; | ||
207 | if (blkg == curr_blkg) | ||
208 | return; | ||
209 | blkg->stats.start_group_wait_time = sched_clock(); | ||
210 | blkio_mark_blkg_waiting(&blkg->stats); | ||
211 | } | 139 | } |
212 | 140 | ||
213 | /* This should be called with the blkg->stats_lock held. */ | 141 | static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, |
214 | static void blkio_update_group_wait_time(struct blkio_group_stats *stats) | 142 | struct request_queue *q) |
215 | { | 143 | { |
216 | unsigned long long now; | 144 | struct blkcg_gq *blkg; |
217 | 145 | ||
218 | if (!blkio_blkg_waiting(stats)) | 146 | blkg = rcu_dereference(blkcg->blkg_hint); |
219 | return; | 147 | if (blkg && blkg->q == q) |
148 | return blkg; | ||
220 | 149 | ||
221 | now = sched_clock(); | 150 | /* |
222 | if (time_after64(now, stats->start_group_wait_time)) | 151 | * Hint didn't match. Look up from the radix tree. Note that we |
223 | stats->group_wait_time += now - stats->start_group_wait_time; | 152 | * may not be holding queue_lock and thus are not sure whether |
224 | blkio_clear_blkg_waiting(stats); | 153 | * @blkg from blkg_tree has already been removed or not, so we |
154 | * can't update hint to the lookup result. Leave it to the caller. | ||
155 | */ | ||
156 | blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); | ||
157 | if (blkg && blkg->q == q) | ||
158 | return blkg; | ||
159 | |||
160 | return NULL; | ||
225 | } | 161 | } |
226 | 162 | ||
227 | /* This should be called with the blkg->stats_lock held. */ | 163 | /** |
228 | static void blkio_end_empty_time(struct blkio_group_stats *stats) | 164 | * blkg_lookup - lookup blkg for the specified blkcg - q pair |
165 | * @blkcg: blkcg of interest | ||
166 | * @q: request_queue of interest | ||
167 | * | ||
168 | * Lookup blkg for the @blkcg - @q pair. This function should be called | ||
169 | * under RCU read lock and is guaranteed to return %NULL if @q is bypassing | ||
170 | * - see blk_queue_bypass_start() for details. | ||
171 | */ | ||
172 | struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) | ||
229 | { | 173 | { |
230 | unsigned long long now; | 174 | WARN_ON_ONCE(!rcu_read_lock_held()); |
231 | |||
232 | if (!blkio_blkg_empty(stats)) | ||
233 | return; | ||
234 | 175 | ||
235 | now = sched_clock(); | 176 | if (unlikely(blk_queue_bypass(q))) |
236 | if (time_after64(now, stats->start_empty_time)) | 177 | return NULL; |
237 | stats->empty_time += now - stats->start_empty_time; | 178 | return __blkg_lookup(blkcg, q); |
238 | blkio_clear_blkg_empty(stats); | ||
239 | } | 179 | } |
180 | EXPORT_SYMBOL_GPL(blkg_lookup); | ||
240 | 181 | ||
241 | void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) | 182 | static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, |
183 | struct request_queue *q) | ||
184 | __releases(q->queue_lock) __acquires(q->queue_lock) | ||
242 | { | 185 | { |
243 | unsigned long flags; | 186 | struct blkcg_gq *blkg; |
187 | int ret; | ||
244 | 188 | ||
245 | spin_lock_irqsave(&blkg->stats_lock, flags); | 189 | WARN_ON_ONCE(!rcu_read_lock_held()); |
246 | BUG_ON(blkio_blkg_idling(&blkg->stats)); | 190 | lockdep_assert_held(q->queue_lock); |
247 | blkg->stats.start_idle_time = sched_clock(); | ||
248 | blkio_mark_blkg_idling(&blkg->stats); | ||
249 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
250 | } | ||
251 | EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats); | ||
252 | 191 | ||
253 | void blkiocg_update_idle_time_stats(struct blkio_group *blkg) | 192 | /* lookup and update hint on success, see __blkg_lookup() for details */ |
254 | { | 193 | blkg = __blkg_lookup(blkcg, q); |
255 | unsigned long flags; | 194 | if (blkg) { |
256 | unsigned long long now; | 195 | rcu_assign_pointer(blkcg->blkg_hint, blkg); |
257 | struct blkio_group_stats *stats; | 196 | return blkg; |
258 | |||
259 | spin_lock_irqsave(&blkg->stats_lock, flags); | ||
260 | stats = &blkg->stats; | ||
261 | if (blkio_blkg_idling(stats)) { | ||
262 | now = sched_clock(); | ||
263 | if (time_after64(now, stats->start_idle_time)) | ||
264 | stats->idle_time += now - stats->start_idle_time; | ||
265 | blkio_clear_blkg_idling(stats); | ||
266 | } | 197 | } |
267 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
268 | } | ||
269 | EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats); | ||
270 | 198 | ||
271 | void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) | 199 | /* blkg holds a reference to blkcg */ |
272 | { | 200 | if (!css_tryget(&blkcg->css)) |
273 | unsigned long flags; | 201 | return ERR_PTR(-EINVAL); |
274 | struct blkio_group_stats *stats; | ||
275 | |||
276 | spin_lock_irqsave(&blkg->stats_lock, flags); | ||
277 | stats = &blkg->stats; | ||
278 | stats->avg_queue_size_sum += | ||
279 | stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] + | ||
280 | stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]; | ||
281 | stats->avg_queue_size_samples++; | ||
282 | blkio_update_group_wait_time(stats); | ||
283 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
284 | } | ||
285 | EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats); | ||
286 | 202 | ||
287 | void blkiocg_set_start_empty_time(struct blkio_group *blkg) | 203 | /* allocate */ |
288 | { | 204 | ret = -ENOMEM; |
289 | unsigned long flags; | 205 | blkg = blkg_alloc(blkcg, q); |
290 | struct blkio_group_stats *stats; | 206 | if (unlikely(!blkg)) |
207 | goto err_put; | ||
291 | 208 | ||
292 | spin_lock_irqsave(&blkg->stats_lock, flags); | 209 | /* insert */ |
293 | stats = &blkg->stats; | 210 | ret = radix_tree_preload(GFP_ATOMIC); |
211 | if (ret) | ||
212 | goto err_free; | ||
294 | 213 | ||
295 | if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] || | 214 | spin_lock(&blkcg->lock); |
296 | stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) { | 215 | ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); |
297 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | 216 | if (likely(!ret)) { |
298 | return; | 217 | hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); |
218 | list_add(&blkg->q_node, &q->blkg_list); | ||
299 | } | 219 | } |
220 | spin_unlock(&blkcg->lock); | ||
300 | 221 | ||
301 | /* | 222 | radix_tree_preload_end(); |
302 | * group is already marked empty. This can happen if cfqq got new | ||
303 | * request in parent group and moved to this group while being added | ||
304 | * to service tree. Just ignore the event and move on. | ||
305 | */ | ||
306 | if(blkio_blkg_empty(stats)) { | ||
307 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
308 | return; | ||
309 | } | ||
310 | 223 | ||
311 | stats->start_empty_time = sched_clock(); | 224 | if (!ret) |
312 | blkio_mark_blkg_empty(stats); | 225 | return blkg; |
313 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | 226 | err_free: |
227 | blkg_free(blkg); | ||
228 | err_put: | ||
229 | css_put(&blkcg->css); | ||
230 | return ERR_PTR(ret); | ||
314 | } | 231 | } |
315 | EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time); | ||
316 | 232 | ||
317 | void blkiocg_update_dequeue_stats(struct blkio_group *blkg, | 233 | struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, |
318 | unsigned long dequeue) | 234 | struct request_queue *q) |
319 | { | 235 | { |
320 | blkg->stats.dequeue += dequeue; | 236 | /* |
321 | } | 237 | * This could be the first entry point of blkcg implementation and |
322 | EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats); | 238 | * we shouldn't allow anything to go through for a bypassing queue. |
323 | #else | 239 | */ |
324 | static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg, | 240 | if (unlikely(blk_queue_bypass(q))) |
325 | struct blkio_group *curr_blkg) {} | 241 | return ERR_PTR(blk_queue_dead(q) ? -EINVAL : -EBUSY); |
326 | static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {} | 242 | return __blkg_lookup_create(blkcg, q); |
327 | #endif | ||
328 | |||
329 | void blkiocg_update_io_add_stats(struct blkio_group *blkg, | ||
330 | struct blkio_group *curr_blkg, bool direction, | ||
331 | bool sync) | ||
332 | { | ||
333 | unsigned long flags; | ||
334 | |||
335 | spin_lock_irqsave(&blkg->stats_lock, flags); | ||
336 | blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction, | ||
337 | sync); | ||
338 | blkio_end_empty_time(&blkg->stats); | ||
339 | blkio_set_start_group_wait_time(blkg, curr_blkg); | ||
340 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
341 | } | 243 | } |
342 | EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats); | 244 | EXPORT_SYMBOL_GPL(blkg_lookup_create); |
343 | 245 | ||
344 | void blkiocg_update_io_remove_stats(struct blkio_group *blkg, | 246 | static void blkg_destroy(struct blkcg_gq *blkg) |
345 | bool direction, bool sync) | ||
346 | { | 247 | { |
347 | unsigned long flags; | 248 | struct request_queue *q = blkg->q; |
249 | struct blkcg *blkcg = blkg->blkcg; | ||
348 | 250 | ||
349 | spin_lock_irqsave(&blkg->stats_lock, flags); | 251 | lockdep_assert_held(q->queue_lock); |
350 | blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], | 252 | lockdep_assert_held(&blkcg->lock); |
351 | direction, sync); | ||
352 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
353 | } | ||
354 | EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats); | ||
355 | 253 | ||
356 | void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time, | 254 | /* Something wrong if we are trying to remove same group twice */ |
357 | unsigned long unaccounted_time) | 255 | WARN_ON_ONCE(list_empty(&blkg->q_node)); |
358 | { | 256 | WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node)); |
359 | unsigned long flags; | ||
360 | |||
361 | spin_lock_irqsave(&blkg->stats_lock, flags); | ||
362 | blkg->stats.time += time; | ||
363 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
364 | blkg->stats.unaccounted_time += unaccounted_time; | ||
365 | #endif | ||
366 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
367 | } | ||
368 | EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); | ||
369 | 257 | ||
370 | /* | 258 | radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); |
371 | * should be called under rcu read lock or queue lock to make sure blkg pointer | 259 | list_del_init(&blkg->q_node); |
372 | * is valid. | 260 | hlist_del_init_rcu(&blkg->blkcg_node); |
373 | */ | ||
374 | void blkiocg_update_dispatch_stats(struct blkio_group *blkg, | ||
375 | uint64_t bytes, bool direction, bool sync) | ||
376 | { | ||
377 | struct blkio_group_stats_cpu *stats_cpu; | ||
378 | unsigned long flags; | ||
379 | 261 | ||
380 | /* | 262 | /* |
381 | * Disabling interrupts to provide mutual exclusion between two | 263 | * Both setting lookup hint to and clearing it from @blkg are done |
382 | * writes on same cpu. It probably is not needed for 64bit. Not | 264 | * under queue_lock. If it's not pointing to @blkg now, it never |
383 | * optimizing that case yet. | 265 | * will. Hint assignment itself can race safely. |
384 | */ | 266 | */ |
385 | local_irq_save(flags); | 267 | if (rcu_dereference_raw(blkcg->blkg_hint) == blkg) |
386 | 268 | rcu_assign_pointer(blkcg->blkg_hint, NULL); | |
387 | stats_cpu = this_cpu_ptr(blkg->stats_cpu); | ||
388 | |||
389 | u64_stats_update_begin(&stats_cpu->syncp); | ||
390 | stats_cpu->sectors += bytes >> 9; | ||
391 | blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED], | ||
392 | 1, direction, sync); | ||
393 | blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES], | ||
394 | bytes, direction, sync); | ||
395 | u64_stats_update_end(&stats_cpu->syncp); | ||
396 | local_irq_restore(flags); | ||
397 | } | ||
398 | EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats); | ||
399 | |||
400 | void blkiocg_update_completion_stats(struct blkio_group *blkg, | ||
401 | uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) | ||
402 | { | ||
403 | struct blkio_group_stats *stats; | ||
404 | unsigned long flags; | ||
405 | unsigned long long now = sched_clock(); | ||
406 | |||
407 | spin_lock_irqsave(&blkg->stats_lock, flags); | ||
408 | stats = &blkg->stats; | ||
409 | if (time_after64(now, io_start_time)) | ||
410 | blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME], | ||
411 | now - io_start_time, direction, sync); | ||
412 | if (time_after64(io_start_time, start_time)) | ||
413 | blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME], | ||
414 | io_start_time - start_time, direction, sync); | ||
415 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
416 | } | ||
417 | EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats); | ||
418 | |||
419 | /* Merged stats are per cpu. */ | ||
420 | void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, | ||
421 | bool sync) | ||
422 | { | ||
423 | struct blkio_group_stats_cpu *stats_cpu; | ||
424 | unsigned long flags; | ||
425 | 269 | ||
426 | /* | 270 | /* |
427 | * Disabling interrupts to provide mutual exclusion between two | 271 | * Put the reference taken at the time of creation so that when all |
428 | * writes on same cpu. It probably is not needed for 64bit. Not | 272 | * queues are gone, group can be destroyed. |
429 | * optimizing that case yet. | ||
430 | */ | 273 | */ |
431 | local_irq_save(flags); | 274 | blkg_put(blkg); |
432 | |||
433 | stats_cpu = this_cpu_ptr(blkg->stats_cpu); | ||
434 | |||
435 | u64_stats_update_begin(&stats_cpu->syncp); | ||
436 | blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1, | ||
437 | direction, sync); | ||
438 | u64_stats_update_end(&stats_cpu->syncp); | ||
439 | local_irq_restore(flags); | ||
440 | } | 275 | } |
441 | EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); | ||
442 | 276 | ||
443 | /* | 277 | /** |
444 | * This function allocates the per cpu stats for blkio_group. Should be called | 278 | * blkg_destroy_all - destroy all blkgs associated with a request_queue |
445 | * from sleepable context as alloc_per_cpu() requires that. | 279 | * @q: request_queue of interest |
280 | * | ||
281 | * Destroy all blkgs associated with @q. | ||
446 | */ | 282 | */ |
447 | int blkio_alloc_blkg_stats(struct blkio_group *blkg) | 283 | static void blkg_destroy_all(struct request_queue *q) |
448 | { | 284 | { |
449 | /* Allocate memory for per cpu stats */ | 285 | struct blkcg_gq *blkg, *n; |
450 | blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu); | ||
451 | if (!blkg->stats_cpu) | ||
452 | return -ENOMEM; | ||
453 | return 0; | ||
454 | } | ||
455 | EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats); | ||
456 | 286 | ||
457 | void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, | 287 | lockdep_assert_held(q->queue_lock); |
458 | struct blkio_group *blkg, void *key, dev_t dev, | ||
459 | enum blkio_policy_id plid) | ||
460 | { | ||
461 | unsigned long flags; | ||
462 | |||
463 | spin_lock_irqsave(&blkcg->lock, flags); | ||
464 | spin_lock_init(&blkg->stats_lock); | ||
465 | rcu_assign_pointer(blkg->key, key); | ||
466 | blkg->blkcg_id = css_id(&blkcg->css); | ||
467 | hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); | ||
468 | blkg->plid = plid; | ||
469 | spin_unlock_irqrestore(&blkcg->lock, flags); | ||
470 | /* Need to take css reference ? */ | ||
471 | cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); | ||
472 | blkg->dev = dev; | ||
473 | } | ||
474 | EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group); | ||
475 | 288 | ||
476 | static void __blkiocg_del_blkio_group(struct blkio_group *blkg) | 289 | list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { |
477 | { | 290 | struct blkcg *blkcg = blkg->blkcg; |
478 | hlist_del_init_rcu(&blkg->blkcg_node); | ||
479 | blkg->blkcg_id = 0; | ||
480 | } | ||
481 | 291 | ||
482 | /* | 292 | spin_lock(&blkcg->lock); |
483 | * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1 | 293 | blkg_destroy(blkg); |
484 | * indicating that blk_group was unhashed by the time we got to it. | 294 | spin_unlock(&blkcg->lock); |
485 | */ | ||
486 | int blkiocg_del_blkio_group(struct blkio_group *blkg) | ||
487 | { | ||
488 | struct blkio_cgroup *blkcg; | ||
489 | unsigned long flags; | ||
490 | struct cgroup_subsys_state *css; | ||
491 | int ret = 1; | ||
492 | |||
493 | rcu_read_lock(); | ||
494 | css = css_lookup(&blkio_subsys, blkg->blkcg_id); | ||
495 | if (css) { | ||
496 | blkcg = container_of(css, struct blkio_cgroup, css); | ||
497 | spin_lock_irqsave(&blkcg->lock, flags); | ||
498 | if (!hlist_unhashed(&blkg->blkcg_node)) { | ||
499 | __blkiocg_del_blkio_group(blkg); | ||
500 | ret = 0; | ||
501 | } | ||
502 | spin_unlock_irqrestore(&blkcg->lock, flags); | ||
503 | } | 295 | } |
504 | |||
505 | rcu_read_unlock(); | ||
506 | return ret; | ||
507 | } | 296 | } |
508 | EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group); | ||
509 | 297 | ||
510 | /* called under rcu_read_lock(). */ | 298 | static void blkg_rcu_free(struct rcu_head *rcu_head) |
511 | struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) | ||
512 | { | 299 | { |
513 | struct blkio_group *blkg; | 300 | blkg_free(container_of(rcu_head, struct blkcg_gq, rcu_head)); |
514 | struct hlist_node *n; | ||
515 | void *__key; | ||
516 | |||
517 | hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { | ||
518 | __key = blkg->key; | ||
519 | if (__key == key) | ||
520 | return blkg; | ||
521 | } | ||
522 | |||
523 | return NULL; | ||
524 | } | 301 | } |
525 | EXPORT_SYMBOL_GPL(blkiocg_lookup_group); | ||
526 | 302 | ||
527 | static void blkio_reset_stats_cpu(struct blkio_group *blkg) | 303 | void __blkg_release(struct blkcg_gq *blkg) |
528 | { | 304 | { |
529 | struct blkio_group_stats_cpu *stats_cpu; | 305 | /* release the extra blkcg reference this blkg has been holding */ |
530 | int i, j, k; | 306 | css_put(&blkg->blkcg->css); |
307 | |||
531 | /* | 308 | /* |
532 | * Note: On 64 bit arch this should not be an issue. This has the | 309 | * A group is freed in rcu manner. But having an rcu lock does not |
533 | * possibility of returning some inconsistent value on 32bit arch | 310 | * mean that one can access all the fields of blkg and assume these |
534 | * as 64bit update on 32bit is non atomic. Taking care of this | 311 | * are valid. For example, don't try to follow throtl_data and |
535 | * corner case makes code very complicated, like sending IPIs to | 312 | * request queue links. |
536 | * cpus, taking care of stats of offline cpus etc. | ||
537 | * | 313 | * |
538 | * reset stats is anyway more of a debug feature and this sounds a | 314 | * Having a reference to blkg under an rcu allows acess to only |
539 | * corner case. So I am not complicating the code yet until and | 315 | * values local to groups like group stats and group rate limits |
540 | * unless this becomes a real issue. | ||
541 | */ | 316 | */ |
542 | for_each_possible_cpu(i) { | 317 | call_rcu(&blkg->rcu_head, blkg_rcu_free); |
543 | stats_cpu = per_cpu_ptr(blkg->stats_cpu, i); | ||
544 | stats_cpu->sectors = 0; | ||
545 | for(j = 0; j < BLKIO_STAT_CPU_NR; j++) | ||
546 | for (k = 0; k < BLKIO_STAT_TOTAL; k++) | ||
547 | stats_cpu->stat_arr_cpu[j][k] = 0; | ||
548 | } | ||
549 | } | 318 | } |
319 | EXPORT_SYMBOL_GPL(__blkg_release); | ||
550 | 320 | ||
551 | static int | 321 | static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, |
552 | blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) | 322 | u64 val) |
553 | { | 323 | { |
554 | struct blkio_cgroup *blkcg; | 324 | struct blkcg *blkcg = cgroup_to_blkcg(cgroup); |
555 | struct blkio_group *blkg; | 325 | struct blkcg_gq *blkg; |
556 | struct blkio_group_stats *stats; | ||
557 | struct hlist_node *n; | 326 | struct hlist_node *n; |
558 | uint64_t queued[BLKIO_STAT_TOTAL]; | ||
559 | int i; | 327 | int i; |
560 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
561 | bool idling, waiting, empty; | ||
562 | unsigned long long now = sched_clock(); | ||
563 | #endif | ||
564 | 328 | ||
565 | blkcg = cgroup_to_blkio_cgroup(cgroup); | 329 | mutex_lock(&blkcg_pol_mutex); |
566 | spin_lock_irq(&blkcg->lock); | 330 | spin_lock_irq(&blkcg->lock); |
567 | hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { | ||
568 | spin_lock(&blkg->stats_lock); | ||
569 | stats = &blkg->stats; | ||
570 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
571 | idling = blkio_blkg_idling(stats); | ||
572 | waiting = blkio_blkg_waiting(stats); | ||
573 | empty = blkio_blkg_empty(stats); | ||
574 | #endif | ||
575 | for (i = 0; i < BLKIO_STAT_TOTAL; i++) | ||
576 | queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i]; | ||
577 | memset(stats, 0, sizeof(struct blkio_group_stats)); | ||
578 | for (i = 0; i < BLKIO_STAT_TOTAL; i++) | ||
579 | stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i]; | ||
580 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
581 | if (idling) { | ||
582 | blkio_mark_blkg_idling(stats); | ||
583 | stats->start_idle_time = now; | ||
584 | } | ||
585 | if (waiting) { | ||
586 | blkio_mark_blkg_waiting(stats); | ||
587 | stats->start_group_wait_time = now; | ||
588 | } | ||
589 | if (empty) { | ||
590 | blkio_mark_blkg_empty(stats); | ||
591 | stats->start_empty_time = now; | ||
592 | } | ||
593 | #endif | ||
594 | spin_unlock(&blkg->stats_lock); | ||
595 | |||
596 | /* Reset Per cpu stats which don't take blkg->stats_lock */ | ||
597 | blkio_reset_stats_cpu(blkg); | ||
598 | } | ||
599 | |||
600 | spin_unlock_irq(&blkcg->lock); | ||
601 | return 0; | ||
602 | } | ||
603 | |||
604 | static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str, | ||
605 | int chars_left, bool diskname_only) | ||
606 | { | ||
607 | snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev)); | ||
608 | chars_left -= strlen(str); | ||
609 | if (chars_left <= 0) { | ||
610 | printk(KERN_WARNING | ||
611 | "Possibly incorrect cgroup stat display format"); | ||
612 | return; | ||
613 | } | ||
614 | if (diskname_only) | ||
615 | return; | ||
616 | switch (type) { | ||
617 | case BLKIO_STAT_READ: | ||
618 | strlcat(str, " Read", chars_left); | ||
619 | break; | ||
620 | case BLKIO_STAT_WRITE: | ||
621 | strlcat(str, " Write", chars_left); | ||
622 | break; | ||
623 | case BLKIO_STAT_SYNC: | ||
624 | strlcat(str, " Sync", chars_left); | ||
625 | break; | ||
626 | case BLKIO_STAT_ASYNC: | ||
627 | strlcat(str, " Async", chars_left); | ||
628 | break; | ||
629 | case BLKIO_STAT_TOTAL: | ||
630 | strlcat(str, " Total", chars_left); | ||
631 | break; | ||
632 | default: | ||
633 | strlcat(str, " Invalid", chars_left); | ||
634 | } | ||
635 | } | ||
636 | |||
637 | static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val, | ||
638 | struct cgroup_map_cb *cb, dev_t dev) | ||
639 | { | ||
640 | blkio_get_key_name(0, dev, str, chars_left, true); | ||
641 | cb->fill(cb, str, val); | ||
642 | return val; | ||
643 | } | ||
644 | |||
645 | |||
646 | static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg, | ||
647 | enum stat_type_cpu type, enum stat_sub_type sub_type) | ||
648 | { | ||
649 | int cpu; | ||
650 | struct blkio_group_stats_cpu *stats_cpu; | ||
651 | u64 val = 0, tval; | ||
652 | |||
653 | for_each_possible_cpu(cpu) { | ||
654 | unsigned int start; | ||
655 | stats_cpu = per_cpu_ptr(blkg->stats_cpu, cpu); | ||
656 | |||
657 | do { | ||
658 | start = u64_stats_fetch_begin(&stats_cpu->syncp); | ||
659 | if (type == BLKIO_STAT_CPU_SECTORS) | ||
660 | tval = stats_cpu->sectors; | ||
661 | else | ||
662 | tval = stats_cpu->stat_arr_cpu[type][sub_type]; | ||
663 | } while(u64_stats_fetch_retry(&stats_cpu->syncp, start)); | ||
664 | |||
665 | val += tval; | ||
666 | } | ||
667 | |||
668 | return val; | ||
669 | } | ||
670 | |||
671 | static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg, | ||
672 | struct cgroup_map_cb *cb, dev_t dev, enum stat_type_cpu type) | ||
673 | { | ||
674 | uint64_t disk_total, val; | ||
675 | char key_str[MAX_KEY_LEN]; | ||
676 | enum stat_sub_type sub_type; | ||
677 | 331 | ||
678 | if (type == BLKIO_STAT_CPU_SECTORS) { | 332 | /* |
679 | val = blkio_read_stat_cpu(blkg, type, 0); | 333 | * Note that stat reset is racy - it doesn't synchronize against |
680 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb, dev); | 334 | * stat updates. This is a debug feature which shouldn't exist |
681 | } | 335 | * anyway. If you get hit by a race, retry. |
682 | 336 | */ | |
683 | for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL; | 337 | hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { |
684 | sub_type++) { | 338 | for (i = 0; i < BLKCG_MAX_POLS; i++) { |
685 | blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false); | 339 | struct blkcg_policy *pol = blkcg_policy[i]; |
686 | val = blkio_read_stat_cpu(blkg, type, sub_type); | ||
687 | cb->fill(cb, key_str, val); | ||
688 | } | ||
689 | |||
690 | disk_total = blkio_read_stat_cpu(blkg, type, BLKIO_STAT_READ) + | ||
691 | blkio_read_stat_cpu(blkg, type, BLKIO_STAT_WRITE); | ||
692 | |||
693 | blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false); | ||
694 | cb->fill(cb, key_str, disk_total); | ||
695 | return disk_total; | ||
696 | } | ||
697 | |||
698 | /* This should be called with blkg->stats_lock held */ | ||
699 | static uint64_t blkio_get_stat(struct blkio_group *blkg, | ||
700 | struct cgroup_map_cb *cb, dev_t dev, enum stat_type type) | ||
701 | { | ||
702 | uint64_t disk_total; | ||
703 | char key_str[MAX_KEY_LEN]; | ||
704 | enum stat_sub_type sub_type; | ||
705 | |||
706 | if (type == BLKIO_STAT_TIME) | ||
707 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, | ||
708 | blkg->stats.time, cb, dev); | ||
709 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
710 | if (type == BLKIO_STAT_UNACCOUNTED_TIME) | ||
711 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, | ||
712 | blkg->stats.unaccounted_time, cb, dev); | ||
713 | if (type == BLKIO_STAT_AVG_QUEUE_SIZE) { | ||
714 | uint64_t sum = blkg->stats.avg_queue_size_sum; | ||
715 | uint64_t samples = blkg->stats.avg_queue_size_samples; | ||
716 | if (samples) | ||
717 | do_div(sum, samples); | ||
718 | else | ||
719 | sum = 0; | ||
720 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev); | ||
721 | } | ||
722 | if (type == BLKIO_STAT_GROUP_WAIT_TIME) | ||
723 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, | ||
724 | blkg->stats.group_wait_time, cb, dev); | ||
725 | if (type == BLKIO_STAT_IDLE_TIME) | ||
726 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, | ||
727 | blkg->stats.idle_time, cb, dev); | ||
728 | if (type == BLKIO_STAT_EMPTY_TIME) | ||
729 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, | ||
730 | blkg->stats.empty_time, cb, dev); | ||
731 | if (type == BLKIO_STAT_DEQUEUE) | ||
732 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, | ||
733 | blkg->stats.dequeue, cb, dev); | ||
734 | #endif | ||
735 | |||
736 | for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL; | ||
737 | sub_type++) { | ||
738 | blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false); | ||
739 | cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]); | ||
740 | } | ||
741 | disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] + | ||
742 | blkg->stats.stat_arr[type][BLKIO_STAT_WRITE]; | ||
743 | blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false); | ||
744 | cb->fill(cb, key_str, disk_total); | ||
745 | return disk_total; | ||
746 | } | ||
747 | |||
748 | static int blkio_policy_parse_and_set(char *buf, | ||
749 | struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid) | ||
750 | { | ||
751 | struct gendisk *disk = NULL; | ||
752 | char *s[4], *p, *major_s = NULL, *minor_s = NULL; | ||
753 | unsigned long major, minor; | ||
754 | int i = 0, ret = -EINVAL; | ||
755 | int part; | ||
756 | dev_t dev; | ||
757 | u64 temp; | ||
758 | |||
759 | memset(s, 0, sizeof(s)); | ||
760 | |||
761 | while ((p = strsep(&buf, " ")) != NULL) { | ||
762 | if (!*p) | ||
763 | continue; | ||
764 | |||
765 | s[i++] = p; | ||
766 | |||
767 | /* Prevent from inputing too many things */ | ||
768 | if (i == 3) | ||
769 | break; | ||
770 | } | ||
771 | |||
772 | if (i != 2) | ||
773 | goto out; | ||
774 | |||
775 | p = strsep(&s[0], ":"); | ||
776 | if (p != NULL) | ||
777 | major_s = p; | ||
778 | else | ||
779 | goto out; | ||
780 | |||
781 | minor_s = s[0]; | ||
782 | if (!minor_s) | ||
783 | goto out; | ||
784 | |||
785 | if (strict_strtoul(major_s, 10, &major)) | ||
786 | goto out; | ||
787 | |||
788 | if (strict_strtoul(minor_s, 10, &minor)) | ||
789 | goto out; | ||
790 | |||
791 | dev = MKDEV(major, minor); | ||
792 | |||
793 | if (strict_strtoull(s[1], 10, &temp)) | ||
794 | goto out; | ||
795 | |||
796 | /* For rule removal, do not check for device presence. */ | ||
797 | if (temp) { | ||
798 | disk = get_gendisk(dev, &part); | ||
799 | if (!disk || part) { | ||
800 | ret = -ENODEV; | ||
801 | goto out; | ||
802 | } | ||
803 | } | ||
804 | |||
805 | newpn->dev = dev; | ||
806 | |||
807 | switch (plid) { | ||
808 | case BLKIO_POLICY_PROP: | ||
809 | if ((temp < BLKIO_WEIGHT_MIN && temp > 0) || | ||
810 | temp > BLKIO_WEIGHT_MAX) | ||
811 | goto out; | ||
812 | |||
813 | newpn->plid = plid; | ||
814 | newpn->fileid = fileid; | ||
815 | newpn->val.weight = temp; | ||
816 | break; | ||
817 | case BLKIO_POLICY_THROTL: | ||
818 | switch(fileid) { | ||
819 | case BLKIO_THROTL_read_bps_device: | ||
820 | case BLKIO_THROTL_write_bps_device: | ||
821 | newpn->plid = plid; | ||
822 | newpn->fileid = fileid; | ||
823 | newpn->val.bps = temp; | ||
824 | break; | ||
825 | case BLKIO_THROTL_read_iops_device: | ||
826 | case BLKIO_THROTL_write_iops_device: | ||
827 | if (temp > THROTL_IOPS_MAX) | ||
828 | goto out; | ||
829 | |||
830 | newpn->plid = plid; | ||
831 | newpn->fileid = fileid; | ||
832 | newpn->val.iops = (unsigned int)temp; | ||
833 | break; | ||
834 | } | ||
835 | break; | ||
836 | default: | ||
837 | BUG(); | ||
838 | } | ||
839 | ret = 0; | ||
840 | out: | ||
841 | put_disk(disk); | ||
842 | return ret; | ||
843 | } | ||
844 | |||
845 | unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, | ||
846 | dev_t dev) | ||
847 | { | ||
848 | struct blkio_policy_node *pn; | ||
849 | unsigned long flags; | ||
850 | unsigned int weight; | ||
851 | |||
852 | spin_lock_irqsave(&blkcg->lock, flags); | ||
853 | |||
854 | pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP, | ||
855 | BLKIO_PROP_weight_device); | ||
856 | if (pn) | ||
857 | weight = pn->val.weight; | ||
858 | else | ||
859 | weight = blkcg->weight; | ||
860 | |||
861 | spin_unlock_irqrestore(&blkcg->lock, flags); | ||
862 | |||
863 | return weight; | ||
864 | } | ||
865 | EXPORT_SYMBOL_GPL(blkcg_get_weight); | ||
866 | |||
867 | uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev) | ||
868 | { | ||
869 | struct blkio_policy_node *pn; | ||
870 | unsigned long flags; | ||
871 | uint64_t bps = -1; | ||
872 | |||
873 | spin_lock_irqsave(&blkcg->lock, flags); | ||
874 | pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, | ||
875 | BLKIO_THROTL_read_bps_device); | ||
876 | if (pn) | ||
877 | bps = pn->val.bps; | ||
878 | spin_unlock_irqrestore(&blkcg->lock, flags); | ||
879 | |||
880 | return bps; | ||
881 | } | ||
882 | |||
883 | uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev) | ||
884 | { | ||
885 | struct blkio_policy_node *pn; | ||
886 | unsigned long flags; | ||
887 | uint64_t bps = -1; | ||
888 | |||
889 | spin_lock_irqsave(&blkcg->lock, flags); | ||
890 | pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, | ||
891 | BLKIO_THROTL_write_bps_device); | ||
892 | if (pn) | ||
893 | bps = pn->val.bps; | ||
894 | spin_unlock_irqrestore(&blkcg->lock, flags); | ||
895 | |||
896 | return bps; | ||
897 | } | ||
898 | |||
899 | unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev) | ||
900 | { | ||
901 | struct blkio_policy_node *pn; | ||
902 | unsigned long flags; | ||
903 | unsigned int iops = -1; | ||
904 | |||
905 | spin_lock_irqsave(&blkcg->lock, flags); | ||
906 | pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, | ||
907 | BLKIO_THROTL_read_iops_device); | ||
908 | if (pn) | ||
909 | iops = pn->val.iops; | ||
910 | spin_unlock_irqrestore(&blkcg->lock, flags); | ||
911 | |||
912 | return iops; | ||
913 | } | ||
914 | |||
915 | unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev) | ||
916 | { | ||
917 | struct blkio_policy_node *pn; | ||
918 | unsigned long flags; | ||
919 | unsigned int iops = -1; | ||
920 | |||
921 | spin_lock_irqsave(&blkcg->lock, flags); | ||
922 | pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, | ||
923 | BLKIO_THROTL_write_iops_device); | ||
924 | if (pn) | ||
925 | iops = pn->val.iops; | ||
926 | spin_unlock_irqrestore(&blkcg->lock, flags); | ||
927 | |||
928 | return iops; | ||
929 | } | ||
930 | 340 | ||
931 | /* Checks whether user asked for deleting a policy rule */ | 341 | if (blkcg_policy_enabled(blkg->q, pol) && |
932 | static bool blkio_delete_rule_command(struct blkio_policy_node *pn) | 342 | pol->pd_reset_stats_fn) |
933 | { | 343 | pol->pd_reset_stats_fn(blkg); |
934 | switch(pn->plid) { | ||
935 | case BLKIO_POLICY_PROP: | ||
936 | if (pn->val.weight == 0) | ||
937 | return 1; | ||
938 | break; | ||
939 | case BLKIO_POLICY_THROTL: | ||
940 | switch(pn->fileid) { | ||
941 | case BLKIO_THROTL_read_bps_device: | ||
942 | case BLKIO_THROTL_write_bps_device: | ||
943 | if (pn->val.bps == 0) | ||
944 | return 1; | ||
945 | break; | ||
946 | case BLKIO_THROTL_read_iops_device: | ||
947 | case BLKIO_THROTL_write_iops_device: | ||
948 | if (pn->val.iops == 0) | ||
949 | return 1; | ||
950 | } | 344 | } |
951 | break; | ||
952 | default: | ||
953 | BUG(); | ||
954 | } | 345 | } |
955 | 346 | ||
347 | spin_unlock_irq(&blkcg->lock); | ||
348 | mutex_unlock(&blkcg_pol_mutex); | ||
956 | return 0; | 349 | return 0; |
957 | } | 350 | } |
958 | 351 | ||
959 | static void blkio_update_policy_rule(struct blkio_policy_node *oldpn, | 352 | static const char *blkg_dev_name(struct blkcg_gq *blkg) |
960 | struct blkio_policy_node *newpn) | ||
961 | { | ||
962 | switch(oldpn->plid) { | ||
963 | case BLKIO_POLICY_PROP: | ||
964 | oldpn->val.weight = newpn->val.weight; | ||
965 | break; | ||
966 | case BLKIO_POLICY_THROTL: | ||
967 | switch(newpn->fileid) { | ||
968 | case BLKIO_THROTL_read_bps_device: | ||
969 | case BLKIO_THROTL_write_bps_device: | ||
970 | oldpn->val.bps = newpn->val.bps; | ||
971 | break; | ||
972 | case BLKIO_THROTL_read_iops_device: | ||
973 | case BLKIO_THROTL_write_iops_device: | ||
974 | oldpn->val.iops = newpn->val.iops; | ||
975 | } | ||
976 | break; | ||
977 | default: | ||
978 | BUG(); | ||
979 | } | ||
980 | } | ||
981 | |||
982 | /* | ||
983 | * Some rules/values in blkg have changed. Propagate those to respective | ||
984 | * policies. | ||
985 | */ | ||
986 | static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg, | ||
987 | struct blkio_group *blkg, struct blkio_policy_node *pn) | ||
988 | { | 353 | { |
989 | unsigned int weight, iops; | 354 | /* some drivers (floppy) instantiate a queue w/o disk registered */ |
990 | u64 bps; | 355 | if (blkg->q->backing_dev_info.dev) |
991 | 356 | return dev_name(blkg->q->backing_dev_info.dev); | |
992 | switch(pn->plid) { | 357 | return NULL; |
993 | case BLKIO_POLICY_PROP: | ||
994 | weight = pn->val.weight ? pn->val.weight : | ||
995 | blkcg->weight; | ||
996 | blkio_update_group_weight(blkg, weight); | ||
997 | break; | ||
998 | case BLKIO_POLICY_THROTL: | ||
999 | switch(pn->fileid) { | ||
1000 | case BLKIO_THROTL_read_bps_device: | ||
1001 | case BLKIO_THROTL_write_bps_device: | ||
1002 | bps = pn->val.bps ? pn->val.bps : (-1); | ||
1003 | blkio_update_group_bps(blkg, bps, pn->fileid); | ||
1004 | break; | ||
1005 | case BLKIO_THROTL_read_iops_device: | ||
1006 | case BLKIO_THROTL_write_iops_device: | ||
1007 | iops = pn->val.iops ? pn->val.iops : (-1); | ||
1008 | blkio_update_group_iops(blkg, iops, pn->fileid); | ||
1009 | break; | ||
1010 | } | ||
1011 | break; | ||
1012 | default: | ||
1013 | BUG(); | ||
1014 | } | ||
1015 | } | 358 | } |
1016 | 359 | ||
1017 | /* | 360 | /** |
1018 | * A policy node rule has been updated. Propagate this update to all the | 361 | * blkcg_print_blkgs - helper for printing per-blkg data |
1019 | * block groups which might be affected by this update. | 362 | * @sf: seq_file to print to |
363 | * @blkcg: blkcg of interest | ||
364 | * @prfill: fill function to print out a blkg | ||
365 | * @pol: policy in question | ||
366 | * @data: data to be passed to @prfill | ||
367 | * @show_total: to print out sum of prfill return values or not | ||
368 | * | ||
369 | * This function invokes @prfill on each blkg of @blkcg if pd for the | ||
370 | * policy specified by @pol exists. @prfill is invoked with @sf, the | ||
371 | * policy data and @data. If @show_total is %true, the sum of the return | ||
372 | * values from @prfill is printed with "Total" label at the end. | ||
373 | * | ||
374 | * This is to be used to construct print functions for | ||
375 | * cftype->read_seq_string method. | ||
1020 | */ | 376 | */ |
1021 | static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg, | 377 | void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, |
1022 | struct blkio_policy_node *pn) | 378 | u64 (*prfill)(struct seq_file *, |
379 | struct blkg_policy_data *, int), | ||
380 | const struct blkcg_policy *pol, int data, | ||
381 | bool show_total) | ||
1023 | { | 382 | { |
1024 | struct blkio_group *blkg; | 383 | struct blkcg_gq *blkg; |
1025 | struct hlist_node *n; | 384 | struct hlist_node *n; |
385 | u64 total = 0; | ||
1026 | 386 | ||
1027 | spin_lock(&blkio_list_lock); | ||
1028 | spin_lock_irq(&blkcg->lock); | 387 | spin_lock_irq(&blkcg->lock); |
1029 | 388 | hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) | |
1030 | hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { | 389 | if (blkcg_policy_enabled(blkg->q, pol)) |
1031 | if (pn->dev != blkg->dev || pn->plid != blkg->plid) | 390 | total += prfill(sf, blkg->pd[pol->plid], data); |
1032 | continue; | ||
1033 | blkio_update_blkg_policy(blkcg, blkg, pn); | ||
1034 | } | ||
1035 | |||
1036 | spin_unlock_irq(&blkcg->lock); | 391 | spin_unlock_irq(&blkcg->lock); |
1037 | spin_unlock(&blkio_list_lock); | 392 | |
393 | if (show_total) | ||
394 | seq_printf(sf, "Total %llu\n", (unsigned long long)total); | ||
1038 | } | 395 | } |
396 | EXPORT_SYMBOL_GPL(blkcg_print_blkgs); | ||
1039 | 397 | ||
1040 | static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft, | 398 | /** |
1041 | const char *buffer) | 399 | * __blkg_prfill_u64 - prfill helper for a single u64 value |
400 | * @sf: seq_file to print to | ||
401 | * @pd: policy private data of interest | ||
402 | * @v: value to print | ||
403 | * | ||
404 | * Print @v to @sf for the device assocaited with @pd. | ||
405 | */ | ||
406 | u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v) | ||
1042 | { | 407 | { |
1043 | int ret = 0; | 408 | const char *dname = blkg_dev_name(pd->blkg); |
1044 | char *buf; | ||
1045 | struct blkio_policy_node *newpn, *pn; | ||
1046 | struct blkio_cgroup *blkcg; | ||
1047 | int keep_newpn = 0; | ||
1048 | enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); | ||
1049 | int fileid = BLKIOFILE_ATTR(cft->private); | ||
1050 | |||
1051 | buf = kstrdup(buffer, GFP_KERNEL); | ||
1052 | if (!buf) | ||
1053 | return -ENOMEM; | ||
1054 | |||
1055 | newpn = kzalloc(sizeof(*newpn), GFP_KERNEL); | ||
1056 | if (!newpn) { | ||
1057 | ret = -ENOMEM; | ||
1058 | goto free_buf; | ||
1059 | } | ||
1060 | |||
1061 | ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid); | ||
1062 | if (ret) | ||
1063 | goto free_newpn; | ||
1064 | |||
1065 | blkcg = cgroup_to_blkio_cgroup(cgrp); | ||
1066 | |||
1067 | spin_lock_irq(&blkcg->lock); | ||
1068 | |||
1069 | pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid); | ||
1070 | if (!pn) { | ||
1071 | if (!blkio_delete_rule_command(newpn)) { | ||
1072 | blkio_policy_insert_node(blkcg, newpn); | ||
1073 | keep_newpn = 1; | ||
1074 | } | ||
1075 | spin_unlock_irq(&blkcg->lock); | ||
1076 | goto update_io_group; | ||
1077 | } | ||
1078 | |||
1079 | if (blkio_delete_rule_command(newpn)) { | ||
1080 | blkio_policy_delete_node(pn); | ||
1081 | kfree(pn); | ||
1082 | spin_unlock_irq(&blkcg->lock); | ||
1083 | goto update_io_group; | ||
1084 | } | ||
1085 | spin_unlock_irq(&blkcg->lock); | ||
1086 | 409 | ||
1087 | blkio_update_policy_rule(pn, newpn); | 410 | if (!dname) |
411 | return 0; | ||
1088 | 412 | ||
1089 | update_io_group: | 413 | seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v); |
1090 | blkio_update_policy_node_blkg(blkcg, newpn); | 414 | return v; |
1091 | |||
1092 | free_newpn: | ||
1093 | if (!keep_newpn) | ||
1094 | kfree(newpn); | ||
1095 | free_buf: | ||
1096 | kfree(buf); | ||
1097 | return ret; | ||
1098 | } | 415 | } |
416 | EXPORT_SYMBOL_GPL(__blkg_prfill_u64); | ||
1099 | 417 | ||
1100 | static void | 418 | /** |
1101 | blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn) | 419 | * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat |
1102 | { | 420 | * @sf: seq_file to print to |
1103 | switch(pn->plid) { | 421 | * @pd: policy private data of interest |
1104 | case BLKIO_POLICY_PROP: | 422 | * @rwstat: rwstat to print |
1105 | if (pn->fileid == BLKIO_PROP_weight_device) | 423 | * |
1106 | seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), | 424 | * Print @rwstat to @sf for the device assocaited with @pd. |
1107 | MINOR(pn->dev), pn->val.weight); | 425 | */ |
1108 | break; | 426 | u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, |
1109 | case BLKIO_POLICY_THROTL: | 427 | const struct blkg_rwstat *rwstat) |
1110 | switch(pn->fileid) { | 428 | { |
1111 | case BLKIO_THROTL_read_bps_device: | 429 | static const char *rwstr[] = { |
1112 | case BLKIO_THROTL_write_bps_device: | 430 | [BLKG_RWSTAT_READ] = "Read", |
1113 | seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev), | 431 | [BLKG_RWSTAT_WRITE] = "Write", |
1114 | MINOR(pn->dev), pn->val.bps); | 432 | [BLKG_RWSTAT_SYNC] = "Sync", |
1115 | break; | 433 | [BLKG_RWSTAT_ASYNC] = "Async", |
1116 | case BLKIO_THROTL_read_iops_device: | 434 | }; |
1117 | case BLKIO_THROTL_write_iops_device: | 435 | const char *dname = blkg_dev_name(pd->blkg); |
1118 | seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), | 436 | u64 v; |
1119 | MINOR(pn->dev), pn->val.iops); | 437 | int i; |
1120 | break; | ||
1121 | } | ||
1122 | break; | ||
1123 | default: | ||
1124 | BUG(); | ||
1125 | } | ||
1126 | } | ||
1127 | 438 | ||
1128 | /* cgroup files which read their data from policy nodes end up here */ | 439 | if (!dname) |
1129 | static void blkio_read_policy_node_files(struct cftype *cft, | 440 | return 0; |
1130 | struct blkio_cgroup *blkcg, struct seq_file *m) | ||
1131 | { | ||
1132 | struct blkio_policy_node *pn; | ||
1133 | |||
1134 | if (!list_empty(&blkcg->policy_list)) { | ||
1135 | spin_lock_irq(&blkcg->lock); | ||
1136 | list_for_each_entry(pn, &blkcg->policy_list, node) { | ||
1137 | if (!pn_matches_cftype(cft, pn)) | ||
1138 | continue; | ||
1139 | blkio_print_policy_node(m, pn); | ||
1140 | } | ||
1141 | spin_unlock_irq(&blkcg->lock); | ||
1142 | } | ||
1143 | } | ||
1144 | 441 | ||
1145 | static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft, | 442 | for (i = 0; i < BLKG_RWSTAT_NR; i++) |
1146 | struct seq_file *m) | 443 | seq_printf(sf, "%s %s %llu\n", dname, rwstr[i], |
1147 | { | 444 | (unsigned long long)rwstat->cnt[i]); |
1148 | struct blkio_cgroup *blkcg; | ||
1149 | enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); | ||
1150 | int name = BLKIOFILE_ATTR(cft->private); | ||
1151 | |||
1152 | blkcg = cgroup_to_blkio_cgroup(cgrp); | ||
1153 | |||
1154 | switch(plid) { | ||
1155 | case BLKIO_POLICY_PROP: | ||
1156 | switch(name) { | ||
1157 | case BLKIO_PROP_weight_device: | ||
1158 | blkio_read_policy_node_files(cft, blkcg, m); | ||
1159 | return 0; | ||
1160 | default: | ||
1161 | BUG(); | ||
1162 | } | ||
1163 | break; | ||
1164 | case BLKIO_POLICY_THROTL: | ||
1165 | switch(name){ | ||
1166 | case BLKIO_THROTL_read_bps_device: | ||
1167 | case BLKIO_THROTL_write_bps_device: | ||
1168 | case BLKIO_THROTL_read_iops_device: | ||
1169 | case BLKIO_THROTL_write_iops_device: | ||
1170 | blkio_read_policy_node_files(cft, blkcg, m); | ||
1171 | return 0; | ||
1172 | default: | ||
1173 | BUG(); | ||
1174 | } | ||
1175 | break; | ||
1176 | default: | ||
1177 | BUG(); | ||
1178 | } | ||
1179 | 445 | ||
1180 | return 0; | 446 | v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE]; |
447 | seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v); | ||
448 | return v; | ||
1181 | } | 449 | } |
1182 | 450 | ||
1183 | static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg, | 451 | /** |
1184 | struct cftype *cft, struct cgroup_map_cb *cb, | 452 | * blkg_prfill_stat - prfill callback for blkg_stat |
1185 | enum stat_type type, bool show_total, bool pcpu) | 453 | * @sf: seq_file to print to |
454 | * @pd: policy private data of interest | ||
455 | * @off: offset to the blkg_stat in @pd | ||
456 | * | ||
457 | * prfill callback for printing a blkg_stat. | ||
458 | */ | ||
459 | u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off) | ||
1186 | { | 460 | { |
1187 | struct blkio_group *blkg; | 461 | return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off)); |
1188 | struct hlist_node *n; | ||
1189 | uint64_t cgroup_total = 0; | ||
1190 | |||
1191 | rcu_read_lock(); | ||
1192 | hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { | ||
1193 | if (blkg->dev) { | ||
1194 | if (!cftype_blkg_same_policy(cft, blkg)) | ||
1195 | continue; | ||
1196 | if (pcpu) | ||
1197 | cgroup_total += blkio_get_stat_cpu(blkg, cb, | ||
1198 | blkg->dev, type); | ||
1199 | else { | ||
1200 | spin_lock_irq(&blkg->stats_lock); | ||
1201 | cgroup_total += blkio_get_stat(blkg, cb, | ||
1202 | blkg->dev, type); | ||
1203 | spin_unlock_irq(&blkg->stats_lock); | ||
1204 | } | ||
1205 | } | ||
1206 | } | ||
1207 | if (show_total) | ||
1208 | cb->fill(cb, "Total", cgroup_total); | ||
1209 | rcu_read_unlock(); | ||
1210 | return 0; | ||
1211 | } | 462 | } |
463 | EXPORT_SYMBOL_GPL(blkg_prfill_stat); | ||
1212 | 464 | ||
1213 | /* All map-type cgroup files get serviced by this function */ | 465 | /** |
1214 | static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft, | 466 | * blkg_prfill_rwstat - prfill callback for blkg_rwstat |
1215 | struct cgroup_map_cb *cb) | 467 | * @sf: seq_file to print to |
468 | * @pd: policy private data of interest | ||
469 | * @off: offset to the blkg_rwstat in @pd | ||
470 | * | ||
471 | * prfill callback for printing a blkg_rwstat. | ||
472 | */ | ||
473 | u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, | ||
474 | int off) | ||
1216 | { | 475 | { |
1217 | struct blkio_cgroup *blkcg; | 476 | struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off); |
1218 | enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); | ||
1219 | int name = BLKIOFILE_ATTR(cft->private); | ||
1220 | |||
1221 | blkcg = cgroup_to_blkio_cgroup(cgrp); | ||
1222 | |||
1223 | switch(plid) { | ||
1224 | case BLKIO_POLICY_PROP: | ||
1225 | switch(name) { | ||
1226 | case BLKIO_PROP_time: | ||
1227 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1228 | BLKIO_STAT_TIME, 0, 0); | ||
1229 | case BLKIO_PROP_sectors: | ||
1230 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1231 | BLKIO_STAT_CPU_SECTORS, 0, 1); | ||
1232 | case BLKIO_PROP_io_service_bytes: | ||
1233 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1234 | BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1); | ||
1235 | case BLKIO_PROP_io_serviced: | ||
1236 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1237 | BLKIO_STAT_CPU_SERVICED, 1, 1); | ||
1238 | case BLKIO_PROP_io_service_time: | ||
1239 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1240 | BLKIO_STAT_SERVICE_TIME, 1, 0); | ||
1241 | case BLKIO_PROP_io_wait_time: | ||
1242 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1243 | BLKIO_STAT_WAIT_TIME, 1, 0); | ||
1244 | case BLKIO_PROP_io_merged: | ||
1245 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1246 | BLKIO_STAT_CPU_MERGED, 1, 1); | ||
1247 | case BLKIO_PROP_io_queued: | ||
1248 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1249 | BLKIO_STAT_QUEUED, 1, 0); | ||
1250 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
1251 | case BLKIO_PROP_unaccounted_time: | ||
1252 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1253 | BLKIO_STAT_UNACCOUNTED_TIME, 0, 0); | ||
1254 | case BLKIO_PROP_dequeue: | ||
1255 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1256 | BLKIO_STAT_DEQUEUE, 0, 0); | ||
1257 | case BLKIO_PROP_avg_queue_size: | ||
1258 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1259 | BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0); | ||
1260 | case BLKIO_PROP_group_wait_time: | ||
1261 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1262 | BLKIO_STAT_GROUP_WAIT_TIME, 0, 0); | ||
1263 | case BLKIO_PROP_idle_time: | ||
1264 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1265 | BLKIO_STAT_IDLE_TIME, 0, 0); | ||
1266 | case BLKIO_PROP_empty_time: | ||
1267 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1268 | BLKIO_STAT_EMPTY_TIME, 0, 0); | ||
1269 | #endif | ||
1270 | default: | ||
1271 | BUG(); | ||
1272 | } | ||
1273 | break; | ||
1274 | case BLKIO_POLICY_THROTL: | ||
1275 | switch(name){ | ||
1276 | case BLKIO_THROTL_io_service_bytes: | ||
1277 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1278 | BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1); | ||
1279 | case BLKIO_THROTL_io_serviced: | ||
1280 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1281 | BLKIO_STAT_CPU_SERVICED, 1, 1); | ||
1282 | default: | ||
1283 | BUG(); | ||
1284 | } | ||
1285 | break; | ||
1286 | default: | ||
1287 | BUG(); | ||
1288 | } | ||
1289 | 477 | ||
1290 | return 0; | 478 | return __blkg_prfill_rwstat(sf, pd, &rwstat); |
1291 | } | 479 | } |
480 | EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); | ||
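Taken together, blkcg_print_blkgs() and the prfill helpers above are meant to be composed into per-policy cgroup read handlers. The following is a minimal sketch of that composition under stated assumptions: struct example_pd, its serviced/weight fields and blkcg_policy_example are hypothetical and not part of this patch; only the helper calls come from the code above.

struct example_pd {
        struct blkg_policy_data pd;     /* must be the first member */
        struct blkg_rwstat serviced;    /* hypothetical per-device counter */
        unsigned int weight;            /* hypothetical knob, used in later sketches */
};

static struct blkcg_policy blkcg_policy_example;        /* hypothetical policy */

/* read_seq_string-style handler: one line per device plus an optional total */
static int example_print_serviced(struct cgroup *cgrp, struct cftype *cft,
                                  struct seq_file *sf)
{
        struct blkcg *blkcg = cgroup_to_blkcg(cgrp);

        blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat,
                          &blkcg_policy_example,
                          offsetof(struct example_pd, serviced), true);
        return 0;
}

Because blkg_prfill_rwstat() adds the offset to the blkg_policy_data pointer, the offsetof() computed on the outer struct only lines up when struct blkg_policy_data sits at the very start, matching the embedding rule documented in blk-cgroup.h.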
1292 | 481 | ||
1293 | static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val) | 482 | /** |
483 | * blkg_conf_prep - parse and prepare for per-blkg config update | ||
484 | * @blkcg: target block cgroup | ||
485 | * @pol: target policy | ||
486 | * @input: input string | ||
487 | * @ctx: blkg_conf_ctx to be filled | ||
488 | * | ||
489 | * Parse per-blkg config update from @input and initialize @ctx with the | ||
490 | * result. @ctx->blkg points to the blkg to be updated and @ctx->v the new | ||
491 | * value. This function returns with RCU read lock and queue lock held and | ||
492 | * must be paired with blkg_conf_finish(). | ||
493 | */ | ||
494 | int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, | ||
495 | const char *input, struct blkg_conf_ctx *ctx) | ||
496 | __acquires(rcu) __acquires(disk->queue->queue_lock) | ||
1294 | { | 497 | { |
1295 | struct blkio_group *blkg; | 498 | struct gendisk *disk; |
1296 | struct hlist_node *n; | 499 | struct blkcg_gq *blkg; |
1297 | struct blkio_policy_node *pn; | 500 | unsigned int major, minor; |
501 | unsigned long long v; | ||
502 | int part, ret; | ||
1298 | 503 | ||
1299 | if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) | 504 | if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3) |
1300 | return -EINVAL; | 505 | return -EINVAL; |
1301 | 506 | ||
1302 | spin_lock(&blkio_list_lock); | 507 | disk = get_gendisk(MKDEV(major, minor), &part); |
1303 | spin_lock_irq(&blkcg->lock); | 508 | if (!disk || part) |
1304 | blkcg->weight = (unsigned int)val; | 509 | return -EINVAL; |
1305 | |||
1306 | hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { | ||
1307 | pn = blkio_policy_search_node(blkcg, blkg->dev, | ||
1308 | BLKIO_POLICY_PROP, BLKIO_PROP_weight_device); | ||
1309 | if (pn) | ||
1310 | continue; | ||
1311 | |||
1312 | blkio_update_group_weight(blkg, blkcg->weight); | ||
1313 | } | ||
1314 | spin_unlock_irq(&blkcg->lock); | ||
1315 | spin_unlock(&blkio_list_lock); | ||
1316 | return 0; | ||
1317 | } | ||
1318 | 510 | ||
1319 | static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) { | 511 | rcu_read_lock(); |
1320 | struct blkio_cgroup *blkcg; | 512 | spin_lock_irq(disk->queue->queue_lock); |
1321 | enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); | ||
1322 | int name = BLKIOFILE_ATTR(cft->private); | ||
1323 | 513 | ||
1324 | blkcg = cgroup_to_blkio_cgroup(cgrp); | 514 | if (blkcg_policy_enabled(disk->queue, pol)) |
515 | blkg = blkg_lookup_create(blkcg, disk->queue); | ||
516 | else | ||
517 | blkg = ERR_PTR(-EINVAL); | ||
1325 | 518 | ||
1326 | switch(plid) { | 519 | if (IS_ERR(blkg)) { |
1327 | case BLKIO_POLICY_PROP: | 520 | ret = PTR_ERR(blkg); |
1328 | switch(name) { | 521 | rcu_read_unlock(); |
1329 | case BLKIO_PROP_weight: | 522 | spin_unlock_irq(disk->queue->queue_lock); |
1330 | return (u64)blkcg->weight; | 523 | put_disk(disk); |
524 | /* | ||
525 | * If queue was bypassing, we should retry. Do so after a | ||
526 | * short msleep(). It isn't strictly necessary but queue | ||
527 | * can be bypassing for some time and it's always nice to | ||
528 | * avoid busy looping. | ||
529 | */ | ||
530 | if (ret == -EBUSY) { | ||
531 | msleep(10); | ||
532 | ret = restart_syscall(); | ||
1331 | } | 533 | } |
1332 | break; | 534 | return ret; |
1333 | default: | ||
1334 | BUG(); | ||
1335 | } | 535 | } |
536 | |||
537 | ctx->disk = disk; | ||
538 | ctx->blkg = blkg; | ||
539 | ctx->v = v; | ||
1336 | return 0; | 540 | return 0; |
1337 | } | 541 | } |
542 | EXPORT_SYMBOL_GPL(blkg_conf_prep); | ||
1338 | 543 | ||
1339 | static int | 544 | /** |
1340 | blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) | 545 | * blkg_conf_finish - finish up per-blkg config update |
546 | * @ctx: blkg_conf_ctx initialized by blkg_conf_prep() | ||
547 | * | ||
548 | * Finish up after per-blkg config update. This function must be paired | ||
549 | * with blkg_conf_prep(). | ||
550 | */ | ||
551 | void blkg_conf_finish(struct blkg_conf_ctx *ctx) | ||
552 | __releases(ctx->disk->queue->queue_lock) __releases(rcu) | ||
1341 | { | 553 | { |
1342 | struct blkio_cgroup *blkcg; | 554 | spin_unlock_irq(ctx->disk->queue->queue_lock); |
1343 | enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); | 555 | rcu_read_unlock(); |
1344 | int name = BLKIOFILE_ATTR(cft->private); | 556 | put_disk(ctx->disk); |
1345 | |||
1346 | blkcg = cgroup_to_blkio_cgroup(cgrp); | ||
1347 | |||
1348 | switch(plid) { | ||
1349 | case BLKIO_POLICY_PROP: | ||
1350 | switch(name) { | ||
1351 | case BLKIO_PROP_weight: | ||
1352 | return blkio_weight_write(blkcg, val); | ||
1353 | } | ||
1354 | break; | ||
1355 | default: | ||
1356 | BUG(); | ||
1357 | } | ||
1358 | |||
1359 | return 0; | ||
1360 | } | 557 | } |
558 | EXPORT_SYMBOL_GPL(blkg_conf_finish); | ||
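As a hedged illustration of the intended pairing, here is a sketch of a per-device configuration write handler built on blkg_conf_prep()/blkg_conf_finish(). It reuses the hypothetical struct example_pd and blkcg_policy_example from the sketch above; the weight field and the reuse of the CFQ_WEIGHT_MIN/MAX bounds for its range check are assumptions, not something this patch defines.

static int example_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
                                     const char *buf)
{
        struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
        struct blkg_conf_ctx ctx;
        struct blkg_policy_data *pdata;
        int ret;

        ret = blkg_conf_prep(blkcg, &blkcg_policy_example, buf, &ctx);
        if (ret)
                return ret;

        /* rcu and queue locks are held from here until blkg_conf_finish() */
        pdata = blkg_to_pd(ctx.blkg, &blkcg_policy_example);

        ret = -EINVAL;
        if (pdata && ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX) {
                container_of(pdata, struct example_pd, pd)->weight = ctx.v;
                ret = 0;
        }

        blkg_conf_finish(&ctx);
        return ret;
}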
1361 | 559 | ||
1362 | struct cftype blkio_files[] = { | 560 | struct cftype blkcg_files[] = { |
1363 | { | ||
1364 | .name = "weight_device", | ||
1365 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | ||
1366 | BLKIO_PROP_weight_device), | ||
1367 | .read_seq_string = blkiocg_file_read, | ||
1368 | .write_string = blkiocg_file_write, | ||
1369 | .max_write_len = 256, | ||
1370 | }, | ||
1371 | { | ||
1372 | .name = "weight", | ||
1373 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | ||
1374 | BLKIO_PROP_weight), | ||
1375 | .read_u64 = blkiocg_file_read_u64, | ||
1376 | .write_u64 = blkiocg_file_write_u64, | ||
1377 | }, | ||
1378 | { | ||
1379 | .name = "time", | ||
1380 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | ||
1381 | BLKIO_PROP_time), | ||
1382 | .read_map = blkiocg_file_read_map, | ||
1383 | }, | ||
1384 | { | ||
1385 | .name = "sectors", | ||
1386 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | ||
1387 | BLKIO_PROP_sectors), | ||
1388 | .read_map = blkiocg_file_read_map, | ||
1389 | }, | ||
1390 | { | ||
1391 | .name = "io_service_bytes", | ||
1392 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | ||
1393 | BLKIO_PROP_io_service_bytes), | ||
1394 | .read_map = blkiocg_file_read_map, | ||
1395 | }, | ||
1396 | { | ||
1397 | .name = "io_serviced", | ||
1398 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | ||
1399 | BLKIO_PROP_io_serviced), | ||
1400 | .read_map = blkiocg_file_read_map, | ||
1401 | }, | ||
1402 | { | ||
1403 | .name = "io_service_time", | ||
1404 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | ||
1405 | BLKIO_PROP_io_service_time), | ||
1406 | .read_map = blkiocg_file_read_map, | ||
1407 | }, | ||
1408 | { | ||
1409 | .name = "io_wait_time", | ||
1410 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | ||
1411 | BLKIO_PROP_io_wait_time), | ||
1412 | .read_map = blkiocg_file_read_map, | ||
1413 | }, | ||
1414 | { | ||
1415 | .name = "io_merged", | ||
1416 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | ||
1417 | BLKIO_PROP_io_merged), | ||
1418 | .read_map = blkiocg_file_read_map, | ||
1419 | }, | ||
1420 | { | ||
1421 | .name = "io_queued", | ||
1422 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | ||
1423 | BLKIO_PROP_io_queued), | ||
1424 | .read_map = blkiocg_file_read_map, | ||
1425 | }, | ||
1426 | { | 561 | { |
1427 | .name = "reset_stats", | 562 | .name = "reset_stats", |
1428 | .write_u64 = blkiocg_reset_stats, | 563 | .write_u64 = blkcg_reset_stats, |
1429 | }, | ||
1430 | #ifdef CONFIG_BLK_DEV_THROTTLING | ||
1431 | { | ||
1432 | .name = "throttle.read_bps_device", | ||
1433 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, | ||
1434 | BLKIO_THROTL_read_bps_device), | ||
1435 | .read_seq_string = blkiocg_file_read, | ||
1436 | .write_string = blkiocg_file_write, | ||
1437 | .max_write_len = 256, | ||
1438 | }, | ||
1439 | |||
1440 | { | ||
1441 | .name = "throttle.write_bps_device", | ||
1442 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, | ||
1443 | BLKIO_THROTL_write_bps_device), | ||
1444 | .read_seq_string = blkiocg_file_read, | ||
1445 | .write_string = blkiocg_file_write, | ||
1446 | .max_write_len = 256, | ||
1447 | }, | ||
1448 | |||
1449 | { | ||
1450 | .name = "throttle.read_iops_device", | ||
1451 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, | ||
1452 | BLKIO_THROTL_read_iops_device), | ||
1453 | .read_seq_string = blkiocg_file_read, | ||
1454 | .write_string = blkiocg_file_write, | ||
1455 | .max_write_len = 256, | ||
1456 | }, | ||
1457 | |||
1458 | { | ||
1459 | .name = "throttle.write_iops_device", | ||
1460 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, | ||
1461 | BLKIO_THROTL_write_iops_device), | ||
1462 | .read_seq_string = blkiocg_file_read, | ||
1463 | .write_string = blkiocg_file_write, | ||
1464 | .max_write_len = 256, | ||
1465 | }, | ||
1466 | { | ||
1467 | .name = "throttle.io_service_bytes", | ||
1468 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, | ||
1469 | BLKIO_THROTL_io_service_bytes), | ||
1470 | .read_map = blkiocg_file_read_map, | ||
1471 | }, | ||
1472 | { | ||
1473 | .name = "throttle.io_serviced", | ||
1474 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, | ||
1475 | BLKIO_THROTL_io_serviced), | ||
1476 | .read_map = blkiocg_file_read_map, | ||
1477 | }, | ||
1478 | #endif /* CONFIG_BLK_DEV_THROTTLING */ | ||
1479 | |||
1480 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
1481 | { | ||
1482 | .name = "avg_queue_size", | ||
1483 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | ||
1484 | BLKIO_PROP_avg_queue_size), | ||
1485 | .read_map = blkiocg_file_read_map, | ||
1486 | }, | 564 | }, |
1487 | { | ||
1488 | .name = "group_wait_time", | ||
1489 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | ||
1490 | BLKIO_PROP_group_wait_time), | ||
1491 | .read_map = blkiocg_file_read_map, | ||
1492 | }, | ||
1493 | { | ||
1494 | .name = "idle_time", | ||
1495 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | ||
1496 | BLKIO_PROP_idle_time), | ||
1497 | .read_map = blkiocg_file_read_map, | ||
1498 | }, | ||
1499 | { | ||
1500 | .name = "empty_time", | ||
1501 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | ||
1502 | BLKIO_PROP_empty_time), | ||
1503 | .read_map = blkiocg_file_read_map, | ||
1504 | }, | ||
1505 | { | ||
1506 | .name = "dequeue", | ||
1507 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | ||
1508 | BLKIO_PROP_dequeue), | ||
1509 | .read_map = blkiocg_file_read_map, | ||
1510 | }, | ||
1511 | { | ||
1512 | .name = "unaccounted_time", | ||
1513 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | ||
1514 | BLKIO_PROP_unaccounted_time), | ||
1515 | .read_map = blkiocg_file_read_map, | ||
1516 | }, | ||
1517 | #endif | ||
1518 | { } /* terminate */ | 565 | { } /* terminate */ |
1519 | }; | 566 | }; |
1520 | 567 | ||
1521 | static void blkiocg_destroy(struct cgroup *cgroup) | 568 | /** |
569 | * blkcg_pre_destroy - cgroup pre_destroy callback | ||
570 | * @cgroup: cgroup of interest | ||
571 | * | ||
572 | * This function is called when @cgroup is about to go away and is responsible | ||
573 | * for shooting down all blkgs associated with @cgroup. blkgs should be | ||
574 | * removed while holding both q and blkcg locks. As blkcg lock is nested | ||
575 | * inside q lock, this function performs reverse double lock dancing. | ||
576 | * | ||
577 | * This is the blkcg counterpart of ioc_release_fn(). | ||
578 | */ | ||
579 | static int blkcg_pre_destroy(struct cgroup *cgroup) | ||
1522 | { | 580 | { |
1523 | struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); | 581 | struct blkcg *blkcg = cgroup_to_blkcg(cgroup); |
1524 | unsigned long flags; | ||
1525 | struct blkio_group *blkg; | ||
1526 | void *key; | ||
1527 | struct blkio_policy_type *blkiop; | ||
1528 | struct blkio_policy_node *pn, *pntmp; | ||
1529 | 582 | ||
1530 | rcu_read_lock(); | 583 | spin_lock_irq(&blkcg->lock); |
1531 | do { | ||
1532 | spin_lock_irqsave(&blkcg->lock, flags); | ||
1533 | 584 | ||
1534 | if (hlist_empty(&blkcg->blkg_list)) { | 585 | while (!hlist_empty(&blkcg->blkg_list)) { |
1535 | spin_unlock_irqrestore(&blkcg->lock, flags); | 586 | struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first, |
1536 | break; | 587 | struct blkcg_gq, blkcg_node); |
588 | struct request_queue *q = blkg->q; | ||
589 | |||
590 | if (spin_trylock(q->queue_lock)) { | ||
591 | blkg_destroy(blkg); | ||
592 | spin_unlock(q->queue_lock); | ||
593 | } else { | ||
594 | spin_unlock_irq(&blkcg->lock); | ||
595 | cpu_relax(); | ||
596 | spin_lock_irq(&blkcg->lock); | ||
1537 | } | 597 | } |
598 | } | ||
1538 | 599 | ||
1539 | blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group, | 600 | spin_unlock_irq(&blkcg->lock); |
1540 | blkcg_node); | 601 | return 0; |
1541 | key = rcu_dereference(blkg->key); | 602 | } |
1542 | __blkiocg_del_blkio_group(blkg); | ||
1543 | |||
1544 | spin_unlock_irqrestore(&blkcg->lock, flags); | ||
1545 | |||
1546 | /* | ||
1547 | * This blkio_group is being unlinked as associated cgroup is | ||
1548 | * going away. Let all the IO controlling policies know about | ||
1549 | * this event. | ||
1550 | */ | ||
1551 | spin_lock(&blkio_list_lock); | ||
1552 | list_for_each_entry(blkiop, &blkio_list, list) { | ||
1553 | if (blkiop->plid != blkg->plid) | ||
1554 | continue; | ||
1555 | blkiop->ops.blkio_unlink_group_fn(key, blkg); | ||
1556 | } | ||
1557 | spin_unlock(&blkio_list_lock); | ||
1558 | } while (1); | ||
1559 | 603 | ||
1560 | list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) { | 604 | static void blkcg_destroy(struct cgroup *cgroup) |
1561 | blkio_policy_delete_node(pn); | 605 | { |
1562 | kfree(pn); | 606 | struct blkcg *blkcg = cgroup_to_blkcg(cgroup); |
1563 | } | ||
1564 | 607 | ||
1565 | free_css_id(&blkio_subsys, &blkcg->css); | 608 | if (blkcg != &blkcg_root) |
1566 | rcu_read_unlock(); | ||
1567 | if (blkcg != &blkio_root_cgroup) | ||
1568 | kfree(blkcg); | 609 | kfree(blkcg); |
1569 | } | 610 | } |
1570 | 611 | ||
1571 | static struct cgroup_subsys_state *blkiocg_create(struct cgroup *cgroup) | 612 | static struct cgroup_subsys_state *blkcg_create(struct cgroup *cgroup) |
1572 | { | 613 | { |
1573 | struct blkio_cgroup *blkcg; | 614 | static atomic64_t id_seq = ATOMIC64_INIT(0); |
615 | struct blkcg *blkcg; | ||
1574 | struct cgroup *parent = cgroup->parent; | 616 | struct cgroup *parent = cgroup->parent; |
1575 | 617 | ||
1576 | if (!parent) { | 618 | if (!parent) { |
1577 | blkcg = &blkio_root_cgroup; | 619 | blkcg = &blkcg_root; |
1578 | goto done; | 620 | goto done; |
1579 | } | 621 | } |
1580 | 622 | ||
@@ -1582,22 +624,68 @@ static struct cgroup_subsys_state *blkiocg_create(struct cgroup *cgroup) | |||
1582 | if (!blkcg) | 624 | if (!blkcg) |
1583 | return ERR_PTR(-ENOMEM); | 625 | return ERR_PTR(-ENOMEM); |
1584 | 626 | ||
1585 | blkcg->weight = BLKIO_WEIGHT_DEFAULT; | 627 | blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT; |
628 | blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */ | ||
1586 | done: | 629 | done: |
1587 | spin_lock_init(&blkcg->lock); | 630 | spin_lock_init(&blkcg->lock); |
631 | INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC); | ||
1588 | INIT_HLIST_HEAD(&blkcg->blkg_list); | 632 | INIT_HLIST_HEAD(&blkcg->blkg_list); |
1589 | 633 | ||
1590 | INIT_LIST_HEAD(&blkcg->policy_list); | ||
1591 | return &blkcg->css; | 634 | return &blkcg->css; |
1592 | } | 635 | } |
1593 | 636 | ||
637 | /** | ||
638 | * blkcg_init_queue - initialize blkcg part of request queue | ||
639 | * @q: request_queue to initialize | ||
640 | * | ||
641 | * Called from blk_alloc_queue_node(). Responsible for initializing blkcg | ||
642 | * part of new request_queue @q. | ||
643 | * | ||
644 | * RETURNS: | ||
645 | * 0 on success, -errno on failure. | ||
646 | */ | ||
647 | int blkcg_init_queue(struct request_queue *q) | ||
648 | { | ||
649 | might_sleep(); | ||
650 | |||
651 | return blk_throtl_init(q); | ||
652 | } | ||
653 | |||
654 | /** | ||
655 | * blkcg_drain_queue - drain blkcg part of request_queue | ||
656 | * @q: request_queue to drain | ||
657 | * | ||
658 | * Called from blk_drain_queue(). Responsible for draining blkcg part. | ||
659 | */ | ||
660 | void blkcg_drain_queue(struct request_queue *q) | ||
661 | { | ||
662 | lockdep_assert_held(q->queue_lock); | ||
663 | |||
664 | blk_throtl_drain(q); | ||
665 | } | ||
666 | |||
667 | /** | ||
668 | * blkcg_exit_queue - exit and release blkcg part of request_queue | ||
669 | * @q: request_queue being released | ||
670 | * | ||
671 | * Called from blk_release_queue(). Responsible for exiting blkcg part. | ||
672 | */ | ||
673 | void blkcg_exit_queue(struct request_queue *q) | ||
674 | { | ||
675 | spin_lock_irq(q->queue_lock); | ||
676 | blkg_destroy_all(q); | ||
677 | spin_unlock_irq(q->queue_lock); | ||
678 | |||
679 | blk_throtl_exit(q); | ||
680 | } | ||
681 | |||
1594 | /* | 682 | /* |
1595 | * We cannot support shared io contexts, as we have no means to support | 683 | * We cannot support shared io contexts, as we have no means to support |
1596 | * two tasks with the same ioc in two different groups without major rework | 684 | * two tasks with the same ioc in two different groups without major rework |
1597 | * of the main cic data structures. For now we allow a task to change | 685 | * of the main cic data structures. For now we allow a task to change |
1598 | * its cgroup only if it's the only owner of its ioc. | 686 | * its cgroup only if it's the only owner of its ioc. |
1599 | */ | 687 | */ |
1600 | static int blkiocg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | 688 | static int blkcg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) |
1601 | { | 689 | { |
1602 | struct task_struct *task; | 690 | struct task_struct *task; |
1603 | struct io_context *ioc; | 691 | struct io_context *ioc; |
@@ -1616,63 +704,213 @@ static int blkiocg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | |||
1616 | return ret; | 704 | return ret; |
1617 | } | 705 | } |
1618 | 706 | ||
1619 | static void blkiocg_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | ||
1620 | { | ||
1621 | struct task_struct *task; | ||
1622 | struct io_context *ioc; | ||
1623 | |||
1624 | cgroup_taskset_for_each(task, cgrp, tset) { | ||
1625 | /* we don't lose anything even if ioc allocation fails */ | ||
1626 | ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); | ||
1627 | if (ioc) { | ||
1628 | ioc_cgroup_changed(ioc); | ||
1629 | put_io_context(ioc); | ||
1630 | } | ||
1631 | } | ||
1632 | } | ||
1633 | |||
1634 | struct cgroup_subsys blkio_subsys = { | 707 | struct cgroup_subsys blkio_subsys = { |
1635 | .name = "blkio", | 708 | .name = "blkio", |
1636 | .create = blkiocg_create, | 709 | .create = blkcg_create, |
1637 | .can_attach = blkiocg_can_attach, | 710 | .can_attach = blkcg_can_attach, |
1638 | .attach = blkiocg_attach, | 711 | .pre_destroy = blkcg_pre_destroy, |
1639 | .destroy = blkiocg_destroy, | 712 | .destroy = blkcg_destroy, |
1640 | #ifdef CONFIG_BLK_CGROUP | ||
1641 | /* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */ | ||
1642 | .subsys_id = blkio_subsys_id, | 713 | .subsys_id = blkio_subsys_id, |
1643 | #endif | 714 | .base_cftypes = blkcg_files, |
1644 | .base_cftypes = blkio_files, | ||
1645 | .use_id = 1, | ||
1646 | .module = THIS_MODULE, | 715 | .module = THIS_MODULE, |
1647 | }; | 716 | }; |
1648 | EXPORT_SYMBOL_GPL(blkio_subsys); | 717 | EXPORT_SYMBOL_GPL(blkio_subsys); |
1649 | 718 | ||
1650 | void blkio_policy_register(struct blkio_policy_type *blkiop) | 719 | /** |
720 | * blkcg_activate_policy - activate a blkcg policy on a request_queue | ||
721 | * @q: request_queue of interest | ||
722 | * @pol: blkcg policy to activate | ||
723 | * | ||
724 | * Activate @pol on @q. Requires %GFP_KERNEL context. @q goes through | ||
725 | * bypass mode to populate its blkgs with policy_data for @pol. | ||
726 | * | ||
727 | * Activation happens with @q bypassed, so nobody would be accessing blkgs | ||
728 | * from IO path. Update of each blkg is protected by both queue and blkcg | ||
729 | * locks so that holding either lock and testing blkcg_policy_enabled() is | ||
730 | * always enough for dereferencing policy data. | ||
731 | * | ||
732 | * The caller is responsible for synchronizing [de]activations and policy | ||
733 | * [un]registrations. Returns 0 on success, -errno on failure. | ||
734 | */ | ||
735 | int blkcg_activate_policy(struct request_queue *q, | ||
736 | const struct blkcg_policy *pol) | ||
1651 | { | 737 | { |
1652 | spin_lock(&blkio_list_lock); | 738 | LIST_HEAD(pds); |
1653 | list_add_tail(&blkiop->list, &blkio_list); | 739 | struct blkcg_gq *blkg; |
1654 | spin_unlock(&blkio_list_lock); | 740 | struct blkg_policy_data *pd, *n; |
741 | int cnt = 0, ret; | ||
742 | |||
743 | if (blkcg_policy_enabled(q, pol)) | ||
744 | return 0; | ||
745 | |||
746 | blk_queue_bypass_start(q); | ||
747 | |||
748 | /* make sure the root blkg exists and count the existing blkgs */ | ||
749 | spin_lock_irq(q->queue_lock); | ||
750 | |||
751 | rcu_read_lock(); | ||
752 | blkg = __blkg_lookup_create(&blkcg_root, q); | ||
753 | rcu_read_unlock(); | ||
754 | |||
755 | if (IS_ERR(blkg)) { | ||
756 | ret = PTR_ERR(blkg); | ||
757 | goto out_unlock; | ||
758 | } | ||
759 | q->root_blkg = blkg; | ||
760 | |||
761 | list_for_each_entry(blkg, &q->blkg_list, q_node) | ||
762 | cnt++; | ||
763 | |||
764 | spin_unlock_irq(q->queue_lock); | ||
765 | |||
766 | /* allocate policy_data for all existing blkgs */ | ||
767 | while (cnt--) { | ||
768 | pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node); | ||
769 | if (!pd) { | ||
770 | ret = -ENOMEM; | ||
771 | goto out_free; | ||
772 | } | ||
773 | list_add_tail(&pd->alloc_node, &pds); | ||
774 | } | ||
775 | |||
776 | /* | ||
777 | * Install the allocated pds. With @q bypassing, no new blkg | ||
778 | * should have been created while the queue lock was dropped. | ||
779 | */ | ||
780 | spin_lock_irq(q->queue_lock); | ||
781 | |||
782 | list_for_each_entry(blkg, &q->blkg_list, q_node) { | ||
783 | if (WARN_ON(list_empty(&pds))) { | ||
784 | /* umm... this shouldn't happen, just abort */ | ||
785 | ret = -ENOMEM; | ||
786 | goto out_unlock; | ||
787 | } | ||
788 | pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node); | ||
789 | list_del_init(&pd->alloc_node); | ||
790 | |||
791 | /* grab blkcg lock too while installing @pd on @blkg */ | ||
792 | spin_lock(&blkg->blkcg->lock); | ||
793 | |||
794 | blkg->pd[pol->plid] = pd; | ||
795 | pd->blkg = blkg; | ||
796 | pol->pd_init_fn(blkg); | ||
797 | |||
798 | spin_unlock(&blkg->blkcg->lock); | ||
799 | } | ||
800 | |||
801 | __set_bit(pol->plid, q->blkcg_pols); | ||
802 | ret = 0; | ||
803 | out_unlock: | ||
804 | spin_unlock_irq(q->queue_lock); | ||
805 | out_free: | ||
806 | blk_queue_bypass_end(q); | ||
807 | list_for_each_entry_safe(pd, n, &pds, alloc_node) | ||
808 | kfree(pd); | ||
809 | return ret; | ||
1655 | } | 810 | } |
1656 | EXPORT_SYMBOL_GPL(blkio_policy_register); | 811 | EXPORT_SYMBOL_GPL(blkcg_activate_policy); |
1657 | 812 | ||
1658 | void blkio_policy_unregister(struct blkio_policy_type *blkiop) | 813 | /** |
814 | * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue | ||
815 | * @q: request_queue of interest | ||
816 | * @pol: blkcg policy to deactivate | ||
817 | * | ||
818 | * Deactivate @pol on @q. Follows the same synchronization rules as | ||
819 | * blkcg_activate_policy(). | ||
820 | */ | ||
821 | void blkcg_deactivate_policy(struct request_queue *q, | ||
822 | const struct blkcg_policy *pol) | ||
1659 | { | 823 | { |
1660 | spin_lock(&blkio_list_lock); | 824 | struct blkcg_gq *blkg; |
1661 | list_del_init(&blkiop->list); | 825 | |
1662 | spin_unlock(&blkio_list_lock); | 826 | if (!blkcg_policy_enabled(q, pol)) |
827 | return; | ||
828 | |||
829 | blk_queue_bypass_start(q); | ||
830 | spin_lock_irq(q->queue_lock); | ||
831 | |||
832 | __clear_bit(pol->plid, q->blkcg_pols); | ||
833 | |||
834 | /* if no policy is left, no need for blkgs - shoot them down */ | ||
835 | if (bitmap_empty(q->blkcg_pols, BLKCG_MAX_POLS)) | ||
836 | blkg_destroy_all(q); | ||
837 | |||
838 | list_for_each_entry(blkg, &q->blkg_list, q_node) { | ||
839 | /* grab blkcg lock too while removing @pd from @blkg */ | ||
840 | spin_lock(&blkg->blkcg->lock); | ||
841 | |||
842 | if (pol->pd_exit_fn) | ||
843 | pol->pd_exit_fn(blkg); | ||
844 | |||
845 | kfree(blkg->pd[pol->plid]); | ||
846 | blkg->pd[pol->plid] = NULL; | ||
847 | |||
848 | spin_unlock(&blkg->blkcg->lock); | ||
849 | } | ||
850 | |||
851 | spin_unlock_irq(q->queue_lock); | ||
852 | blk_queue_bypass_end(q); | ||
1663 | } | 853 | } |
1664 | EXPORT_SYMBOL_GPL(blkio_policy_unregister); | 854 | EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); |
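A minimal sketch of how a driver or elevator might pair these two calls around its own per-queue setup; the hook names and the hypothetical blkcg_policy_example are assumptions for illustration only.

static int example_init_queue(struct request_queue *q)
{
        int ret;

        /* populate policy data for every existing blkg on @q; may sleep */
        ret = blkcg_activate_policy(q, &blkcg_policy_example);
        if (ret)
                return ret;

        /* ... allocate and wire up the rest of the per-queue state ... */
        return 0;
}

static void example_exit_queue(struct request_queue *q)
{
        /* tear the policy data down again before the queue goes away */
        blkcg_deactivate_policy(q, &blkcg_policy_example);
}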
1665 | 855 | ||
1666 | static int __init init_cgroup_blkio(void) | 856 | /** |
857 | * blkcg_policy_register - register a blkcg policy | ||
858 | * @pol: blkcg policy to register | ||
859 | * | ||
860 | * Register @pol with blkcg core. Might sleep and @pol may be modified on | ||
861 | * successful registration. Returns 0 on success and -errno on failure. | ||
862 | */ | ||
863 | int blkcg_policy_register(struct blkcg_policy *pol) | ||
1667 | { | 864 | { |
1668 | return cgroup_load_subsys(&blkio_subsys); | 865 | int i, ret; |
866 | |||
867 | if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data))) | ||
868 | return -EINVAL; | ||
869 | |||
870 | mutex_lock(&blkcg_pol_mutex); | ||
871 | |||
872 | /* find an empty slot */ | ||
873 | ret = -ENOSPC; | ||
874 | for (i = 0; i < BLKCG_MAX_POLS; i++) | ||
875 | if (!blkcg_policy[i]) | ||
876 | break; | ||
877 | if (i >= BLKCG_MAX_POLS) | ||
878 | goto out_unlock; | ||
879 | |||
880 | /* register and update blkgs */ | ||
881 | pol->plid = i; | ||
882 | blkcg_policy[i] = pol; | ||
883 | |||
884 | /* everything is in place, add intf files for the new policy */ | ||
885 | if (pol->cftypes) | ||
886 | WARN_ON(cgroup_add_cftypes(&blkio_subsys, pol->cftypes)); | ||
887 | ret = 0; | ||
888 | out_unlock: | ||
889 | mutex_unlock(&blkcg_pol_mutex); | ||
890 | return ret; | ||
1669 | } | 891 | } |
892 | EXPORT_SYMBOL_GPL(blkcg_policy_register); | ||
1670 | 893 | ||
1671 | static void __exit exit_cgroup_blkio(void) | 894 | /** |
895 | * blkcg_policy_unregister - unregister a blkcg policy | ||
896 | * @pol: blkcg policy to unregister | ||
897 | * | ||
898 | * Undo blkcg_policy_register(@pol). Might sleep. | ||
899 | */ | ||
900 | void blkcg_policy_unregister(struct blkcg_policy *pol) | ||
1672 | { | 901 | { |
1673 | cgroup_unload_subsys(&blkio_subsys); | 902 | mutex_lock(&blkcg_pol_mutex); |
1674 | } | ||
1675 | 903 | ||
1676 | module_init(init_cgroup_blkio); | 904 | if (WARN_ON(blkcg_policy[pol->plid] != pol)) |
1677 | module_exit(exit_cgroup_blkio); | 905 | goto out_unlock; |
1678 | MODULE_LICENSE("GPL"); | 906 | |
907 | /* kill the intf files first */ | ||
908 | if (pol->cftypes) | ||
909 | cgroup_rm_cftypes(&blkio_subsys, pol->cftypes); | ||
910 | |||
911 | /* unregister and update blkgs */ | ||
912 | blkcg_policy[pol->plid] = NULL; | ||
913 | out_unlock: | ||
914 | mutex_unlock(&blkcg_pol_mutex); | ||
915 | } | ||
916 | EXPORT_SYMBOL_GPL(blkcg_policy_unregister); | ||
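To show the registration interface end to end, here is a hedged sketch of a hypothetical module that defines and registers a policy. Everything named example_* is invented for illustration (the cftype array is left empty and struct example_pd is the one from the earlier sketches); only blkcg_policy_register()/blkcg_policy_unregister() and the callback slots come from this patch.

static struct blkcg_policy blkcg_policy_example;

static void example_pd_init(struct blkcg_gq *blkg)
{
        struct example_pd *epd =
                container_of(blkg_to_pd(blkg, &blkcg_policy_example),
                             struct example_pd, pd);

        epd->weight = CFQ_WEIGHT_DEFAULT;       /* hypothetical default */
}

static struct cftype example_files[] = {
        /* per-policy cgroup files would be declared here */
        { }     /* terminate */
};

static struct blkcg_policy blkcg_policy_example = {
        .pd_size        = sizeof(struct example_pd),
        .cftypes        = example_files,
        .pd_init_fn     = example_pd_init,
};

static int __init example_init(void)
{
        /* picks a free plid and adds the policy's cgroup files */
        return blkcg_policy_register(&blkcg_policy_example);
}

static void __exit example_exit(void)
{
        blkcg_policy_unregister(&blkcg_policy_example);
}

module_init(example_init);
module_exit(example_exit);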
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 6f3ace7e792f..8ac457ce7783 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h | |||
@@ -15,350 +15,371 @@ | |||
15 | 15 | ||
16 | #include <linux/cgroup.h> | 16 | #include <linux/cgroup.h> |
17 | #include <linux/u64_stats_sync.h> | 17 | #include <linux/u64_stats_sync.h> |
18 | 18 | #include <linux/seq_file.h> | |
19 | enum blkio_policy_id { | 19 | #include <linux/radix-tree.h> |
20 | BLKIO_POLICY_PROP = 0, /* Proportional Bandwidth division */ | ||
21 | BLKIO_POLICY_THROTL, /* Throttling */ | ||
22 | }; | ||
23 | 20 | ||
24 | /* Max limits for throttle policy */ | 21 | /* Max limits for throttle policy */ |
25 | #define THROTL_IOPS_MAX UINT_MAX | 22 | #define THROTL_IOPS_MAX UINT_MAX |
26 | 23 | ||
27 | #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) | 24 | /* CFQ specific, out here for blkcg->cfq_weight */ |
28 | 25 | #define CFQ_WEIGHT_MIN 10 | |
29 | #ifndef CONFIG_BLK_CGROUP | 26 | #define CFQ_WEIGHT_MAX 1000 |
30 | /* When blk-cgroup is a module, its subsys_id isn't a compile-time constant */ | 27 | #define CFQ_WEIGHT_DEFAULT 500 |
31 | extern struct cgroup_subsys blkio_subsys; | ||
32 | #define blkio_subsys_id blkio_subsys.subsys_id | ||
33 | #endif | ||
34 | |||
35 | enum stat_type { | ||
36 | /* Total time spent (in ns) between request dispatch to the driver and | ||
37 | * request completion for IOs doen by this cgroup. This may not be | ||
38 | * accurate when NCQ is turned on. */ | ||
39 | BLKIO_STAT_SERVICE_TIME = 0, | ||
40 | /* Total time spent waiting in scheduler queue in ns */ | ||
41 | BLKIO_STAT_WAIT_TIME, | ||
42 | /* Number of IOs queued up */ | ||
43 | BLKIO_STAT_QUEUED, | ||
44 | /* All the single valued stats go below this */ | ||
45 | BLKIO_STAT_TIME, | ||
46 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
47 | /* Time not charged to this cgroup */ | ||
48 | BLKIO_STAT_UNACCOUNTED_TIME, | ||
49 | BLKIO_STAT_AVG_QUEUE_SIZE, | ||
50 | BLKIO_STAT_IDLE_TIME, | ||
51 | BLKIO_STAT_EMPTY_TIME, | ||
52 | BLKIO_STAT_GROUP_WAIT_TIME, | ||
53 | BLKIO_STAT_DEQUEUE | ||
54 | #endif | ||
55 | }; | ||
56 | 28 | ||
57 | /* Per cpu stats */ | 29 | #ifdef CONFIG_BLK_CGROUP |
58 | enum stat_type_cpu { | ||
59 | BLKIO_STAT_CPU_SECTORS, | ||
60 | /* Total bytes transferred */ | ||
61 | BLKIO_STAT_CPU_SERVICE_BYTES, | ||
62 | /* Total IOs serviced, post merge */ | ||
63 | BLKIO_STAT_CPU_SERVICED, | ||
64 | /* Number of IOs merged */ | ||
65 | BLKIO_STAT_CPU_MERGED, | ||
66 | BLKIO_STAT_CPU_NR | ||
67 | }; | ||
68 | 30 | ||
69 | enum stat_sub_type { | 31 | enum blkg_rwstat_type { |
70 | BLKIO_STAT_READ = 0, | 32 | BLKG_RWSTAT_READ, |
71 | BLKIO_STAT_WRITE, | 33 | BLKG_RWSTAT_WRITE, |
72 | BLKIO_STAT_SYNC, | 34 | BLKG_RWSTAT_SYNC, |
73 | BLKIO_STAT_ASYNC, | 35 | BLKG_RWSTAT_ASYNC, |
74 | BLKIO_STAT_TOTAL | ||
75 | }; | ||
76 | 36 | ||
77 | /* blkg state flags */ | 37 | BLKG_RWSTAT_NR, |
78 | enum blkg_state_flags { | 38 | BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR, |
79 | BLKG_waiting = 0, | ||
80 | BLKG_idling, | ||
81 | BLKG_empty, | ||
82 | }; | 39 | }; |
83 | 40 | ||
84 | /* cgroup files owned by proportional weight policy */ | 41 | struct blkcg_gq; |
85 | enum blkcg_file_name_prop { | ||
86 | BLKIO_PROP_weight = 1, | ||
87 | BLKIO_PROP_weight_device, | ||
88 | BLKIO_PROP_io_service_bytes, | ||
89 | BLKIO_PROP_io_serviced, | ||
90 | BLKIO_PROP_time, | ||
91 | BLKIO_PROP_sectors, | ||
92 | BLKIO_PROP_unaccounted_time, | ||
93 | BLKIO_PROP_io_service_time, | ||
94 | BLKIO_PROP_io_wait_time, | ||
95 | BLKIO_PROP_io_merged, | ||
96 | BLKIO_PROP_io_queued, | ||
97 | BLKIO_PROP_avg_queue_size, | ||
98 | BLKIO_PROP_group_wait_time, | ||
99 | BLKIO_PROP_idle_time, | ||
100 | BLKIO_PROP_empty_time, | ||
101 | BLKIO_PROP_dequeue, | ||
102 | }; | ||
103 | 42 | ||
104 | /* cgroup files owned by throttle policy */ | 43 | struct blkcg { |
105 | enum blkcg_file_name_throtl { | 44 | struct cgroup_subsys_state css; |
106 | BLKIO_THROTL_read_bps_device, | 45 | spinlock_t lock; |
107 | BLKIO_THROTL_write_bps_device, | ||
108 | BLKIO_THROTL_read_iops_device, | ||
109 | BLKIO_THROTL_write_iops_device, | ||
110 | BLKIO_THROTL_io_service_bytes, | ||
111 | BLKIO_THROTL_io_serviced, | ||
112 | }; | ||
113 | 46 | ||
114 | struct blkio_cgroup { | 47 | struct radix_tree_root blkg_tree; |
115 | struct cgroup_subsys_state css; | 48 | struct blkcg_gq *blkg_hint; |
116 | unsigned int weight; | 49 | struct hlist_head blkg_list; |
117 | spinlock_t lock; | 50 | |
118 | struct hlist_head blkg_list; | 51 | /* for policies to test whether associated blkcg has changed */ |
119 | struct list_head policy_list; /* list of blkio_policy_node */ | 52 | uint64_t id; |
120 | }; | ||
121 | 53 | ||
122 | struct blkio_group_stats { | 54 | /* TODO: per-policy storage in blkcg */ |
123 | /* total disk time and nr sectors dispatched by this group */ | 55 | unsigned int cfq_weight; /* belongs to cfq */ |
124 | uint64_t time; | ||
125 | uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL]; | ||
126 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
127 | /* Time not charged to this cgroup */ | ||
128 | uint64_t unaccounted_time; | ||
129 | |||
130 | /* Sum of number of IOs queued across all samples */ | ||
131 | uint64_t avg_queue_size_sum; | ||
132 | /* Count of samples taken for average */ | ||
133 | uint64_t avg_queue_size_samples; | ||
134 | /* How many times this group has been removed from service tree */ | ||
135 | unsigned long dequeue; | ||
136 | |||
137 | /* Total time spent waiting for it to be assigned a timeslice. */ | ||
138 | uint64_t group_wait_time; | ||
139 | uint64_t start_group_wait_time; | ||
140 | |||
141 | /* Time spent idling for this blkio_group */ | ||
142 | uint64_t idle_time; | ||
143 | uint64_t start_idle_time; | ||
144 | /* | ||
145 | * Total time when we have requests queued and do not contain the | ||
146 | * current active queue. | ||
147 | */ | ||
148 | uint64_t empty_time; | ||
149 | uint64_t start_empty_time; | ||
150 | uint16_t flags; | ||
151 | #endif | ||
152 | }; | 56 | }; |
153 | 57 | ||
154 | /* Per cpu blkio group stats */ | 58 | struct blkg_stat { |
155 | struct blkio_group_stats_cpu { | 59 | struct u64_stats_sync syncp; |
156 | uint64_t sectors; | 60 | uint64_t cnt; |
157 | uint64_t stat_arr_cpu[BLKIO_STAT_CPU_NR][BLKIO_STAT_TOTAL]; | ||
158 | struct u64_stats_sync syncp; | ||
159 | }; | 61 | }; |
160 | 62 | ||
161 | struct blkio_group { | 63 | struct blkg_rwstat { |
162 | /* An rcu protected unique identifier for the group */ | 64 | struct u64_stats_sync syncp; |
163 | void *key; | 65 | uint64_t cnt[BLKG_RWSTAT_NR]; |
164 | struct hlist_node blkcg_node; | ||
165 | unsigned short blkcg_id; | ||
166 | /* Store cgroup path */ | ||
167 | char path[128]; | ||
168 | /* The device MKDEV(major, minor), this group has been created for */ | ||
169 | dev_t dev; | ||
170 | /* policy which owns this blk group */ | ||
171 | enum blkio_policy_id plid; | ||
172 | |||
173 | /* Need to serialize the stats in the case of reset/update */ | ||
174 | spinlock_t stats_lock; | ||
175 | struct blkio_group_stats stats; | ||
176 | /* Per cpu stats pointer */ | ||
177 | struct blkio_group_stats_cpu __percpu *stats_cpu; | ||
178 | }; | 66 | }; |
179 | 67 | ||
180 | struct blkio_policy_node { | 68 | /* |
181 | struct list_head node; | 69 | * A blkcg_gq (blkg) is an association between a block cgroup (blkcg) and a |
182 | dev_t dev; | 70 | * request_queue (q). This is used by blkcg policies which need to track |
183 | /* This node belongs to max bw policy or proportional weight policy */ | 71 | * information per blkcg - q pair. |
184 | enum blkio_policy_id plid; | 72 | * |
185 | /* cgroup file to which this rule belongs to */ | 73 | * There can be multiple active blkcg policies and each has its private |
186 | int fileid; | 74 | * data on each blkg, the size of which is determined by |
187 | 75 | * blkcg_policy->pd_size. blkcg core allocates and frees such areas | |
188 | union { | 76 | * together with blkg and invokes pd_init/exit_fn() methods. |
189 | unsigned int weight; | 77 | * |
190 | /* | 78 | * Such private data must embed struct blkg_policy_data (pd) at the |
191 | * Rate read/write in terms of bytes per second | 79 | * beginning and pd_size can't be smaller than pd. |
192 | * Whether this rate represents read or write is determined | 80 | */ |
193 | * by file type "fileid". | 81 | struct blkg_policy_data { |
194 | */ | 82 | /* the blkg this per-policy data belongs to */ |
195 | u64 bps; | 83 | struct blkcg_gq *blkg; |
196 | unsigned int iops; | 84 | |
197 | } val; | 85 | /* used during policy activation */ |
86 | struct list_head alloc_node; | ||
198 | }; | 87 | }; |
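The embedding rule above is what makes cheap conversions between a blkg, its blkg_policy_data and the policy-specific container possible. A small sketch, reusing the hypothetical struct example_pd and blkcg_policy_example from the earlier sketches:

static inline struct example_pd *pd_to_example(struct blkg_policy_data *pd)
{
        return pd ? container_of(pd, struct example_pd, pd) : NULL;
}

static inline struct example_pd *blkg_to_example(struct blkcg_gq *blkg)
{
        return pd_to_example(blkg_to_pd(blkg, &blkcg_policy_example));
}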
199 | 88 | ||
200 | extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, | 89 | /* association between a blk cgroup and a request queue */ |
201 | dev_t dev); | 90 | struct blkcg_gq { |
202 | extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, | 91 | /* Pointer to the associated request_queue */ |
203 | dev_t dev); | 92 | struct request_queue *q; |
204 | extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, | 93 | struct list_head q_node; |
205 | dev_t dev); | 94 | struct hlist_node blkcg_node; |
206 | extern unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, | 95 | struct blkcg *blkcg; |
207 | dev_t dev); | 96 | /* reference count */ |
208 | extern unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, | 97 | int refcnt; |
209 | dev_t dev); | 98 | |
210 | 99 | struct blkg_policy_data *pd[BLKCG_MAX_POLS]; | |
211 | typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); | 100 | |
212 | 101 | struct rcu_head rcu_head; | |
213 | typedef void (blkio_update_group_weight_fn) (void *key, | ||
214 | struct blkio_group *blkg, unsigned int weight); | ||
215 | typedef void (blkio_update_group_read_bps_fn) (void * key, | ||
216 | struct blkio_group *blkg, u64 read_bps); | ||
217 | typedef void (blkio_update_group_write_bps_fn) (void *key, | ||
218 | struct blkio_group *blkg, u64 write_bps); | ||
219 | typedef void (blkio_update_group_read_iops_fn) (void *key, | ||
220 | struct blkio_group *blkg, unsigned int read_iops); | ||
221 | typedef void (blkio_update_group_write_iops_fn) (void *key, | ||
222 | struct blkio_group *blkg, unsigned int write_iops); | ||
223 | |||
224 | struct blkio_policy_ops { | ||
225 | blkio_unlink_group_fn *blkio_unlink_group_fn; | ||
226 | blkio_update_group_weight_fn *blkio_update_group_weight_fn; | ||
227 | blkio_update_group_read_bps_fn *blkio_update_group_read_bps_fn; | ||
228 | blkio_update_group_write_bps_fn *blkio_update_group_write_bps_fn; | ||
229 | blkio_update_group_read_iops_fn *blkio_update_group_read_iops_fn; | ||
230 | blkio_update_group_write_iops_fn *blkio_update_group_write_iops_fn; | ||
231 | }; | 102 | }; |
232 | 103 | ||
233 | struct blkio_policy_type { | 104 | typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); |
234 | struct list_head list; | 105 | typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg); |
235 | struct blkio_policy_ops ops; | 106 | typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); |
236 | enum blkio_policy_id plid; | 107 | |
108 | struct blkcg_policy { | ||
109 | int plid; | ||
110 | /* policy specific private data size */ | ||
111 | size_t pd_size; | ||
112 | /* cgroup files for the policy */ | ||
113 | struct cftype *cftypes; | ||
114 | |||
115 | /* operations */ | ||
116 | blkcg_pol_init_pd_fn *pd_init_fn; | ||
117 | blkcg_pol_exit_pd_fn *pd_exit_fn; | ||
118 | blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; | ||
237 | }; | 119 | }; |
238 | 120 | ||
121 | extern struct blkcg blkcg_root; | ||
122 | |||
123 | struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup); | ||
124 | struct blkcg *bio_blkcg(struct bio *bio); | ||
125 | struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q); | ||
126 | struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, | ||
127 | struct request_queue *q); | ||
128 | int blkcg_init_queue(struct request_queue *q); | ||
129 | void blkcg_drain_queue(struct request_queue *q); | ||
130 | void blkcg_exit_queue(struct request_queue *q); | ||
131 | |||
239 | /* Blkio controller policy registration */ | 132 | /* Blkio controller policy registration */ |
240 | extern void blkio_policy_register(struct blkio_policy_type *); | 133 | int blkcg_policy_register(struct blkcg_policy *pol); |
241 | extern void blkio_policy_unregister(struct blkio_policy_type *); | 134 | void blkcg_policy_unregister(struct blkcg_policy *pol); |
135 | int blkcg_activate_policy(struct request_queue *q, | ||
136 | const struct blkcg_policy *pol); | ||
137 | void blkcg_deactivate_policy(struct request_queue *q, | ||
138 | const struct blkcg_policy *pol); | ||
139 | |||
140 | void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, | ||
141 | u64 (*prfill)(struct seq_file *, | ||
142 | struct blkg_policy_data *, int), | ||
143 | const struct blkcg_policy *pol, int data, | ||
144 | bool show_total); | ||
145 | u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); | ||
146 | u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, | ||
147 | const struct blkg_rwstat *rwstat); | ||
148 | u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off); | ||
149 | u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, | ||
150 | int off); | ||
151 | |||
152 | struct blkg_conf_ctx { | ||
153 | struct gendisk *disk; | ||
154 | struct blkcg_gq *blkg; | ||
155 | u64 v; | ||
156 | }; | ||
157 | |||
158 | int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, | ||
159 | const char *input, struct blkg_conf_ctx *ctx); | ||
160 | void blkg_conf_finish(struct blkg_conf_ctx *ctx); | ||
161 | |||
162 | |||
163 | /** | ||
164 | * blkg_to_pd - get policy private data | ||
165 | * @blkg: blkg of interest | ||
166 | * @pol: policy of interest | ||
167 | * | ||
168 | * Return pointer to private data associated with the @blkg-@pol pair. | ||
169 | */ | ||
170 | static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, | ||
171 | struct blkcg_policy *pol) | ||
172 | { | ||
173 | return blkg ? blkg->pd[pol->plid] : NULL; | ||
174 | } | ||
175 | |||
176 | /** | ||
177 | * pd_to_blkg - get blkg associated with policy private data | ||
178 | * @pd: policy private data of interest | ||
179 | * | ||
180 | * @pd is policy private data. Determine the blkg it's associated with. | ||
181 | */ | ||
182 | static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) | ||
183 | { | ||
184 | return pd ? pd->blkg : NULL; | ||
185 | } | ||
186 | |||
187 | /** | ||
188 | * blkg_path - format cgroup path of blkg | ||
189 | * @blkg: blkg of interest | ||
190 | * @buf: target buffer | ||
191 | * @buflen: target buffer length | ||
192 | * | ||
193 | * Format the path of the cgroup of @blkg into @buf. | ||
194 | */ | ||
195 | static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) | ||
196 | { | ||
197 | int ret; | ||
198 | |||
199 | rcu_read_lock(); | ||
200 | ret = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); | ||
201 | rcu_read_unlock(); | ||
202 | if (ret) | ||
203 | strncpy(buf, "<unavailable>", buflen); | ||
204 | return ret; | ||
205 | } | ||
242 | 206 | ||
243 | static inline char *blkg_path(struct blkio_group *blkg) | 207 | /** |
208 | * blkg_get - get a blkg reference | ||
209 | * @blkg: blkg to get | ||
210 | * | ||
211 | * The caller should be holding queue_lock and an existing reference. | ||
212 | */ | ||
213 | static inline void blkg_get(struct blkcg_gq *blkg) | ||
244 | { | 214 | { |
245 | return blkg->path; | 215 | lockdep_assert_held(blkg->q->queue_lock); |
216 | WARN_ON_ONCE(!blkg->refcnt); | ||
217 | blkg->refcnt++; | ||
246 | } | 218 | } |
247 | 219 | ||
248 | #else | 220 | void __blkg_release(struct blkcg_gq *blkg); |
249 | 221 | ||
250 | struct blkio_group { | 222 | /** |
223 | * blkg_put - put a blkg reference | ||
224 | * @blkg: blkg to put | ||
225 | * | ||
226 | * The caller should be holding queue_lock. | ||
227 | */ | ||
228 | static inline void blkg_put(struct blkcg_gq *blkg) | ||
229 | { | ||
230 | lockdep_assert_held(blkg->q->queue_lock); | ||
231 | WARN_ON_ONCE(blkg->refcnt <= 0); | ||
232 | if (!--blkg->refcnt) | ||
233 | __blkg_release(blkg); | ||
234 | } | ||
235 | |||
236 | /** | ||
237 | * blkg_stat_add - add a value to a blkg_stat | ||
238 | * @stat: target blkg_stat | ||
239 | * @val: value to add | ||
240 | * | ||
241 | * Add @val to @stat. The caller is responsible for synchronizing calls to | ||
242 | * this function. | ||
243 | */ | ||
244 | static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val) | ||
245 | { | ||
246 | u64_stats_update_begin(&stat->syncp); | ||
247 | stat->cnt += val; | ||
248 | u64_stats_update_end(&stat->syncp); | ||
249 | } | ||
250 | |||
251 | /** | ||
252 | * blkg_stat_read - read the current value of a blkg_stat | ||
253 | * @stat: blkg_stat to read | ||
254 | * | ||
255 | * Read the current value of @stat. This function can be called without | ||
256 | * synchronization and takes care of u64 atomicity. | ||
257 | */ | ||
258 | static inline uint64_t blkg_stat_read(struct blkg_stat *stat) | ||
259 | { | ||
260 | unsigned int start; | ||
261 | uint64_t v; | ||
262 | |||
263 | do { | ||
264 | start = u64_stats_fetch_begin(&stat->syncp); | ||
265 | v = stat->cnt; | ||
266 | } while (u64_stats_fetch_retry(&stat->syncp, start)); | ||
267 | |||
268 | return v; | ||
269 | } | ||
270 | |||
271 | /** | ||
272 | * blkg_stat_reset - reset a blkg_stat | ||
273 | * @stat: blkg_stat to reset | ||
274 | */ | ||
275 | static inline void blkg_stat_reset(struct blkg_stat *stat) | ||
276 | { | ||
277 | stat->cnt = 0; | ||
278 | } | ||
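A hedged sketch of how a policy might use the blkg_stat helpers above, assuming an additional hypothetical "struct blkg_stat time" field in the example_pd from the earlier sketches:

static void example_charge_time(struct example_pd *epd, uint64_t ns)
{
        /* the caller is expected to serialize updates, e.g. under queue_lock */
        blkg_stat_add(&epd->time, ns);
}

static uint64_t example_read_time(struct example_pd *epd)
{
        /* reads need no locking; u64 atomicity is handled internally */
        return blkg_stat_read(&epd->time);
}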
279 | |||
280 | /** | ||
281 | * blkg_rwstat_add - add a value to a blkg_rwstat | ||
282 | * @rwstat: target blkg_rwstat | ||
283 | * @rw: mask of REQ_{WRITE|SYNC} | ||
284 | * @val: value to add | ||
285 | * | ||
286 | * Add @val to @rwstat. The counters are chosen according to @rw. The | ||
287 | * caller is responsible for synchronizing calls to this function. | ||
288 | */ | ||
289 | static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, | ||
290 | int rw, uint64_t val) | ||
291 | { | ||
292 | u64_stats_update_begin(&rwstat->syncp); | ||
293 | |||
294 | if (rw & REQ_WRITE) | ||
295 | rwstat->cnt[BLKG_RWSTAT_WRITE] += val; | ||
296 | else | ||
297 | rwstat->cnt[BLKG_RWSTAT_READ] += val; | ||
298 | if (rw & REQ_SYNC) | ||
299 | rwstat->cnt[BLKG_RWSTAT_SYNC] += val; | ||
300 | else | ||
301 | rwstat->cnt[BLKG_RWSTAT_ASYNC] += val; | ||
302 | |||
303 | u64_stats_update_end(&rwstat->syncp); | ||
304 | } | ||
305 | |||
306 | /** | ||
307 | * blkg_rwstat_read - read the current values of a blkg_rwstat | ||
308 | * @rwstat: blkg_rwstat to read | ||
309 | * | ||
310 | * Read the current snapshot of @rwstat and return it as the return value. | ||
311 | * This function can be called without synchronization and takes care of | ||
312 | * u64 atomicity. | ||
313 | */ | ||
314 | static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat) | ||
315 | { | ||
316 | unsigned int start; | ||
317 | struct blkg_rwstat tmp; | ||
318 | |||
319 | do { | ||
320 | start = u64_stats_fetch_begin(&rwstat->syncp); | ||
321 | tmp = *rwstat; | ||
322 | } while (u64_stats_fetch_retry(&rwstat->syncp, start)); | ||
323 | |||
324 | return tmp; | ||
325 | } | ||
326 | |||
327 | /** | ||
328 | * blkg_rwstat_sum - read the total count of a blkg_rwstat | ||
329 | * @rwstat: blkg_rwstat to read | ||
330 | * | ||
331 | * Return the total count of @rwstat regardless of the IO direction. This | ||
332 | * function can be called without synchronization and takes care of u64 | ||
333 | * atomicity. | ||
334 | */ | ||
335 | static inline uint64_t blkg_rwstat_sum(struct blkg_rwstat *rwstat) | ||
336 | { | ||
337 | struct blkg_rwstat tmp = blkg_rwstat_read(rwstat); | ||
338 | |||
339 | return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; | ||
340 | } | ||
341 | |||
342 | /** | ||
343 | * blkg_rwstat_reset - reset a blkg_rwstat | ||
344 | * @rwstat: blkg_rwstat to reset | ||
345 | */ | ||
346 | static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) | ||
347 | { | ||
348 | memset(rwstat->cnt, 0, sizeof(rwstat->cnt)); | ||
349 | } | ||
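Similarly, a minimal sketch of feeding the hypothetical "serviced" blkg_rwstat from the IO path; the assumption here is a bio whose bi_rw carries the REQ_WRITE/REQ_SYNC bits that blkg_rwstat_add() keys on:

static void example_count_bio(struct example_pd *epd, struct bio *bio)
{
        /* bump one of Read/Write and one of Sync/Async; caller serializes */
        blkg_rwstat_add(&epd->serviced, bio->bi_rw, 1);
}

static uint64_t example_total_serviced(struct example_pd *epd)
{
        /* direction-agnostic total, safe to call without locking */
        return blkg_rwstat_sum(&epd->serviced);
}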
350 | |||
351 | #else /* CONFIG_BLK_CGROUP */ | ||
352 | |||
353 | struct cgroup; | ||
354 | |||
355 | struct blkg_policy_data { | ||
251 | }; | 356 | }; |
252 | 357 | ||
253 | struct blkio_policy_type { | 358 | struct blkcg_gq { |
254 | }; | 359 | }; |
255 | 360 | ||
256 | static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { } | 361 | struct blkcg_policy { |
257 | static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { } | 362 | }; |
258 | 363 | ||
259 | static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } | 364 | static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; } |
260 | 365 | static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } | |
261 | #endif | 366 | static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } |
262 | 367 | static inline int blkcg_init_queue(struct request_queue *q) { return 0; } | |
263 | #define BLKIO_WEIGHT_MIN 10 | 368 | static inline void blkcg_drain_queue(struct request_queue *q) { } |
264 | #define BLKIO_WEIGHT_MAX 1000 | 369 | static inline void blkcg_exit_queue(struct request_queue *q) { } |
265 | #define BLKIO_WEIGHT_DEFAULT 500 | 370 | static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } |
266 | 371 | static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } | |
267 | #ifdef CONFIG_DEBUG_BLK_CGROUP | 372 | static inline int blkcg_activate_policy(struct request_queue *q, |
268 | void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg); | 373 | const struct blkcg_policy *pol) { return 0; } |
269 | void blkiocg_update_dequeue_stats(struct blkio_group *blkg, | 374 | static inline void blkcg_deactivate_policy(struct request_queue *q, |
270 | unsigned long dequeue); | 375 | const struct blkcg_policy *pol) { } |
271 | void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg); | 376 | |
272 | void blkiocg_update_idle_time_stats(struct blkio_group *blkg); | 377 | static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, |
273 | void blkiocg_set_start_empty_time(struct blkio_group *blkg); | 378 | struct blkcg_policy *pol) { return NULL; } |
274 | 379 | static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } | |
275 | #define BLKG_FLAG_FNS(name) \ | 380 | static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } |
276 | static inline void blkio_mark_blkg_##name( \ | 381 | static inline void blkg_get(struct blkcg_gq *blkg) { } |
277 | struct blkio_group_stats *stats) \ | 382 | static inline void blkg_put(struct blkcg_gq *blkg) { } |
278 | { \ | 383 | |
279 | stats->flags |= (1 << BLKG_##name); \ | 384 | #endif /* CONFIG_BLK_CGROUP */ |
280 | } \ | 385 | #endif /* _BLK_CGROUP_H */ |
281 | static inline void blkio_clear_blkg_##name( \ | ||
282 | struct blkio_group_stats *stats) \ | ||
283 | { \ | ||
284 | stats->flags &= ~(1 << BLKG_##name); \ | ||
285 | } \ | ||
286 | static inline int blkio_blkg_##name(struct blkio_group_stats *stats) \ | ||
287 | { \ | ||
288 | return (stats->flags & (1 << BLKG_##name)) != 0; \ | ||
289 | } \ | ||
290 | |||
291 | BLKG_FLAG_FNS(waiting) | ||
292 | BLKG_FLAG_FNS(idling) | ||
293 | BLKG_FLAG_FNS(empty) | ||
294 | #undef BLKG_FLAG_FNS | ||
295 | #else | ||
296 | static inline void blkiocg_update_avg_queue_size_stats( | ||
297 | struct blkio_group *blkg) {} | ||
298 | static inline void blkiocg_update_dequeue_stats(struct blkio_group *blkg, | ||
299 | unsigned long dequeue) {} | ||
300 | static inline void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) | ||
301 | {} | ||
302 | static inline void blkiocg_update_idle_time_stats(struct blkio_group *blkg) {} | ||
303 | static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {} | ||
304 | #endif | ||
305 | |||
306 | #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) | ||
307 | extern struct blkio_cgroup blkio_root_cgroup; | ||
308 | extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); | ||
309 | extern struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk); | ||
310 | extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, | ||
311 | struct blkio_group *blkg, void *key, dev_t dev, | ||
312 | enum blkio_policy_id plid); | ||
313 | extern int blkio_alloc_blkg_stats(struct blkio_group *blkg); | ||
314 | extern int blkiocg_del_blkio_group(struct blkio_group *blkg); | ||
315 | extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, | ||
316 | void *key); | ||
317 | void blkiocg_update_timeslice_used(struct blkio_group *blkg, | ||
318 | unsigned long time, | ||
319 | unsigned long unaccounted_time); | ||
320 | void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes, | ||
321 | bool direction, bool sync); | ||
322 | void blkiocg_update_completion_stats(struct blkio_group *blkg, | ||
323 | uint64_t start_time, uint64_t io_start_time, bool direction, bool sync); | ||
324 | void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, | ||
325 | bool sync); | ||
326 | void blkiocg_update_io_add_stats(struct blkio_group *blkg, | ||
327 | struct blkio_group *curr_blkg, bool direction, bool sync); | ||
328 | void blkiocg_update_io_remove_stats(struct blkio_group *blkg, | ||
329 | bool direction, bool sync); | ||
330 | #else | ||
331 | struct cgroup; | ||
332 | static inline struct blkio_cgroup * | ||
333 | cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } | ||
334 | static inline struct blkio_cgroup * | ||
335 | task_blkio_cgroup(struct task_struct *tsk) { return NULL; } | ||
336 | |||
337 | static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, | ||
338 | struct blkio_group *blkg, void *key, dev_t dev, | ||
339 | enum blkio_policy_id plid) {} | ||
340 | |||
341 | static inline int blkio_alloc_blkg_stats(struct blkio_group *blkg) { return 0; } | ||
342 | |||
343 | static inline int | ||
344 | blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } | ||
345 | |||
346 | static inline struct blkio_group * | ||
347 | blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } | ||
348 | static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg, | ||
349 | unsigned long time, | ||
350 | unsigned long unaccounted_time) | ||
351 | {} | ||
352 | static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg, | ||
353 | uint64_t bytes, bool direction, bool sync) {} | ||
354 | static inline void blkiocg_update_completion_stats(struct blkio_group *blkg, | ||
355 | uint64_t start_time, uint64_t io_start_time, bool direction, | ||
356 | bool sync) {} | ||
357 | static inline void blkiocg_update_io_merged_stats(struct blkio_group *blkg, | ||
358 | bool direction, bool sync) {} | ||
359 | static inline void blkiocg_update_io_add_stats(struct blkio_group *blkg, | ||
360 | struct blkio_group *curr_blkg, bool direction, bool sync) {} | ||
361 | static inline void blkiocg_update_io_remove_stats(struct blkio_group *blkg, | ||
362 | bool direction, bool sync) {} | ||
363 | #endif | ||
364 | #endif /* _BLK_CGROUP_H */ | ||
diff --git a/block/blk-core.c b/block/blk-core.c index 1f61b74867e4..3c923a7aeb56 100644 --- a/block/blk-core.c +++ b/block/blk-core.c | |||
@@ -29,11 +29,13 @@ | |||
29 | #include <linux/fault-inject.h> | 29 | #include <linux/fault-inject.h> |
30 | #include <linux/list_sort.h> | 30 | #include <linux/list_sort.h> |
31 | #include <linux/delay.h> | 31 | #include <linux/delay.h> |
32 | #include <linux/ratelimit.h> | ||
32 | 33 | ||
33 | #define CREATE_TRACE_POINTS | 34 | #define CREATE_TRACE_POINTS |
34 | #include <trace/events/block.h> | 35 | #include <trace/events/block.h> |
35 | 36 | ||
36 | #include "blk.h" | 37 | #include "blk.h" |
38 | #include "blk-cgroup.h" | ||
37 | 39 | ||
38 | EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); | 40 | EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); |
39 | EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); | 41 | EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); |
@@ -280,7 +282,7 @@ EXPORT_SYMBOL(blk_stop_queue); | |||
280 | * | 282 | * |
281 | * This function does not cancel any asynchronous activity arising | 283 | * This function does not cancel any asynchronous activity arising |
282 | * out of elevator or throttling code. That would require elevator_exit() | 284 | * out of elevator or throttling code. That would require elevator_exit() |
283 | * and blk_throtl_exit() to be called with queue lock initialized. | 285 | * and blkcg_exit_queue() to be called with queue lock initialized. |
284 | * | 286 | * |
285 | */ | 287 | */ |
286 | void blk_sync_queue(struct request_queue *q) | 288 | void blk_sync_queue(struct request_queue *q) |
@@ -365,17 +367,23 @@ void blk_drain_queue(struct request_queue *q, bool drain_all) | |||
365 | 367 | ||
366 | spin_lock_irq(q->queue_lock); | 368 | spin_lock_irq(q->queue_lock); |
367 | 369 | ||
368 | elv_drain_elevator(q); | 370 | /* |
369 | if (drain_all) | 371 | * The caller might be trying to drain @q before its |
370 | blk_throtl_drain(q); | 372 | * elevator is initialized. |
373 | */ | ||
374 | if (q->elevator) | ||
375 | elv_drain_elevator(q); | ||
376 | |||
377 | blkcg_drain_queue(q); | ||
371 | 378 | ||
372 | /* | 379 | /* |
373 | * This function might be called on a queue which failed | 380 | * This function might be called on a queue which failed |
374 | * driver init after queue creation. Some drivers | 381 | * driver init after queue creation or is not yet fully |
375 | * (e.g. fd) get unhappy in such cases. Kick queue iff | 382 | * active. Some drivers (e.g. fd and loop) get unhappy |
376 | * dispatch queue has something on it. | 383 | * in such cases. Kick queue iff dispatch queue has |
384 | * something on it and @q has request_fn set. | ||
377 | */ | 385 | */ |
378 | if (!list_empty(&q->queue_head)) | 386 | if (!list_empty(&q->queue_head) && q->request_fn) |
379 | __blk_run_queue(q); | 387 | __blk_run_queue(q); |
380 | 388 | ||
381 | drain |= q->rq.elvpriv; | 389 | drain |= q->rq.elvpriv; |
@@ -403,6 +411,49 @@ void blk_drain_queue(struct request_queue *q, bool drain_all) | |||
403 | } | 411 | } |
404 | 412 | ||
405 | /** | 413 | /** |
414 | * blk_queue_bypass_start - enter queue bypass mode | ||
415 | * @q: queue of interest | ||
416 | * | ||
417 | * In bypass mode, only the dispatch FIFO queue of @q is used. This | ||
418 | * function makes @q enter bypass mode and drains all requests which were | ||
419 | * throttled or issued before. On return, it's guaranteed that no request | ||
420 | * is being throttled or has ELVPRIV set and blk_queue_bypass() returns %true | ||
421 | * inside queue or RCU read lock. | ||
422 | */ | ||
423 | void blk_queue_bypass_start(struct request_queue *q) | ||
424 | { | ||
425 | bool drain; | ||
426 | |||
427 | spin_lock_irq(q->queue_lock); | ||
428 | drain = !q->bypass_depth++; | ||
429 | queue_flag_set(QUEUE_FLAG_BYPASS, q); | ||
430 | spin_unlock_irq(q->queue_lock); | ||
431 | |||
432 | if (drain) { | ||
433 | blk_drain_queue(q, false); | ||
434 | /* ensure blk_queue_bypass() is %true inside RCU read lock */ | ||
435 | synchronize_rcu(); | ||
436 | } | ||
437 | } | ||
438 | EXPORT_SYMBOL_GPL(blk_queue_bypass_start); | ||
439 | |||
440 | /** | ||
441 | * blk_queue_bypass_end - leave queue bypass mode | ||
442 | * @q: queue of interest | ||
443 | * | ||
444 | * Leave bypass mode and restore the normal queueing behavior. | ||
445 | */ | ||
446 | void blk_queue_bypass_end(struct request_queue *q) | ||
447 | { | ||
448 | spin_lock_irq(q->queue_lock); | ||
449 | if (!--q->bypass_depth) | ||
450 | queue_flag_clear(QUEUE_FLAG_BYPASS, q); | ||
451 | WARN_ON_ONCE(q->bypass_depth < 0); | ||
452 | spin_unlock_irq(q->queue_lock); | ||
453 | } | ||
454 | EXPORT_SYMBOL_GPL(blk_queue_bypass_end); | ||
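The start/end pair nests: only the outermost blk_queue_bypass_start() drains, and only the matching last blk_queue_bypass_end() clears the flag. A minimal user-space sketch of that bypass_depth counting, with the locking, the actual drain and synchronize_rcu() left out and purely illustrative names:

    #include <stdbool.h>
    #include <stdio.h>

    struct queue_model {
            int  bypass_depth;
            bool bypass;                       /* models QUEUE_FLAG_BYPASS */
    };

    static void bypass_start(struct queue_model *q)
    {
            bool drain = !q->bypass_depth++;   /* only the first caller drains */

            q->bypass = true;
            if (drain)
                    printf("draining once, on the outermost start\n");
    }

    static void bypass_end(struct queue_model *q)
    {
            if (!--q->bypass_depth)            /* only the last caller clears */
                    q->bypass = false;
    }

    int main(void)
    {
            struct queue_model q = { 0, false };

            bypass_start(&q);                  /* drains */
            bypass_start(&q);                  /* nested: no second drain */
            bypass_end(&q);
            printf("bypassing after one end: %d\n", q.bypass);   /* 1 */
            bypass_end(&q);
            printf("bypassing after both ends: %d\n", q.bypass); /* 0 */
            return 0;
    }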
455 | |||
456 | /** | ||
406 | * blk_cleanup_queue - shutdown a request queue | 457 | * blk_cleanup_queue - shutdown a request queue |
407 | * @q: request queue to shutdown | 458 | * @q: request queue to shutdown |
408 | * | 459 | * |
@@ -418,6 +469,19 @@ void blk_cleanup_queue(struct request_queue *q) | |||
418 | queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q); | 469 | queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q); |
419 | 470 | ||
420 | spin_lock_irq(lock); | 471 | spin_lock_irq(lock); |
472 | |||
473 | /* | ||
474 | * Dead queue is permanently in bypass mode till released. Note | ||
475 | * that, unlike blk_queue_bypass_start(), we aren't performing | ||
476 | * synchronize_rcu() after entering bypass mode to avoid the delay | ||
477 | * as some drivers create and destroy a lot of queues while | ||
478 | * probing. This is still safe because blk_release_queue() will be | ||
479 | * called only after the queue refcnt drops to zero and nothing, | ||
480 | * RCU or not, would be traversing the queue by then. | ||
481 | */ | ||
482 | q->bypass_depth++; | ||
483 | queue_flag_set(QUEUE_FLAG_BYPASS, q); | ||
484 | |||
421 | queue_flag_set(QUEUE_FLAG_NOMERGES, q); | 485 | queue_flag_set(QUEUE_FLAG_NOMERGES, q); |
422 | queue_flag_set(QUEUE_FLAG_NOXMERGES, q); | 486 | queue_flag_set(QUEUE_FLAG_NOXMERGES, q); |
423 | queue_flag_set(QUEUE_FLAG_DEAD, q); | 487 | queue_flag_set(QUEUE_FLAG_DEAD, q); |
@@ -428,13 +492,8 @@ void blk_cleanup_queue(struct request_queue *q) | |||
428 | spin_unlock_irq(lock); | 492 | spin_unlock_irq(lock); |
429 | mutex_unlock(&q->sysfs_lock); | 493 | mutex_unlock(&q->sysfs_lock); |
430 | 494 | ||
431 | /* | 495 | /* drain all requests queued before DEAD marking */ |
432 | * Drain all requests queued before DEAD marking. The caller might | 496 | blk_drain_queue(q, true); |
433 | * be trying to tear down @q before its elevator is initialized, in | ||
434 | * which case we don't want to call into draining. | ||
435 | */ | ||
436 | if (q->elevator) | ||
437 | blk_drain_queue(q, true); | ||
438 | 497 | ||
439 | /* @q won't process any more request, flush async actions */ | 498 | /* @q won't process any more request, flush async actions */ |
440 | del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); | 499 | del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); |
@@ -498,14 +557,15 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) | |||
498 | if (err) | 557 | if (err) |
499 | goto fail_id; | 558 | goto fail_id; |
500 | 559 | ||
501 | if (blk_throtl_init(q)) | ||
502 | goto fail_id; | ||
503 | |||
504 | setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, | 560 | setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, |
505 | laptop_mode_timer_fn, (unsigned long) q); | 561 | laptop_mode_timer_fn, (unsigned long) q); |
506 | setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); | 562 | setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); |
563 | INIT_LIST_HEAD(&q->queue_head); | ||
507 | INIT_LIST_HEAD(&q->timeout_list); | 564 | INIT_LIST_HEAD(&q->timeout_list); |
508 | INIT_LIST_HEAD(&q->icq_list); | 565 | INIT_LIST_HEAD(&q->icq_list); |
566 | #ifdef CONFIG_BLK_CGROUP | ||
567 | INIT_LIST_HEAD(&q->blkg_list); | ||
568 | #endif | ||
509 | INIT_LIST_HEAD(&q->flush_queue[0]); | 569 | INIT_LIST_HEAD(&q->flush_queue[0]); |
510 | INIT_LIST_HEAD(&q->flush_queue[1]); | 570 | INIT_LIST_HEAD(&q->flush_queue[1]); |
511 | INIT_LIST_HEAD(&q->flush_data_in_flight); | 571 | INIT_LIST_HEAD(&q->flush_data_in_flight); |
@@ -522,6 +582,18 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) | |||
522 | */ | 582 | */ |
523 | q->queue_lock = &q->__queue_lock; | 583 | q->queue_lock = &q->__queue_lock; |
524 | 584 | ||
585 | /* | ||
586 | * A queue starts its life with bypass turned on to avoid | ||
587 | * unnecessary bypass on/off overhead and nasty surprises during | ||
588 | * init. The initial bypass will be finished at the end of | ||
589 | * blk_init_allocated_queue(). | ||
590 | */ | ||
591 | q->bypass_depth = 1; | ||
592 | __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags); | ||
593 | |||
594 | if (blkcg_init_queue(q)) | ||
595 | goto fail_id; | ||
596 | |||
525 | return q; | 597 | return q; |
526 | 598 | ||
527 | fail_id: | 599 | fail_id: |
@@ -614,15 +686,15 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, | |||
614 | 686 | ||
615 | q->sg_reserved_size = INT_MAX; | 687 | q->sg_reserved_size = INT_MAX; |
616 | 688 | ||
617 | /* | 689 | /* init elevator */ |
618 | * all done | 690 | if (elevator_init(q, NULL)) |
619 | */ | 691 | return NULL; |
620 | if (!elevator_init(q, NULL)) { | ||
621 | blk_queue_congestion_threshold(q); | ||
622 | return q; | ||
623 | } | ||
624 | 692 | ||
625 | return NULL; | 693 | blk_queue_congestion_threshold(q); |
694 | |||
695 | /* all done, end the initial bypass */ | ||
696 | blk_queue_bypass_end(q); | ||
697 | return q; | ||
626 | } | 698 | } |
627 | EXPORT_SYMBOL(blk_init_allocated_queue); | 699 | EXPORT_SYMBOL(blk_init_allocated_queue); |
628 | 700 | ||
@@ -648,33 +720,6 @@ static inline void blk_free_request(struct request_queue *q, struct request *rq) | |||
648 | mempool_free(rq, q->rq.rq_pool); | 720 | mempool_free(rq, q->rq.rq_pool); |
649 | } | 721 | } |
650 | 722 | ||
651 | static struct request * | ||
652 | blk_alloc_request(struct request_queue *q, struct io_cq *icq, | ||
653 | unsigned int flags, gfp_t gfp_mask) | ||
654 | { | ||
655 | struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); | ||
656 | |||
657 | if (!rq) | ||
658 | return NULL; | ||
659 | |||
660 | blk_rq_init(q, rq); | ||
661 | |||
662 | rq->cmd_flags = flags | REQ_ALLOCED; | ||
663 | |||
664 | if (flags & REQ_ELVPRIV) { | ||
665 | rq->elv.icq = icq; | ||
666 | if (unlikely(elv_set_request(q, rq, gfp_mask))) { | ||
667 | mempool_free(rq, q->rq.rq_pool); | ||
668 | return NULL; | ||
669 | } | ||
670 | /* @rq->elv.icq holds on to io_context until @rq is freed */ | ||
671 | if (icq) | ||
672 | get_io_context(icq->ioc); | ||
673 | } | ||
674 | |||
675 | return rq; | ||
676 | } | ||
677 | |||
678 | /* | 723 | /* |
679 | * ioc_batching returns true if the ioc is a valid batching request and | 724 | * ioc_batching returns true if the ioc is a valid batching request and |
680 | * should be given priority access to a request. | 725 | * should be given priority access to a request. |
@@ -763,6 +808,22 @@ static bool blk_rq_should_init_elevator(struct bio *bio) | |||
763 | } | 808 | } |
764 | 809 | ||
765 | /** | 810 | /** |
811 | * rq_ioc - determine io_context for request allocation | ||
812 | * @bio: request being allocated is for this bio (can be %NULL) | ||
813 | * | ||
814 | * Determine io_context to use for request allocation for @bio. May return | ||
815 | * %NULL if %current->io_context doesn't exist. | ||
816 | */ | ||
817 | static struct io_context *rq_ioc(struct bio *bio) | ||
818 | { | ||
819 | #ifdef CONFIG_BLK_CGROUP | ||
820 | if (bio && bio->bi_ioc) | ||
821 | return bio->bi_ioc; | ||
822 | #endif | ||
823 | return current->io_context; | ||
824 | } | ||
825 | |||
826 | /** | ||
766 | * get_request - get a free request | 827 | * get_request - get a free request |
767 | * @q: request_queue to allocate request from | 828 | * @q: request_queue to allocate request from |
768 | * @rw_flags: RW and SYNC flags | 829 | * @rw_flags: RW and SYNC flags |
@@ -779,7 +840,7 @@ static bool blk_rq_should_init_elevator(struct bio *bio) | |||
779 | static struct request *get_request(struct request_queue *q, int rw_flags, | 840 | static struct request *get_request(struct request_queue *q, int rw_flags, |
780 | struct bio *bio, gfp_t gfp_mask) | 841 | struct bio *bio, gfp_t gfp_mask) |
781 | { | 842 | { |
782 | struct request *rq = NULL; | 843 | struct request *rq; |
783 | struct request_list *rl = &q->rq; | 844 | struct request_list *rl = &q->rq; |
784 | struct elevator_type *et; | 845 | struct elevator_type *et; |
785 | struct io_context *ioc; | 846 | struct io_context *ioc; |
@@ -789,7 +850,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags, | |||
789 | int may_queue; | 850 | int may_queue; |
790 | retry: | 851 | retry: |
791 | et = q->elevator->type; | 852 | et = q->elevator->type; |
792 | ioc = current->io_context; | 853 | ioc = rq_ioc(bio); |
793 | 854 | ||
794 | if (unlikely(blk_queue_dead(q))) | 855 | if (unlikely(blk_queue_dead(q))) |
795 | return NULL; | 856 | return NULL; |
@@ -808,7 +869,7 @@ retry: | |||
808 | */ | 869 | */ |
809 | if (!ioc && !retried) { | 870 | if (!ioc && !retried) { |
810 | spin_unlock_irq(q->queue_lock); | 871 | spin_unlock_irq(q->queue_lock); |
811 | create_io_context(current, gfp_mask, q->node); | 872 | create_io_context(gfp_mask, q->node); |
812 | spin_lock_irq(q->queue_lock); | 873 | spin_lock_irq(q->queue_lock); |
813 | retried = true; | 874 | retried = true; |
814 | goto retry; | 875 | goto retry; |
@@ -831,7 +892,7 @@ retry: | |||
831 | * process is not a "batcher", and not | 892 | * process is not a "batcher", and not |
832 | * exempted by the IO scheduler | 893 | * exempted by the IO scheduler |
833 | */ | 894 | */ |
834 | goto out; | 895 | return NULL; |
835 | } | 896 | } |
836 | } | 897 | } |
837 | } | 898 | } |
@@ -844,7 +905,7 @@ retry: | |||
844 | * allocated with any setting of ->nr_requests | 905 | * allocated with any setting of ->nr_requests |
845 | */ | 906 | */ |
846 | if (rl->count[is_sync] >= (3 * q->nr_requests / 2)) | 907 | if (rl->count[is_sync] >= (3 * q->nr_requests / 2)) |
847 | goto out; | 908 | return NULL; |
848 | 909 | ||
849 | rl->count[is_sync]++; | 910 | rl->count[is_sync]++; |
850 | rl->starved[is_sync] = 0; | 911 | rl->starved[is_sync] = 0; |
@@ -859,8 +920,7 @@ retry: | |||
859 | * Also, lookup icq while holding queue_lock. If it doesn't exist, | 920 | * Also, lookup icq while holding queue_lock. If it doesn't exist, |
860 | * it will be created after releasing queue_lock. | 921 | * it will be created after releasing queue_lock. |
861 | */ | 922 | */ |
862 | if (blk_rq_should_init_elevator(bio) && | 923 | if (blk_rq_should_init_elevator(bio) && !blk_queue_bypass(q)) { |
863 | !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags)) { | ||
864 | rw_flags |= REQ_ELVPRIV; | 924 | rw_flags |= REQ_ELVPRIV; |
865 | rl->elvpriv++; | 925 | rl->elvpriv++; |
866 | if (et->icq_cache && ioc) | 926 | if (et->icq_cache && ioc) |
@@ -871,41 +931,36 @@ retry: | |||
871 | rw_flags |= REQ_IO_STAT; | 931 | rw_flags |= REQ_IO_STAT; |
872 | spin_unlock_irq(q->queue_lock); | 932 | spin_unlock_irq(q->queue_lock); |
873 | 933 | ||
874 | /* create icq if missing */ | 934 | /* allocate and init request */ |
875 | if ((rw_flags & REQ_ELVPRIV) && unlikely(et->icq_cache && !icq)) { | 935 | rq = mempool_alloc(q->rq.rq_pool, gfp_mask); |
876 | icq = ioc_create_icq(q, gfp_mask); | 936 | if (!rq) |
877 | if (!icq) | 937 | goto fail_alloc; |
878 | goto fail_icq; | ||
879 | } | ||
880 | |||
881 | rq = blk_alloc_request(q, icq, rw_flags, gfp_mask); | ||
882 | 938 | ||
883 | fail_icq: | 939 | blk_rq_init(q, rq); |
884 | if (unlikely(!rq)) { | 940 | rq->cmd_flags = rw_flags | REQ_ALLOCED; |
885 | /* | 941 | |
886 | * Allocation failed presumably due to memory. Undo anything | 942 | /* init elvpriv */ |
887 | * we might have messed up. | 943 | if (rw_flags & REQ_ELVPRIV) { |
888 | * | 944 | if (unlikely(et->icq_cache && !icq)) { |
889 | * Allocating task should really be put onto the front of the | 945 | create_io_context(gfp_mask, q->node); |
890 | * wait queue, but this is pretty rare. | 946 | ioc = rq_ioc(bio); |
891 | */ | 947 | if (!ioc) |
892 | spin_lock_irq(q->queue_lock); | 948 | goto fail_elvpriv; |
893 | freed_request(q, rw_flags); | 949 | |
950 | icq = ioc_create_icq(ioc, q, gfp_mask); | ||
951 | if (!icq) | ||
952 | goto fail_elvpriv; | ||
953 | } | ||
894 | 954 | ||
895 | /* | 955 | rq->elv.icq = icq; |
896 | * in the very unlikely event that allocation failed and no | 956 | if (unlikely(elv_set_request(q, rq, bio, gfp_mask))) |
897 | * requests for this direction was pending, mark us starved | 957 | goto fail_elvpriv; |
898 | * so that freeing of a request in the other direction will | ||
899 | * notice us. another possible fix would be to split the | ||
900 | * rq mempool into READ and WRITE | ||
901 | */ | ||
902 | rq_starved: | ||
903 | if (unlikely(rl->count[is_sync] == 0)) | ||
904 | rl->starved[is_sync] = 1; | ||
905 | 958 | ||
906 | goto out; | 959 | /* @rq->elv.icq holds io_context until @rq is freed */ |
960 | if (icq) | ||
961 | get_io_context(icq->ioc); | ||
907 | } | 962 | } |
908 | 963 | out: | |
909 | /* | 964 | /* |
910 | * ioc may be NULL here, and ioc_batching will be false. That's | 965 | * ioc may be NULL here, and ioc_batching will be false. That's |
911 | * OK, if the queue is under the request limit then requests need | 966 | * OK, if the queue is under the request limit then requests need |
@@ -916,8 +971,48 @@ rq_starved: | |||
916 | ioc->nr_batch_requests--; | 971 | ioc->nr_batch_requests--; |
917 | 972 | ||
918 | trace_block_getrq(q, bio, rw_flags & 1); | 973 | trace_block_getrq(q, bio, rw_flags & 1); |
919 | out: | ||
920 | return rq; | 974 | return rq; |
975 | |||
976 | fail_elvpriv: | ||
977 | /* | ||
978 | * elvpriv init failed. ioc, icq and elvpriv aren't mempool backed | ||
979 | * and may fail indefinitely under memory pressure and thus | ||
980 | * shouldn't stall IO. Treat this request as !elvpriv. This will | ||
981 | * disturb iosched and blkcg but weird is better than dead. | ||
982 | */ | ||
983 | printk_ratelimited(KERN_WARNING "%s: request aux data allocation failed, iosched may be disturbed\n", | ||
984 | dev_name(q->backing_dev_info.dev)); | ||
985 | |||
986 | rq->cmd_flags &= ~REQ_ELVPRIV; | ||
987 | rq->elv.icq = NULL; | ||
988 | |||
989 | spin_lock_irq(q->queue_lock); | ||
990 | rl->elvpriv--; | ||
991 | spin_unlock_irq(q->queue_lock); | ||
992 | goto out; | ||
993 | |||
994 | fail_alloc: | ||
995 | /* | ||
996 | * Allocation failed presumably due to memory. Undo anything we | ||
997 | * might have messed up. | ||
998 | * | ||
999 | * Allocating task should really be put onto the front of the wait | ||
1000 | * queue, but this is pretty rare. | ||
1001 | */ | ||
1002 | spin_lock_irq(q->queue_lock); | ||
1003 | freed_request(q, rw_flags); | ||
1004 | |||
1005 | /* | ||
1006 | * in the very unlikely event that allocation failed and no | ||
1007 | * requests for this direction were pending, mark us starved so that | ||
1008 | * freeing of a request in the other direction will notice | ||
1009 | * us. another possible fix would be to split the rq mempool into | ||
1010 | * READ and WRITE | ||
1011 | */ | ||
1012 | rq_starved: | ||
1013 | if (unlikely(rl->count[is_sync] == 0)) | ||
1014 | rl->starved[is_sync] = 1; | ||
1015 | return NULL; | ||
921 | } | 1016 | } |
922 | 1017 | ||
923 | /** | 1018 | /** |
@@ -961,7 +1056,7 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags, | |||
961 | * up to a big batch of them for a small period time. | 1056 | * up to a big batch of them for a small period time. |
962 | * See ioc_batching, ioc_set_batching | 1057 | * See ioc_batching, ioc_set_batching |
963 | */ | 1058 | */ |
964 | create_io_context(current, GFP_NOIO, q->node); | 1059 | create_io_context(GFP_NOIO, q->node); |
965 | ioc_set_batching(q, current->io_context); | 1060 | ioc_set_batching(q, current->io_context); |
966 | 1061 | ||
967 | spin_lock_irq(q->queue_lock); | 1062 | spin_lock_irq(q->queue_lock); |
diff --git a/block/blk-ioc.c b/block/blk-ioc.c index fb95dd2f889a..1e2d53b04858 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c | |||
@@ -155,20 +155,20 @@ void put_io_context(struct io_context *ioc) | |||
155 | } | 155 | } |
156 | EXPORT_SYMBOL(put_io_context); | 156 | EXPORT_SYMBOL(put_io_context); |
157 | 157 | ||
158 | /* Called by the exiting task */ | 158 | /** |
159 | void exit_io_context(struct task_struct *task) | 159 | * put_io_context_active - put active reference on ioc |
160 | * @ioc: ioc of interest | ||
161 | * | ||
162 | * Undo get_io_context_active(). If active reference reaches zero after | ||
163 | * put, @ioc can never issue further IOs and ioscheds are notified. | ||
164 | */ | ||
165 | void put_io_context_active(struct io_context *ioc) | ||
160 | { | 166 | { |
161 | struct io_context *ioc; | ||
162 | struct io_cq *icq; | ||
163 | struct hlist_node *n; | 167 | struct hlist_node *n; |
164 | unsigned long flags; | 168 | unsigned long flags; |
169 | struct io_cq *icq; | ||
165 | 170 | ||
166 | task_lock(task); | 171 | if (!atomic_dec_and_test(&ioc->active_ref)) { |
167 | ioc = task->io_context; | ||
168 | task->io_context = NULL; | ||
169 | task_unlock(task); | ||
170 | |||
171 | if (!atomic_dec_and_test(&ioc->nr_tasks)) { | ||
172 | put_io_context(ioc); | 172 | put_io_context(ioc); |
173 | return; | 173 | return; |
174 | } | 174 | } |
@@ -197,6 +197,20 @@ retry: | |||
197 | put_io_context(ioc); | 197 | put_io_context(ioc); |
198 | } | 198 | } |
199 | 199 | ||
200 | /* Called by the exiting task */ | ||
201 | void exit_io_context(struct task_struct *task) | ||
202 | { | ||
203 | struct io_context *ioc; | ||
204 | |||
205 | task_lock(task); | ||
206 | ioc = task->io_context; | ||
207 | task->io_context = NULL; | ||
208 | task_unlock(task); | ||
209 | |||
210 | atomic_dec(&ioc->nr_tasks); | ||
211 | put_io_context_active(ioc); | ||
212 | } | ||
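The refactoring above gives an io_context two counters: refcount pins the memory while active_ref counts users that may still issue IO, and ioscheds get their exit notification when active_ref reaches zero. A toy user-space model of that two-stage teardown, with locking and the icq walk omitted and all names purely illustrative:

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct ioc_model {
            atomic_long refcount;     /* pins the memory */
            atomic_int  active_ref;   /* holders that may still issue IO */
    };

    /* analogue of put_io_context(): free on the last plain reference */
    static void ioc_put(struct ioc_model *ioc)
    {
            if (atomic_fetch_sub(&ioc->refcount, 1) == 1) {
                    printf("last reference gone: free io_context\n");
                    free(ioc);
            }
    }

    /* analogue of put_io_context_active(): notify once the last active
     * user is gone, then drop the reference that user held */
    static void ioc_put_active(struct ioc_model *ioc)
    {
            if (atomic_fetch_sub(&ioc->active_ref, 1) == 1)
                    printf("active_ref hit zero: exit icqs, no more IO\n");
            ioc_put(ioc);
    }

    int main(void)
    {
            struct ioc_model *ioc = malloc(sizeof(*ioc));

            if (!ioc)
                    return 1;
            atomic_init(&ioc->refcount, 2);   /* task + one in-flight request */
            atomic_init(&ioc->active_ref, 1); /* only the task issues IO */

            ioc_put_active(ioc);   /* the task exits, as in exit_io_context() */
            ioc_put(ioc);          /* the request completes: memory released */
            return 0;
    }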
213 | |||
200 | /** | 214 | /** |
201 | * ioc_clear_queue - break any ioc association with the specified queue | 215 | * ioc_clear_queue - break any ioc association with the specified queue |
202 | * @q: request_queue being cleared | 216 | * @q: request_queue being cleared |
@@ -218,19 +232,18 @@ void ioc_clear_queue(struct request_queue *q) | |||
218 | } | 232 | } |
219 | } | 233 | } |
220 | 234 | ||
221 | void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_flags, | 235 | int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node) |
222 | int node) | ||
223 | { | 236 | { |
224 | struct io_context *ioc; | 237 | struct io_context *ioc; |
225 | 238 | ||
226 | ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO, | 239 | ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO, |
227 | node); | 240 | node); |
228 | if (unlikely(!ioc)) | 241 | if (unlikely(!ioc)) |
229 | return; | 242 | return -ENOMEM; |
230 | 243 | ||
231 | /* initialize */ | 244 | /* initialize */ |
232 | atomic_long_set(&ioc->refcount, 1); | 245 | atomic_long_set(&ioc->refcount, 1); |
233 | atomic_set(&ioc->nr_tasks, 1); | 246 | atomic_set(&ioc->active_ref, 1); |
234 | spin_lock_init(&ioc->lock); | 247 | spin_lock_init(&ioc->lock); |
235 | INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH); | 248 | INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH); |
236 | INIT_HLIST_HEAD(&ioc->icq_list); | 249 | INIT_HLIST_HEAD(&ioc->icq_list); |
@@ -250,6 +263,8 @@ void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_flags, | |||
250 | else | 263 | else |
251 | kmem_cache_free(iocontext_cachep, ioc); | 264 | kmem_cache_free(iocontext_cachep, ioc); |
252 | task_unlock(task); | 265 | task_unlock(task); |
266 | |||
267 | return 0; | ||
253 | } | 268 | } |
254 | 269 | ||
255 | /** | 270 | /** |
@@ -281,7 +296,7 @@ struct io_context *get_task_io_context(struct task_struct *task, | |||
281 | return ioc; | 296 | return ioc; |
282 | } | 297 | } |
283 | task_unlock(task); | 298 | task_unlock(task); |
284 | } while (create_io_context(task, gfp_flags, node)); | 299 | } while (!create_task_io_context(task, gfp_flags, node)); |
285 | 300 | ||
286 | return NULL; | 301 | return NULL; |
287 | } | 302 | } |
@@ -325,26 +340,23 @@ EXPORT_SYMBOL(ioc_lookup_icq); | |||
325 | 340 | ||
326 | /** | 341 | /** |
327 | * ioc_create_icq - create and link io_cq | 342 | * ioc_create_icq - create and link io_cq |
343 | * @ioc: io_context of interest | ||
328 | * @q: request_queue of interest | 344 | * @q: request_queue of interest |
329 | * @gfp_mask: allocation mask | 345 | * @gfp_mask: allocation mask |
330 | * | 346 | * |
331 | * Make sure io_cq linking %current->io_context and @q exists. If either | 347 | * Make sure io_cq linking @ioc and @q exists. If icq doesn't exist, it |
332 | * io_context and/or icq don't exist, they will be created using @gfp_mask. | 348 | * will be created using @gfp_mask. |
333 | * | 349 | * |
334 | * The caller is responsible for ensuring @ioc won't go away and @q is | 350 | * The caller is responsible for ensuring @ioc won't go away and @q is |
335 | * alive and will stay alive until this function returns. | 351 | * alive and will stay alive until this function returns. |
336 | */ | 352 | */ |
337 | struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask) | 353 | struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, |
354 | gfp_t gfp_mask) | ||
338 | { | 355 | { |
339 | struct elevator_type *et = q->elevator->type; | 356 | struct elevator_type *et = q->elevator->type; |
340 | struct io_context *ioc; | ||
341 | struct io_cq *icq; | 357 | struct io_cq *icq; |
342 | 358 | ||
343 | /* allocate stuff */ | 359 | /* allocate stuff */ |
344 | ioc = create_io_context(current, gfp_mask, q->node); | ||
345 | if (!ioc) | ||
346 | return NULL; | ||
347 | |||
348 | icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO, | 360 | icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO, |
349 | q->node); | 361 | q->node); |
350 | if (!icq) | 362 | if (!icq) |
@@ -382,74 +394,6 @@ struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask) | |||
382 | return icq; | 394 | return icq; |
383 | } | 395 | } |
384 | 396 | ||
385 | void ioc_set_icq_flags(struct io_context *ioc, unsigned int flags) | ||
386 | { | ||
387 | struct io_cq *icq; | ||
388 | struct hlist_node *n; | ||
389 | |||
390 | hlist_for_each_entry(icq, n, &ioc->icq_list, ioc_node) | ||
391 | icq->flags |= flags; | ||
392 | } | ||
393 | |||
394 | /** | ||
395 | * ioc_ioprio_changed - notify ioprio change | ||
396 | * @ioc: io_context of interest | ||
397 | * @ioprio: new ioprio | ||
398 | * | ||
399 | * @ioc's ioprio has changed to @ioprio. Set %ICQ_IOPRIO_CHANGED for all | ||
400 | * icq's. iosched is responsible for checking the bit and applying it on | ||
401 | * request issue path. | ||
402 | */ | ||
403 | void ioc_ioprio_changed(struct io_context *ioc, int ioprio) | ||
404 | { | ||
405 | unsigned long flags; | ||
406 | |||
407 | spin_lock_irqsave(&ioc->lock, flags); | ||
408 | ioc->ioprio = ioprio; | ||
409 | ioc_set_icq_flags(ioc, ICQ_IOPRIO_CHANGED); | ||
410 | spin_unlock_irqrestore(&ioc->lock, flags); | ||
411 | } | ||
412 | |||
413 | /** | ||
414 | * ioc_cgroup_changed - notify cgroup change | ||
415 | * @ioc: io_context of interest | ||
416 | * | ||
417 | * @ioc's cgroup has changed. Set %ICQ_CGROUP_CHANGED for all icq's. | ||
418 | * iosched is responsible for checking the bit and applying it on request | ||
419 | * issue path. | ||
420 | */ | ||
421 | void ioc_cgroup_changed(struct io_context *ioc) | ||
422 | { | ||
423 | unsigned long flags; | ||
424 | |||
425 | spin_lock_irqsave(&ioc->lock, flags); | ||
426 | ioc_set_icq_flags(ioc, ICQ_CGROUP_CHANGED); | ||
427 | spin_unlock_irqrestore(&ioc->lock, flags); | ||
428 | } | ||
429 | EXPORT_SYMBOL(ioc_cgroup_changed); | ||
430 | |||
431 | /** | ||
432 | * icq_get_changed - fetch and clear icq changed mask | ||
433 | * @icq: icq of interest | ||
434 | * | ||
435 | * Fetch and clear ICQ_*_CHANGED bits from @icq. Grabs and releases | ||
436 | * @icq->ioc->lock. | ||
437 | */ | ||
438 | unsigned icq_get_changed(struct io_cq *icq) | ||
439 | { | ||
440 | unsigned int changed = 0; | ||
441 | unsigned long flags; | ||
442 | |||
443 | if (unlikely(icq->flags & ICQ_CHANGED_MASK)) { | ||
444 | spin_lock_irqsave(&icq->ioc->lock, flags); | ||
445 | changed = icq->flags & ICQ_CHANGED_MASK; | ||
446 | icq->flags &= ~ICQ_CHANGED_MASK; | ||
447 | spin_unlock_irqrestore(&icq->ioc->lock, flags); | ||
448 | } | ||
449 | return changed; | ||
450 | } | ||
451 | EXPORT_SYMBOL(icq_get_changed); | ||
452 | |||
453 | static int __init blk_ioc_init(void) | 397 | static int __init blk_ioc_init(void) |
454 | { | 398 | { |
455 | iocontext_cachep = kmem_cache_create("blkdev_ioc", | 399 | iocontext_cachep = kmem_cache_create("blkdev_ioc", |
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index cf150011d808..aa41b47c22d2 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include <linux/blktrace_api.h> | 9 | #include <linux/blktrace_api.h> |
10 | 10 | ||
11 | #include "blk.h" | 11 | #include "blk.h" |
12 | #include "blk-cgroup.h" | ||
12 | 13 | ||
13 | struct queue_sysfs_entry { | 14 | struct queue_sysfs_entry { |
14 | struct attribute attr; | 15 | struct attribute attr; |
@@ -479,6 +480,8 @@ static void blk_release_queue(struct kobject *kobj) | |||
479 | 480 | ||
480 | blk_sync_queue(q); | 481 | blk_sync_queue(q); |
481 | 482 | ||
483 | blkcg_exit_queue(q); | ||
484 | |||
482 | if (q->elevator) { | 485 | if (q->elevator) { |
483 | spin_lock_irq(q->queue_lock); | 486 | spin_lock_irq(q->queue_lock); |
484 | ioc_clear_queue(q); | 487 | ioc_clear_queue(q); |
@@ -486,15 +489,12 @@ static void blk_release_queue(struct kobject *kobj) | |||
486 | elevator_exit(q->elevator); | 489 | elevator_exit(q->elevator); |
487 | } | 490 | } |
488 | 491 | ||
489 | blk_throtl_exit(q); | ||
490 | |||
491 | if (rl->rq_pool) | 492 | if (rl->rq_pool) |
492 | mempool_destroy(rl->rq_pool); | 493 | mempool_destroy(rl->rq_pool); |
493 | 494 | ||
494 | if (q->queue_tags) | 495 | if (q->queue_tags) |
495 | __blk_queue_free_tags(q); | 496 | __blk_queue_free_tags(q); |
496 | 497 | ||
497 | blk_throtl_release(q); | ||
498 | blk_trace_shutdown(q); | 498 | blk_trace_shutdown(q); |
499 | 499 | ||
500 | bdi_destroy(&q->backing_dev_info); | 500 | bdi_destroy(&q->backing_dev_info); |
diff --git a/block/blk-throttle.c b/block/blk-throttle.c index f2ddb94626bd..5b0659512047 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c | |||
@@ -21,6 +21,8 @@ static int throtl_quantum = 32; | |||
21 | /* Throttling is performed over 100ms slice and after that slice is renewed */ | 21 | /* Throttling is performed over 100ms slice and after that slice is renewed */ |
22 | static unsigned long throtl_slice = HZ/10; /* 100 ms */ | 22 | static unsigned long throtl_slice = HZ/10; /* 100 ms */ |
23 | 23 | ||
24 | static struct blkcg_policy blkcg_policy_throtl; | ||
25 | |||
24 | /* A workqueue to queue throttle related work */ | 26 | /* A workqueue to queue throttle related work */ |
25 | static struct workqueue_struct *kthrotld_workqueue; | 27 | static struct workqueue_struct *kthrotld_workqueue; |
26 | static void throtl_schedule_delayed_work(struct throtl_data *td, | 28 | static void throtl_schedule_delayed_work(struct throtl_data *td, |
@@ -38,9 +40,17 @@ struct throtl_rb_root { | |||
38 | 40 | ||
39 | #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) | 41 | #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) |
40 | 42 | ||
43 | /* Per-cpu group stats */ | ||
44 | struct tg_stats_cpu { | ||
45 | /* total bytes transferred */ | ||
46 | struct blkg_rwstat service_bytes; | ||
47 | /* total IOs serviced, post merge */ | ||
48 | struct blkg_rwstat serviced; | ||
49 | }; | ||
50 | |||
41 | struct throtl_grp { | 51 | struct throtl_grp { |
42 | /* List of throtl groups on the request queue*/ | 52 | /* must be the first member */ |
43 | struct hlist_node tg_node; | 53 | struct blkg_policy_data pd; |
44 | 54 | ||
45 | /* active throtl group service_tree member */ | 55 | /* active throtl group service_tree member */ |
46 | struct rb_node rb_node; | 56 | struct rb_node rb_node; |
@@ -52,8 +62,6 @@ struct throtl_grp { | |||
52 | */ | 62 | */ |
53 | unsigned long disptime; | 63 | unsigned long disptime; |
54 | 64 | ||
55 | struct blkio_group blkg; | ||
56 | atomic_t ref; | ||
57 | unsigned int flags; | 65 | unsigned int flags; |
58 | 66 | ||
59 | /* Two lists for READ and WRITE */ | 67 | /* Two lists for READ and WRITE */ |
@@ -80,18 +88,18 @@ struct throtl_grp { | |||
80 | /* Some throttle limits got updated for the group */ | 88 | /* Some throttle limits got updated for the group */ |
81 | int limits_changed; | 89 | int limits_changed; |
82 | 90 | ||
83 | struct rcu_head rcu_head; | 91 | /* Per cpu stats pointer */ |
92 | struct tg_stats_cpu __percpu *stats_cpu; | ||
93 | |||
94 | /* List of tgs waiting for per cpu stats memory to be allocated */ | ||
95 | struct list_head stats_alloc_node; | ||
84 | }; | 96 | }; |
85 | 97 | ||
86 | struct throtl_data | 98 | struct throtl_data |
87 | { | 99 | { |
88 | /* List of throtl groups */ | ||
89 | struct hlist_head tg_list; | ||
90 | |||
91 | /* service tree for active throtl groups */ | 100 | /* service tree for active throtl groups */ |
92 | struct throtl_rb_root tg_service_tree; | 101 | struct throtl_rb_root tg_service_tree; |
93 | 102 | ||
94 | struct throtl_grp *root_tg; | ||
95 | struct request_queue *queue; | 103 | struct request_queue *queue; |
96 | 104 | ||
97 | /* Total Number of queued bios on READ and WRITE lists */ | 105 | /* Total Number of queued bios on READ and WRITE lists */ |
@@ -108,6 +116,33 @@ struct throtl_data | |||
108 | int limits_changed; | 116 | int limits_changed; |
109 | }; | 117 | }; |
110 | 118 | ||
119 | /* list and work item to allocate percpu group stats */ | ||
120 | static DEFINE_SPINLOCK(tg_stats_alloc_lock); | ||
121 | static LIST_HEAD(tg_stats_alloc_list); | ||
122 | |||
123 | static void tg_stats_alloc_fn(struct work_struct *); | ||
124 | static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn); | ||
125 | |||
126 | static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) | ||
127 | { | ||
128 | return pd ? container_of(pd, struct throtl_grp, pd) : NULL; | ||
129 | } | ||
130 | |||
131 | static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg) | ||
132 | { | ||
133 | return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl)); | ||
134 | } | ||
135 | |||
136 | static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg) | ||
137 | { | ||
138 | return pd_to_blkg(&tg->pd); | ||
139 | } | ||
140 | |||
141 | static inline struct throtl_grp *td_root_tg(struct throtl_data *td) | ||
142 | { | ||
143 | return blkg_to_tg(td->queue->root_blkg); | ||
144 | } | ||
145 | |||
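The pd_to_tg()/blkg_to_tg() helpers above rely on struct blkg_policy_data being embedded as the first member of struct throtl_grp (see the "must be the first member" note earlier in this file). The container_of pattern in isolation, with simplified stand-in types rather than the kernel's:

    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    struct pd_model { int dummy; };          /* plays blkg_policy_data */

    struct tg_model {
            struct pd_model pd;              /* kept as the first member */
            long bps[2];
    };

    /* analogue of pd_to_tg(): recover the policy struct from its pd */
    static struct tg_model *pd_to_tg_model(struct pd_model *pd)
    {
            return pd ? container_of(pd, struct tg_model, pd) : NULL;
    }

    int main(void)
    {
            struct tg_model tg = { .bps = { 100, 200 } };
            struct pd_model *pd = &tg.pd;

            printf("round trip ok: %d\n", pd_to_tg_model(pd) == &tg);
            return 0;
    }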
111 | enum tg_state_flags { | 146 | enum tg_state_flags { |
112 | THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */ | 147 | THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */ |
113 | }; | 148 | }; |
@@ -128,244 +163,150 @@ static inline int throtl_tg_##name(const struct throtl_grp *tg) \ | |||
128 | 163 | ||
129 | THROTL_TG_FNS(on_rr); | 164 | THROTL_TG_FNS(on_rr); |
130 | 165 | ||
131 | #define throtl_log_tg(td, tg, fmt, args...) \ | 166 | #define throtl_log_tg(td, tg, fmt, args...) do { \ |
132 | blk_add_trace_msg((td)->queue, "throtl %s " fmt, \ | 167 | char __pbuf[128]; \ |
133 | blkg_path(&(tg)->blkg), ##args); \ | 168 | \ |
169 | blkg_path(tg_to_blkg(tg), __pbuf, sizeof(__pbuf)); \ | ||
170 | blk_add_trace_msg((td)->queue, "throtl %s " fmt, __pbuf, ##args); \ | ||
171 | } while (0) | ||
134 | 172 | ||
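The reworked throtl_log_tg() reflects the new blkg_path() convention: the path is formatted into a caller-supplied buffer instead of being read from a cached field. A tiny stand-alone illustration of that calling convention; path_fill() is a made-up stand-in, not the kernel helper:

    #include <stdio.h>

    /* fill the caller's buffer, return 0 on success or -1 if truncated */
    static int path_fill(char *buf, size_t buflen)
    {
            int n = snprintf(buf, buflen, "%s", "/blkio/example_group");

            return (n < 0 || (size_t)n >= buflen) ? -1 : 0;
    }

    int main(void)
    {
            char pbuf[128];   /* mirrors the on-stack __pbuf[128] above */

            if (!path_fill(pbuf, sizeof(pbuf)))
                    printf("throtl %s limits changed\n", pbuf);
            return 0;
    }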
135 | #define throtl_log(td, fmt, args...) \ | 173 | #define throtl_log(td, fmt, args...) \ |
136 | blk_add_trace_msg((td)->queue, "throtl " fmt, ##args) | 174 | blk_add_trace_msg((td)->queue, "throtl " fmt, ##args) |
137 | 175 | ||
138 | static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg) | ||
139 | { | ||
140 | if (blkg) | ||
141 | return container_of(blkg, struct throtl_grp, blkg); | ||
142 | |||
143 | return NULL; | ||
144 | } | ||
145 | |||
146 | static inline unsigned int total_nr_queued(struct throtl_data *td) | 176 | static inline unsigned int total_nr_queued(struct throtl_data *td) |
147 | { | 177 | { |
148 | return td->nr_queued[0] + td->nr_queued[1]; | 178 | return td->nr_queued[0] + td->nr_queued[1]; |
149 | } | 179 | } |
150 | 180 | ||
151 | static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg) | 181 | /* |
152 | { | 182 | * Worker for allocating per cpu stat for tgs. This is scheduled on the |
153 | atomic_inc(&tg->ref); | 183 | * system_nrt_wq once there are some groups on the alloc_list waiting for |
154 | return tg; | 184 | * allocation. |
155 | } | 185 | */ |
156 | 186 | static void tg_stats_alloc_fn(struct work_struct *work) | |
157 | static void throtl_free_tg(struct rcu_head *head) | ||
158 | { | 187 | { |
159 | struct throtl_grp *tg; | 188 | static struct tg_stats_cpu *stats_cpu; /* this fn is non-reentrant */ |
189 | struct delayed_work *dwork = to_delayed_work(work); | ||
190 | bool empty = false; | ||
191 | |||
192 | alloc_stats: | ||
193 | if (!stats_cpu) { | ||
194 | stats_cpu = alloc_percpu(struct tg_stats_cpu); | ||
195 | if (!stats_cpu) { | ||
196 | /* allocation failed, try again after some time */ | ||
197 | queue_delayed_work(system_nrt_wq, dwork, | ||
198 | msecs_to_jiffies(10)); | ||
199 | return; | ||
200 | } | ||
201 | } | ||
160 | 202 | ||
161 | tg = container_of(head, struct throtl_grp, rcu_head); | 203 | spin_lock_irq(&tg_stats_alloc_lock); |
162 | free_percpu(tg->blkg.stats_cpu); | ||
163 | kfree(tg); | ||
164 | } | ||
165 | 204 | ||
166 | static void throtl_put_tg(struct throtl_grp *tg) | 205 | if (!list_empty(&tg_stats_alloc_list)) { |
167 | { | 206 | struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list, |
168 | BUG_ON(atomic_read(&tg->ref) <= 0); | 207 | struct throtl_grp, |
169 | if (!atomic_dec_and_test(&tg->ref)) | 208 | stats_alloc_node); |
170 | return; | 209 | swap(tg->stats_cpu, stats_cpu); |
210 | list_del_init(&tg->stats_alloc_node); | ||
211 | } | ||
171 | 212 | ||
172 | /* | 213 | empty = list_empty(&tg_stats_alloc_list); |
173 | * A group is freed in rcu manner. But having an rcu lock does not | 214 | spin_unlock_irq(&tg_stats_alloc_lock); |
174 | * mean that one can access all the fields of blkg and assume these | 215 | if (!empty) |
175 | * are valid. For example, don't try to follow throtl_data and | 216 | goto alloc_stats; |
176 | * request queue links. | ||
177 | * | ||
178 | * Having a reference to blkg under an rcu allows acess to only | ||
179 | * values local to groups like group stats and group rate limits | ||
180 | */ | ||
181 | call_rcu(&tg->rcu_head, throtl_free_tg); | ||
182 | } | 217 | } |
183 | 218 | ||
184 | static void throtl_init_group(struct throtl_grp *tg) | 219 | static void throtl_pd_init(struct blkcg_gq *blkg) |
185 | { | 220 | { |
186 | INIT_HLIST_NODE(&tg->tg_node); | 221 | struct throtl_grp *tg = blkg_to_tg(blkg); |
222 | unsigned long flags; | ||
223 | |||
187 | RB_CLEAR_NODE(&tg->rb_node); | 224 | RB_CLEAR_NODE(&tg->rb_node); |
188 | bio_list_init(&tg->bio_lists[0]); | 225 | bio_list_init(&tg->bio_lists[0]); |
189 | bio_list_init(&tg->bio_lists[1]); | 226 | bio_list_init(&tg->bio_lists[1]); |
190 | tg->limits_changed = false; | 227 | tg->limits_changed = false; |
191 | 228 | ||
192 | /* Practically unlimited BW */ | 229 | tg->bps[READ] = -1; |
193 | tg->bps[0] = tg->bps[1] = -1; | 230 | tg->bps[WRITE] = -1; |
194 | tg->iops[0] = tg->iops[1] = -1; | 231 | tg->iops[READ] = -1; |
232 | tg->iops[WRITE] = -1; | ||
195 | 233 | ||
196 | /* | 234 | /* |
197 | * Take the initial reference that will be released on destroy | 235 | * Ugh... We need to perform per-cpu allocation for tg->stats_cpu |
198 | * This can be thought of a joint reference by cgroup and | 236 | * but percpu allocator can't be called from IO path. Queue tg on |
199 | * request queue which will be dropped by either request queue | 237 | * tg_stats_alloc_list and allocate from work item. |
200 | * exit or cgroup deletion path depending on who is exiting first. | ||
201 | */ | 238 | */ |
202 | atomic_set(&tg->ref, 1); | 239 | spin_lock_irqsave(&tg_stats_alloc_lock, flags); |
240 | list_add(&tg->stats_alloc_node, &tg_stats_alloc_list); | ||
241 | queue_delayed_work(system_nrt_wq, &tg_stats_alloc_work, 0); | ||
242 | spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); | ||
203 | } | 243 | } |
204 | 244 | ||
205 | /* Should be called with rcu read lock held (needed for blkcg) */ | 245 | static void throtl_pd_exit(struct blkcg_gq *blkg) |
206 | static void | ||
207 | throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg) | ||
208 | { | 246 | { |
209 | hlist_add_head(&tg->tg_node, &td->tg_list); | 247 | struct throtl_grp *tg = blkg_to_tg(blkg); |
210 | td->nr_undestroyed_grps++; | 248 | unsigned long flags; |
211 | } | ||
212 | |||
213 | static void | ||
214 | __throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg) | ||
215 | { | ||
216 | struct backing_dev_info *bdi = &td->queue->backing_dev_info; | ||
217 | unsigned int major, minor; | ||
218 | |||
219 | if (!tg || tg->blkg.dev) | ||
220 | return; | ||
221 | |||
222 | /* | ||
223 | * Fill in device details for a group which might not have been | ||
224 | * filled at group creation time as queue was being instantiated | ||
225 | * and driver had not attached a device yet | ||
226 | */ | ||
227 | if (bdi->dev && dev_name(bdi->dev)) { | ||
228 | sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); | ||
229 | tg->blkg.dev = MKDEV(major, minor); | ||
230 | } | ||
231 | } | ||
232 | |||
233 | /* | ||
234 | * Should be called with without queue lock held. Here queue lock will be | ||
235 | * taken rarely. It will be taken only once during life time of a group | ||
236 | * if need be | ||
237 | */ | ||
238 | static void | ||
239 | throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg) | ||
240 | { | ||
241 | if (!tg || tg->blkg.dev) | ||
242 | return; | ||
243 | |||
244 | spin_lock_irq(td->queue->queue_lock); | ||
245 | __throtl_tg_fill_dev_details(td, tg); | ||
246 | spin_unlock_irq(td->queue->queue_lock); | ||
247 | } | ||
248 | |||
249 | static void throtl_init_add_tg_lists(struct throtl_data *td, | ||
250 | struct throtl_grp *tg, struct blkio_cgroup *blkcg) | ||
251 | { | ||
252 | __throtl_tg_fill_dev_details(td, tg); | ||
253 | |||
254 | /* Add group onto cgroup list */ | ||
255 | blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td, | ||
256 | tg->blkg.dev, BLKIO_POLICY_THROTL); | ||
257 | 249 | ||
258 | tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev); | 250 | spin_lock_irqsave(&tg_stats_alloc_lock, flags); |
259 | tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev); | 251 | list_del_init(&tg->stats_alloc_node); |
260 | tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev); | 252 | spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); |
261 | tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev); | ||
262 | 253 | ||
263 | throtl_add_group_to_td_list(td, tg); | 254 | free_percpu(tg->stats_cpu); |
264 | } | 255 | } |
265 | 256 | ||
266 | /* Should be called without queue lock and outside of rcu period */ | 257 | static void throtl_pd_reset_stats(struct blkcg_gq *blkg) |
267 | static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td) | ||
268 | { | 258 | { |
269 | struct throtl_grp *tg = NULL; | 259 | struct throtl_grp *tg = blkg_to_tg(blkg); |
270 | int ret; | 260 | int cpu; |
271 | 261 | ||
272 | tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node); | 262 | if (tg->stats_cpu == NULL) |
273 | if (!tg) | 263 | return; |
274 | return NULL; | ||
275 | 264 | ||
276 | ret = blkio_alloc_blkg_stats(&tg->blkg); | 265 | for_each_possible_cpu(cpu) { |
266 | struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); | ||
277 | 267 | ||
278 | if (ret) { | 268 | blkg_rwstat_reset(&sc->service_bytes); |
279 | kfree(tg); | 269 | blkg_rwstat_reset(&sc->serviced); |
280 | return NULL; | ||
281 | } | 270 | } |
282 | |||
283 | throtl_init_group(tg); | ||
284 | return tg; | ||
285 | } | 271 | } |
286 | 272 | ||
287 | static struct | 273 | static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td, |
288 | throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg) | 274 | struct blkcg *blkcg) |
289 | { | 275 | { |
290 | struct throtl_grp *tg = NULL; | ||
291 | void *key = td; | ||
292 | |||
293 | /* | 276 | /* |
294 | * This is the common case when there are no blkio cgroups. | 277 | * This is the common case when there are no blkcgs. Avoid lookup |
295 | * Avoid lookup in this case | 278 | * in this case |
296 | */ | 279 | */ |
297 | if (blkcg == &blkio_root_cgroup) | 280 | if (blkcg == &blkcg_root) |
298 | tg = td->root_tg; | 281 | return td_root_tg(td); |
299 | else | ||
300 | tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key)); | ||
301 | 282 | ||
302 | __throtl_tg_fill_dev_details(td, tg); | 283 | return blkg_to_tg(blkg_lookup(blkcg, td->queue)); |
303 | return tg; | ||
304 | } | 284 | } |
305 | 285 | ||
306 | static struct throtl_grp * throtl_get_tg(struct throtl_data *td) | 286 | static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td, |
287 | struct blkcg *blkcg) | ||
307 | { | 288 | { |
308 | struct throtl_grp *tg = NULL, *__tg = NULL; | ||
309 | struct blkio_cgroup *blkcg; | ||
310 | struct request_queue *q = td->queue; | 289 | struct request_queue *q = td->queue; |
311 | 290 | struct throtl_grp *tg = NULL; | |
312 | /* no throttling for dead queue */ | ||
313 | if (unlikely(blk_queue_dead(q))) | ||
314 | return NULL; | ||
315 | |||
316 | rcu_read_lock(); | ||
317 | blkcg = task_blkio_cgroup(current); | ||
318 | tg = throtl_find_tg(td, blkcg); | ||
319 | if (tg) { | ||
320 | rcu_read_unlock(); | ||
321 | return tg; | ||
322 | } | ||
323 | |||
324 | /* | ||
325 | * Need to allocate a group. Allocation of group also needs allocation | ||
326 | * of per cpu stats which in-turn takes a mutex() and can block. Hence | ||
327 | * we need to drop rcu lock and queue_lock before we call alloc. | ||
328 | */ | ||
329 | rcu_read_unlock(); | ||
330 | spin_unlock_irq(q->queue_lock); | ||
331 | |||
332 | tg = throtl_alloc_tg(td); | ||
333 | |||
334 | /* Group allocated and queue is still alive. take the lock */ | ||
335 | spin_lock_irq(q->queue_lock); | ||
336 | |||
337 | /* Make sure @q is still alive */ | ||
338 | if (unlikely(blk_queue_dead(q))) { | ||
339 | kfree(tg); | ||
340 | return NULL; | ||
341 | } | ||
342 | |||
343 | /* | ||
344 | * Initialize the new group. After sleeping, read the blkcg again. | ||
345 | */ | ||
346 | rcu_read_lock(); | ||
347 | blkcg = task_blkio_cgroup(current); | ||
348 | 291 | ||
349 | /* | 292 | /* |
350 | * If some other thread already allocated the group while we were | 293 | * This is the common case when there are no blkcgs. Avoid lookup |
351 | * not holding queue lock, free up the group | 294 | * in this case |
352 | */ | 295 | */ |
353 | __tg = throtl_find_tg(td, blkcg); | 296 | if (blkcg == &blkcg_root) { |
354 | 297 | tg = td_root_tg(td); | |
355 | if (__tg) { | 298 | } else { |
356 | kfree(tg); | 299 | struct blkcg_gq *blkg; |
357 | rcu_read_unlock(); | 300 | |
358 | return __tg; | 301 | blkg = blkg_lookup_create(blkcg, q); |
359 | } | 302 | |
360 | 303 | /* if %NULL and @q is alive, fall back to root_tg */ | |
361 | /* Group allocation failed. Account the IO to root group */ | 304 | if (!IS_ERR(blkg)) |
362 | if (!tg) { | 305 | tg = blkg_to_tg(blkg); |
363 | tg = td->root_tg; | 306 | else if (!blk_queue_dead(q)) |
364 | return tg; | 307 | tg = td_root_tg(td); |
365 | } | 308 | } |
366 | 309 | ||
367 | throtl_init_add_tg_lists(td, tg, blkcg); | ||
368 | rcu_read_unlock(); | ||
369 | return tg; | 310 | return tg; |
370 | } | 311 | } |
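throtl_lookup_create_tg() prefers the per-cgroup group but falls back to the root group rather than dropping the bio when blkg creation fails on a still-live queue. The decision in miniature, using NULL for failure where the kernel uses an ERR_PTR, with purely illustrative names:

    #include <stdio.h>

    struct group_model { const char *name; };

    static struct group_model root_group = { "root" };
    static struct group_model cgrp_group = { "per-cgroup" };

    /* stand-in for blkg_lookup_create(): may fail under memory pressure */
    static struct group_model *lookup_create(int fail)
    {
            return fail ? NULL : &cgrp_group;
    }

    /* analogue of the fallback logic in throtl_lookup_create_tg() */
    static struct group_model *pick_group(int create_fails, int queue_dead)
    {
            struct group_model *g = lookup_create(create_fails);

            if (g)
                    return g;
            return queue_dead ? NULL : &root_group;
    }

    int main(void)
    {
            printf("healthy path: %s\n", pick_group(0, 0)->name);
            printf("creation failed, queue alive: %s\n", pick_group(1, 0)->name);
            return 0;
    }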
371 | 312 | ||
@@ -734,16 +675,41 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, | |||
734 | return 0; | 675 | return 0; |
735 | } | 676 | } |
736 | 677 | ||
678 | static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes, | ||
679 | int rw) | ||
680 | { | ||
681 | struct throtl_grp *tg = blkg_to_tg(blkg); | ||
682 | struct tg_stats_cpu *stats_cpu; | ||
683 | unsigned long flags; | ||
684 | |||
685 | /* If per cpu stats are not allocated yet, don't do any accounting. */ | ||
686 | if (tg->stats_cpu == NULL) | ||
687 | return; | ||
688 | |||
689 | /* | ||
690 | * Disabling interrupts to provide mutual exclusion between two | ||
691 | * writes on same cpu. It probably is not needed for 64bit. Not | ||
692 | * optimizing that case yet. | ||
693 | */ | ||
694 | local_irq_save(flags); | ||
695 | |||
696 | stats_cpu = this_cpu_ptr(tg->stats_cpu); | ||
697 | |||
698 | blkg_rwstat_add(&stats_cpu->serviced, rw, 1); | ||
699 | blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes); | ||
700 | |||
701 | local_irq_restore(flags); | ||
702 | } | ||
703 | |||
737 | static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) | 704 | static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) |
738 | { | 705 | { |
739 | bool rw = bio_data_dir(bio); | 706 | bool rw = bio_data_dir(bio); |
740 | bool sync = rw_is_sync(bio->bi_rw); | ||
741 | 707 | ||
742 | /* Charge the bio to the group */ | 708 | /* Charge the bio to the group */ |
743 | tg->bytes_disp[rw] += bio->bi_size; | 709 | tg->bytes_disp[rw] += bio->bi_size; |
744 | tg->io_disp[rw]++; | 710 | tg->io_disp[rw]++; |
745 | 711 | ||
746 | blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync); | 712 | throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw); |
747 | } | 713 | } |
748 | 714 | ||
749 | static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, | 715 | static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, |
@@ -753,7 +719,7 @@ static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, | |||
753 | 719 | ||
754 | bio_list_add(&tg->bio_lists[rw], bio); | 720 | bio_list_add(&tg->bio_lists[rw], bio); |
755 | /* Take a bio reference on tg */ | 721 | /* Take a bio reference on tg */ |
756 | throtl_ref_get_tg(tg); | 722 | blkg_get(tg_to_blkg(tg)); |
757 | tg->nr_queued[rw]++; | 723 | tg->nr_queued[rw]++; |
758 | td->nr_queued[rw]++; | 724 | td->nr_queued[rw]++; |
759 | throtl_enqueue_tg(td, tg); | 725 | throtl_enqueue_tg(td, tg); |
@@ -786,8 +752,8 @@ static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg, | |||
786 | 752 | ||
787 | bio = bio_list_pop(&tg->bio_lists[rw]); | 753 | bio = bio_list_pop(&tg->bio_lists[rw]); |
788 | tg->nr_queued[rw]--; | 754 | tg->nr_queued[rw]--; |
789 | /* Drop bio reference on tg */ | 755 | /* Drop bio reference on blkg */ |
790 | throtl_put_tg(tg); | 756 | blkg_put(tg_to_blkg(tg)); |
791 | 757 | ||
792 | BUG_ON(td->nr_queued[rw] <= 0); | 758 | BUG_ON(td->nr_queued[rw] <= 0); |
793 | td->nr_queued[rw]--; | 759 | td->nr_queued[rw]--; |
@@ -865,8 +831,8 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl) | |||
865 | 831 | ||
866 | static void throtl_process_limit_change(struct throtl_data *td) | 832 | static void throtl_process_limit_change(struct throtl_data *td) |
867 | { | 833 | { |
868 | struct throtl_grp *tg; | 834 | struct request_queue *q = td->queue; |
869 | struct hlist_node *pos, *n; | 835 | struct blkcg_gq *blkg, *n; |
870 | 836 | ||
871 | if (!td->limits_changed) | 837 | if (!td->limits_changed) |
872 | return; | 838 | return; |
@@ -875,7 +841,9 @@ static void throtl_process_limit_change(struct throtl_data *td) | |||
875 | 841 | ||
876 | throtl_log(td, "limits changed"); | 842 | throtl_log(td, "limits changed"); |
877 | 843 | ||
878 | hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { | 844 | list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { |
845 | struct throtl_grp *tg = blkg_to_tg(blkg); | ||
846 | |||
879 | if (!tg->limits_changed) | 847 | if (!tg->limits_changed) |
880 | continue; | 848 | continue; |
881 | 849 | ||
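The limit-change scan now walks the queue's blkg_list and recovers the throttle-specific data with blkg_to_tg(). That works because the generic blkg_policy_data is embedded inside the policy structure, so a container_of() gets from one back to the other; a self-contained illustration of the embedding (layout and names are illustrative):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct policy_data { int plid; };           /* generic per-group bookkeeping */

struct throttle_group {
    struct policy_data pd;                  /* embedded, first member */
    unsigned long bps[2];
};

static struct throttle_group *pd_to_tg(struct policy_data *pd)
{
    return container_of(pd, struct throttle_group, pd);
}

int main(void)
{
    struct throttle_group tg = { .pd = { .plid = 1 }, .bps = { 100, 200 } };
    struct policy_data *pd = &tg.pd;        /* what the core hands back */

    printf("bps[READ]=%lu (plid %d)\n", pd_to_tg(pd)->bps[0], pd->plid);
    return 0;
}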
@@ -973,120 +941,159 @@ throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay) | |||
973 | } | 941 | } |
974 | } | 942 | } |
975 | 943 | ||
976 | static void | 944 | static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, |
977 | throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg) | 945 | struct blkg_policy_data *pd, int off) |
978 | { | 946 | { |
979 | /* Something wrong if we are trying to remove same group twice */ | 947 | struct throtl_grp *tg = pd_to_tg(pd); |
980 | BUG_ON(hlist_unhashed(&tg->tg_node)); | 948 | struct blkg_rwstat rwstat = { }, tmp; |
949 | int i, cpu; | ||
981 | 950 | ||
982 | hlist_del_init(&tg->tg_node); | 951 | for_each_possible_cpu(cpu) { |
952 | struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); | ||
983 | 953 | ||
984 | /* | 954 | tmp = blkg_rwstat_read((void *)sc + off); |
985 | * Put the reference taken at the time of creation so that when all | 955 | for (i = 0; i < BLKG_RWSTAT_NR; i++) |
986 | * queues are gone, group can be destroyed. | 956 | rwstat.cnt[i] += tmp.cnt[i]; |
987 | */ | 957 | } |
988 | throtl_put_tg(tg); | 958 | |
989 | td->nr_undestroyed_grps--; | 959 | return __blkg_prfill_rwstat(sf, pd, &rwstat); |
990 | } | 960 | } |
991 | 961 | ||
992 | static void throtl_release_tgs(struct throtl_data *td) | 962 | static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft, |
963 | struct seq_file *sf) | ||
993 | { | 964 | { |
994 | struct hlist_node *pos, *n; | 965 | struct blkcg *blkcg = cgroup_to_blkcg(cgrp); |
995 | struct throtl_grp *tg; | ||
996 | 966 | ||
997 | hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { | 967 | blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl, |
998 | /* | 968 | cft->private, true); |
999 | * If cgroup removal path got to blk_group first and removed | 969 | return 0; |
1000 | * it from cgroup list, then it will take care of destroying | ||
1001 | * cfqg also. | ||
1002 | */ | ||
1003 | if (!blkiocg_del_blkio_group(&tg->blkg)) | ||
1004 | throtl_destroy_tg(td, tg); | ||
1005 | } | ||
1006 | } | 970 | } |
1007 | 971 | ||
1008 | /* | 972 | static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, |
1009 | * Blk cgroup controller notification saying that blkio_group object is being | 973 | int off) |
1010 | * delinked as associated cgroup object is going away. That also means that | ||
1011 | * no new IO will come in this group. So get rid of this group as soon as | ||
1012 | * any pending IO in the group is finished. | ||
1013 | * | ||
1014 | * This function is called under rcu_read_lock(). key is the rcu protected | ||
1015 | * pointer. That means "key" is a valid throtl_data pointer as long as we are | ||
1016 | * rcu read lock. | ||
1017 | * | ||
1018 | * "key" was fetched from blkio_group under blkio_cgroup->lock. That means | ||
1019 | * it should not be NULL as even if queue was going away, cgroup deletion | ||
1020 | * path got to it first. | ||
1021 | */ | ||
1022 | void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg) | ||
1023 | { | 974 | { |
1024 | unsigned long flags; | 975 | struct throtl_grp *tg = pd_to_tg(pd); |
1025 | struct throtl_data *td = key; | 976 | u64 v = *(u64 *)((void *)tg + off); |
1026 | 977 | ||
1027 | spin_lock_irqsave(td->queue->queue_lock, flags); | 978 | if (v == -1) |
1028 | throtl_destroy_tg(td, tg_of_blkg(blkg)); | 979 | return 0; |
1029 | spin_unlock_irqrestore(td->queue->queue_lock, flags); | 980 | return __blkg_prfill_u64(sf, pd, v); |
1030 | } | 981 | } |
1031 | 982 | ||
1032 | static void throtl_update_blkio_group_common(struct throtl_data *td, | 983 | static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd, |
1033 | struct throtl_grp *tg) | 984 | int off) |
1034 | { | 985 | { |
1035 | xchg(&tg->limits_changed, true); | 986 | struct throtl_grp *tg = pd_to_tg(pd); |
1036 | xchg(&td->limits_changed, true); | 987 | unsigned int v = *(unsigned int *)((void *)tg + off); |
1037 | /* Schedule a work now to process the limit change */ | 988 | |
1038 | throtl_schedule_delayed_work(td, 0); | 989 | if (v == -1) |
990 | return 0; | ||
991 | return __blkg_prfill_u64(sf, pd, v); | ||
1039 | } | 992 | } |
1040 | 993 | ||
1041 | /* | 994 | static int tg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft, |
1042 | * For all update functions, key should be a valid pointer because these | 995 | struct seq_file *sf) |
1043 | * update functions are called under blkcg_lock, that means, blkg is | ||
1044 | * valid and in turn key is valid. queue exit path can not race because | ||
1045 | * of blkcg_lock | ||
1046 | * | ||
1047 | * Can not take queue lock in update functions as queue lock under blkcg_lock | ||
1048 | * is not allowed. Under other paths we take blkcg_lock under queue_lock. | ||
1049 | */ | ||
1050 | static void throtl_update_blkio_group_read_bps(void *key, | ||
1051 | struct blkio_group *blkg, u64 read_bps) | ||
1052 | { | 996 | { |
1053 | struct throtl_data *td = key; | 997 | blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_u64, |
1054 | struct throtl_grp *tg = tg_of_blkg(blkg); | 998 | &blkcg_policy_throtl, cft->private, false); |
1055 | 999 | return 0; | |
1056 | tg->bps[READ] = read_bps; | ||
1057 | throtl_update_blkio_group_common(td, tg); | ||
1058 | } | 1000 | } |
1059 | 1001 | ||
1060 | static void throtl_update_blkio_group_write_bps(void *key, | 1002 | static int tg_print_conf_uint(struct cgroup *cgrp, struct cftype *cft, |
1061 | struct blkio_group *blkg, u64 write_bps) | 1003 | struct seq_file *sf) |
1062 | { | 1004 | { |
1063 | struct throtl_data *td = key; | 1005 | blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_uint, |
1064 | struct throtl_grp *tg = tg_of_blkg(blkg); | 1006 | &blkcg_policy_throtl, cft->private, false); |
1065 | 1007 | return 0; | |
1066 | tg->bps[WRITE] = write_bps; | ||
1067 | throtl_update_blkio_group_common(td, tg); | ||
1068 | } | 1008 | } |
1069 | 1009 | ||
1070 | static void throtl_update_blkio_group_read_iops(void *key, | 1010 | static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf, |
1071 | struct blkio_group *blkg, unsigned int read_iops) | 1011 | bool is_u64) |
1072 | { | 1012 | { |
1073 | struct throtl_data *td = key; | 1013 | struct blkcg *blkcg = cgroup_to_blkcg(cgrp); |
1074 | struct throtl_grp *tg = tg_of_blkg(blkg); | 1014 | struct blkg_conf_ctx ctx; |
1015 | struct throtl_grp *tg; | ||
1016 | struct throtl_data *td; | ||
1017 | int ret; | ||
1018 | |||
1019 | ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); | ||
1020 | if (ret) | ||
1021 | return ret; | ||
1022 | |||
1023 | tg = blkg_to_tg(ctx.blkg); | ||
1024 | td = ctx.blkg->q->td; | ||
1025 | |||
1026 | if (!ctx.v) | ||
1027 | ctx.v = -1; | ||
1028 | |||
1029 | if (is_u64) | ||
1030 | *(u64 *)((void *)tg + cft->private) = ctx.v; | ||
1031 | else | ||
1032 | *(unsigned int *)((void *)tg + cft->private) = ctx.v; | ||
1033 | |||
1034 | /* XXX: we don't need the following deferred processing */ | ||
1035 | xchg(&tg->limits_changed, true); | ||
1036 | xchg(&td->limits_changed, true); | ||
1037 | throtl_schedule_delayed_work(td, 0); | ||
1075 | 1038 | ||
1076 | tg->iops[READ] = read_iops; | 1039 | blkg_conf_finish(&ctx); |
1077 | throtl_update_blkio_group_common(td, tg); | 1040 | return 0; |
1078 | } | 1041 | } |
1079 | 1042 | ||
1080 | static void throtl_update_blkio_group_write_iops(void *key, | 1043 | static int tg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft, |
1081 | struct blkio_group *blkg, unsigned int write_iops) | 1044 | const char *buf) |
1082 | { | 1045 | { |
1083 | struct throtl_data *td = key; | 1046 | return tg_set_conf(cgrp, cft, buf, true); |
1084 | struct throtl_grp *tg = tg_of_blkg(blkg); | 1047 | } |
1085 | 1048 | ||
1086 | tg->iops[WRITE] = write_iops; | 1049 | static int tg_set_conf_uint(struct cgroup *cgrp, struct cftype *cft, |
1087 | throtl_update_blkio_group_common(td, tg); | 1050 | const char *buf) |
1051 | { | ||
1052 | return tg_set_conf(cgrp, cft, buf, false); | ||
1088 | } | 1053 | } |
1089 | 1054 | ||
1055 | static struct cftype throtl_files[] = { | ||
1056 | { | ||
1057 | .name = "throttle.read_bps_device", | ||
1058 | .private = offsetof(struct throtl_grp, bps[READ]), | ||
1059 | .read_seq_string = tg_print_conf_u64, | ||
1060 | .write_string = tg_set_conf_u64, | ||
1061 | .max_write_len = 256, | ||
1062 | }, | ||
1063 | { | ||
1064 | .name = "throttle.write_bps_device", | ||
1065 | .private = offsetof(struct throtl_grp, bps[WRITE]), | ||
1066 | .read_seq_string = tg_print_conf_u64, | ||
1067 | .write_string = tg_set_conf_u64, | ||
1068 | .max_write_len = 256, | ||
1069 | }, | ||
1070 | { | ||
1071 | .name = "throttle.read_iops_device", | ||
1072 | .private = offsetof(struct throtl_grp, iops[READ]), | ||
1073 | .read_seq_string = tg_print_conf_uint, | ||
1074 | .write_string = tg_set_conf_uint, | ||
1075 | .max_write_len = 256, | ||
1076 | }, | ||
1077 | { | ||
1078 | .name = "throttle.write_iops_device", | ||
1079 | .private = offsetof(struct throtl_grp, iops[WRITE]), | ||
1080 | .read_seq_string = tg_print_conf_uint, | ||
1081 | .write_string = tg_set_conf_uint, | ||
1082 | .max_write_len = 256, | ||
1083 | }, | ||
1084 | { | ||
1085 | .name = "throttle.io_service_bytes", | ||
1086 | .private = offsetof(struct tg_stats_cpu, service_bytes), | ||
1087 | .read_seq_string = tg_print_cpu_rwstat, | ||
1088 | }, | ||
1089 | { | ||
1090 | .name = "throttle.io_serviced", | ||
1091 | .private = offsetof(struct tg_stats_cpu, serviced), | ||
1092 | .read_seq_string = tg_print_cpu_rwstat, | ||
1093 | }, | ||
1094 | { } /* terminate */ | ||
1095 | }; | ||
1096 | |||
1090 | static void throtl_shutdown_wq(struct request_queue *q) | 1097 | static void throtl_shutdown_wq(struct request_queue *q) |
1091 | { | 1098 | { |
1092 | struct throtl_data *td = q->td; | 1099 | struct throtl_data *td = q->td; |
@@ -1094,19 +1101,13 @@ static void throtl_shutdown_wq(struct request_queue *q) | |||
1094 | cancel_delayed_work_sync(&td->throtl_work); | 1101 | cancel_delayed_work_sync(&td->throtl_work); |
1095 | } | 1102 | } |
1096 | 1103 | ||
1097 | static struct blkio_policy_type blkio_policy_throtl = { | 1104 | static struct blkcg_policy blkcg_policy_throtl = { |
1098 | .ops = { | 1105 | .pd_size = sizeof(struct throtl_grp), |
1099 | .blkio_unlink_group_fn = throtl_unlink_blkio_group, | 1106 | .cftypes = throtl_files, |
1100 | .blkio_update_group_read_bps_fn = | 1107 | |
1101 | throtl_update_blkio_group_read_bps, | 1108 | .pd_init_fn = throtl_pd_init, |
1102 | .blkio_update_group_write_bps_fn = | 1109 | .pd_exit_fn = throtl_pd_exit, |
1103 | throtl_update_blkio_group_write_bps, | 1110 | .pd_reset_stats_fn = throtl_pd_reset_stats, |
1104 | .blkio_update_group_read_iops_fn = | ||
1105 | throtl_update_blkio_group_read_iops, | ||
1106 | .blkio_update_group_write_iops_fn = | ||
1107 | throtl_update_blkio_group_write_iops, | ||
1108 | }, | ||
1109 | .plid = BLKIO_POLICY_THROTL, | ||
1110 | }; | 1111 | }; |
1111 | 1112 | ||
1112 | bool blk_throtl_bio(struct request_queue *q, struct bio *bio) | 1113 | bool blk_throtl_bio(struct request_queue *q, struct bio *bio) |
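The new throtl_files[] table above drives all four limit files and both stat files through a handful of generic helpers: each cftype stores an offsetof() into struct throtl_grp (or tg_stats_cpu) in .private, and tg_set_conf() writes through that offset. A compressed userspace version of the offset-driven table idiom (field names and widths are illustrative):

#include <stddef.h>
#include <stdio.h>

struct throttle_group {
    unsigned long long bps[2];
    unsigned int iops[2];
};

/* One table entry per control file: a name, a field offset and a width,
 * so a single setter can serve every file. */
struct conf_file {
    const char *name;
    size_t off;
    int is_u64;
};

static const struct conf_file files[] = {
    { "read_bps_device",   offsetof(struct throttle_group, bps[0]),  1 },
    { "write_bps_device",  offsetof(struct throttle_group, bps[1]),  1 },
    { "read_iops_device",  offsetof(struct throttle_group, iops[0]), 0 },
    { "write_iops_device", offsetof(struct throttle_group, iops[1]), 0 },
};

static void set_conf(struct throttle_group *tg, const struct conf_file *cf,
                     unsigned long long v)
{
    if (cf->is_u64)
        *(unsigned long long *)((char *)tg + cf->off) = v;
    else
        *(unsigned int *)((char *)tg + cf->off) = (unsigned int)v;
}

int main(void)
{
    struct throttle_group tg = { { 0, 0 }, { 0, 0 } };

    set_conf(&tg, &files[0], 1048576);  /* read_bps_device = 1 MiB/s */
    set_conf(&tg, &files[3], 100);      /* write_iops_device = 100 */
    printf("bps[R]=%llu iops[W]=%u\n", tg.bps[0], tg.iops[1]);
    return 0;
}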
@@ -1114,7 +1115,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) | |||
1114 | struct throtl_data *td = q->td; | 1115 | struct throtl_data *td = q->td; |
1115 | struct throtl_grp *tg; | 1116 | struct throtl_grp *tg; |
1116 | bool rw = bio_data_dir(bio), update_disptime = true; | 1117 | bool rw = bio_data_dir(bio), update_disptime = true; |
1117 | struct blkio_cgroup *blkcg; | 1118 | struct blkcg *blkcg; |
1118 | bool throttled = false; | 1119 | bool throttled = false; |
1119 | 1120 | ||
1120 | if (bio->bi_rw & REQ_THROTTLED) { | 1121 | if (bio->bi_rw & REQ_THROTTLED) { |
@@ -1122,33 +1123,31 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) | |||
1122 | goto out; | 1123 | goto out; |
1123 | } | 1124 | } |
1124 | 1125 | ||
1126 | /* bio_associate_current() needs ioc, try creating */ | ||
1127 | create_io_context(GFP_ATOMIC, q->node); | ||
1128 | |||
1125 | /* | 1129 | /* |
1126 | * A throtl_grp pointer retrieved under rcu can be used to access | 1130 | * A throtl_grp pointer retrieved under rcu can be used to access |
1127 | * basic fields like stats and io rates. If a group has no rules, | 1131 | * basic fields like stats and io rates. If a group has no rules, |
1128 | * just update the dispatch stats in lockless manner and return. | 1132 | * just update the dispatch stats in lockless manner and return. |
1129 | */ | 1133 | */ |
1130 | |||
1131 | rcu_read_lock(); | 1134 | rcu_read_lock(); |
1132 | blkcg = task_blkio_cgroup(current); | 1135 | blkcg = bio_blkcg(bio); |
1133 | tg = throtl_find_tg(td, blkcg); | 1136 | tg = throtl_lookup_tg(td, blkcg); |
1134 | if (tg) { | 1137 | if (tg) { |
1135 | throtl_tg_fill_dev_details(td, tg); | ||
1136 | |||
1137 | if (tg_no_rule_group(tg, rw)) { | 1138 | if (tg_no_rule_group(tg, rw)) { |
1138 | blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, | 1139 | throtl_update_dispatch_stats(tg_to_blkg(tg), |
1139 | rw, rw_is_sync(bio->bi_rw)); | 1140 | bio->bi_size, bio->bi_rw); |
1140 | rcu_read_unlock(); | 1141 | goto out_unlock_rcu; |
1141 | goto out; | ||
1142 | } | 1142 | } |
1143 | } | 1143 | } |
1144 | rcu_read_unlock(); | ||
1145 | 1144 | ||
1146 | /* | 1145 | /* |
1147 | * Either group has not been allocated yet or it is not an unlimited | 1146 | * Either group has not been allocated yet or it is not an unlimited |
1148 | * IO group | 1147 | * IO group |
1149 | */ | 1148 | */ |
1150 | spin_lock_irq(q->queue_lock); | 1149 | spin_lock_irq(q->queue_lock); |
1151 | tg = throtl_get_tg(td); | 1150 | tg = throtl_lookup_create_tg(td, blkcg); |
1152 | if (unlikely(!tg)) | 1151 | if (unlikely(!tg)) |
1153 | goto out_unlock; | 1152 | goto out_unlock; |
1154 | 1153 | ||
@@ -1189,6 +1188,7 @@ queue_bio: | |||
1189 | tg->io_disp[rw], tg->iops[rw], | 1188 | tg->io_disp[rw], tg->iops[rw], |
1190 | tg->nr_queued[READ], tg->nr_queued[WRITE]); | 1189 | tg->nr_queued[READ], tg->nr_queued[WRITE]); |
1191 | 1190 | ||
1191 | bio_associate_current(bio); | ||
1192 | throtl_add_bio_tg(q->td, tg, bio); | 1192 | throtl_add_bio_tg(q->td, tg, bio); |
1193 | throttled = true; | 1193 | throttled = true; |
1194 | 1194 | ||
@@ -1199,6 +1199,8 @@ queue_bio: | |||
1199 | 1199 | ||
1200 | out_unlock: | 1200 | out_unlock: |
1201 | spin_unlock_irq(q->queue_lock); | 1201 | spin_unlock_irq(q->queue_lock); |
1202 | out_unlock_rcu: | ||
1203 | rcu_read_unlock(); | ||
1202 | out: | 1204 | out: |
1203 | return throttled; | 1205 | return throttled; |
1204 | } | 1206 | } |
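blk_throtl_bio() now keeps the RCU read side held across the whole function: the no-rules fast path jumps straight to out_unlock_rcu, while the queued path nests the queue lock inside the RCU section and unwinds through out_unlock first. The label layering, modelled with two pthread mutexes standing in for the RCU read section and the queue lock (a shape-only sketch of the control flow, not of the locking semantics):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t rcu_ish = PTHREAD_MUTEX_INITIALIZER;    /* models rcu_read_lock() */
static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;

static bool throttle(bool no_rules)
{
    bool throttled = false;

    pthread_mutex_lock(&rcu_ish);
    if (no_rules)
        goto out_unlock_rcu;            /* lockless fast path: account and leave */

    pthread_mutex_lock(&queue_lock);    /* slow path under the queue lock */
    throttled = true;                   /* bio queued on its group */
    pthread_mutex_unlock(&queue_lock);  /* corresponds to "out_unlock" */
out_unlock_rcu:
    pthread_mutex_unlock(&rcu_ish);
    return throttled;
}

int main(void)
{
    printf("fast path throttled=%d\n", throttle(true));
    printf("slow path throttled=%d\n", throttle(false));
    return 0;
}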
@@ -1241,79 +1243,31 @@ void blk_throtl_drain(struct request_queue *q) | |||
1241 | int blk_throtl_init(struct request_queue *q) | 1243 | int blk_throtl_init(struct request_queue *q) |
1242 | { | 1244 | { |
1243 | struct throtl_data *td; | 1245 | struct throtl_data *td; |
1244 | struct throtl_grp *tg; | 1246 | int ret; |
1245 | 1247 | ||
1246 | td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); | 1248 | td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); |
1247 | if (!td) | 1249 | if (!td) |
1248 | return -ENOMEM; | 1250 | return -ENOMEM; |
1249 | 1251 | ||
1250 | INIT_HLIST_HEAD(&td->tg_list); | ||
1251 | td->tg_service_tree = THROTL_RB_ROOT; | 1252 | td->tg_service_tree = THROTL_RB_ROOT; |
1252 | td->limits_changed = false; | 1253 | td->limits_changed = false; |
1253 | INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); | 1254 | INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); |
1254 | 1255 | ||
1255 | /* alloc and Init root group. */ | 1256 | q->td = td; |
1256 | td->queue = q; | 1257 | td->queue = q; |
1257 | tg = throtl_alloc_tg(td); | ||
1258 | 1258 | ||
1259 | if (!tg) { | 1259 | /* activate policy */ |
1260 | ret = blkcg_activate_policy(q, &blkcg_policy_throtl); | ||
1261 | if (ret) | ||
1260 | kfree(td); | 1262 | kfree(td); |
1261 | return -ENOMEM; | 1263 | return ret; |
1262 | } | ||
1263 | |||
1264 | td->root_tg = tg; | ||
1265 | |||
1266 | rcu_read_lock(); | ||
1267 | throtl_init_add_tg_lists(td, tg, &blkio_root_cgroup); | ||
1268 | rcu_read_unlock(); | ||
1269 | |||
1270 | /* Attach throtl data to request queue */ | ||
1271 | q->td = td; | ||
1272 | return 0; | ||
1273 | } | 1264 | } |
1274 | 1265 | ||
1275 | void blk_throtl_exit(struct request_queue *q) | 1266 | void blk_throtl_exit(struct request_queue *q) |
1276 | { | 1267 | { |
1277 | struct throtl_data *td = q->td; | 1268 | BUG_ON(!q->td); |
1278 | bool wait = false; | ||
1279 | |||
1280 | BUG_ON(!td); | ||
1281 | |||
1282 | throtl_shutdown_wq(q); | ||
1283 | |||
1284 | spin_lock_irq(q->queue_lock); | ||
1285 | throtl_release_tgs(td); | ||
1286 | |||
1287 | /* If there are other groups */ | ||
1288 | if (td->nr_undestroyed_grps > 0) | ||
1289 | wait = true; | ||
1290 | |||
1291 | spin_unlock_irq(q->queue_lock); | ||
1292 | |||
1293 | /* | ||
1294 | * Wait for tg->blkg->key accessors to exit their grace periods. | ||
1295 | * Do this wait only if there are other undestroyed groups out | ||
1296 | * there (other than root group). This can happen if cgroup deletion | ||
1297 | * path claimed the responsibility of cleaning up a group before | ||
1298 | * queue cleanup code get to the group. | ||
1299 | * | ||
1300 | * Do not call synchronize_rcu() unconditionally as there are drivers | ||
1301 | * which create/delete request queue hundreds of times during scan/boot | ||
1302 | * and synchronize_rcu() can take significant time and slow down boot. | ||
1303 | */ | ||
1304 | if (wait) | ||
1305 | synchronize_rcu(); | ||
1306 | |||
1307 | /* | ||
1308 | * Just being safe to make sure after previous flush if somebody did | ||
1309 | * update limits through cgroup and another work got queued, cancel | ||
1310 | * it. | ||
1311 | */ | ||
1312 | throtl_shutdown_wq(q); | 1269 | throtl_shutdown_wq(q); |
1313 | } | 1270 | blkcg_deactivate_policy(q, &blkcg_policy_throtl); |
1314 | |||
1315 | void blk_throtl_release(struct request_queue *q) | ||
1316 | { | ||
1317 | kfree(q->td); | 1271 | kfree(q->td); |
1318 | } | 1272 | } |
1319 | 1273 | ||
@@ -1323,8 +1277,7 @@ static int __init throtl_init(void) | |||
1323 | if (!kthrotld_workqueue) | 1277 | if (!kthrotld_workqueue) |
1324 | panic("Failed to create kthrotld\n"); | 1278 | panic("Failed to create kthrotld\n"); |
1325 | 1279 | ||
1326 | blkio_policy_register(&blkio_policy_throtl); | 1280 | return blkcg_policy_register(&blkcg_policy_throtl); |
1327 | return 0; | ||
1328 | } | 1281 | } |
1329 | 1282 | ||
1330 | module_init(throtl_init); | 1283 | module_init(throtl_init); |
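Taken together, the blk-throttle.c changes shrink the policy to a data-driven descriptor: blkcg_policy_throtl supplies pd_size, cftypes and the pd_init/exit/reset hooks, blk_throtl_init() just activates it and frees td on failure, and throtl_init() returns the registration result instead of always succeeding. A shape-only sketch of that descriptor-plus-activation pattern (names and the 64-byte payload are illustrative, not the kernel structures):

#include <stdio.h>
#include <stdlib.h>

struct policy {
    size_t pd_size;                    /* per-group payload the core allocates */
    void (*pd_init)(void *pd);
    void (*pd_exit)(void *pd);
};

static void my_pd_init(void *pd) { printf("init payload %p\n", pd); }
static void my_pd_exit(void *pd) { printf("exit payload %p\n", pd); }

static const struct policy my_policy = {
    .pd_size = 64,
    .pd_init = my_pd_init,
    .pd_exit = my_pd_exit,
};

/* Pretend activation: the real core walks existing groups, allocates
 * pd_size bytes for each and calls pd_init on them. */
static int activate_policy(const struct policy *pol)
{
    void *pd = malloc(pol->pd_size);

    if (!pd)
        return -1;
    pol->pd_init(pd);
    pol->pd_exit(pd);
    free(pd);
    return 0;
}

int main(void)
{
    void *td = malloc(32);             /* queue-wide data, as in blk_throtl_init() */

    if (!td)
        return 1;
    if (activate_policy(&my_policy)) {
        free(td);                      /* activation failed: undo the allocation */
        return 1;
    }
    free(td);                          /* normal teardown for the demo */
    return 0;
}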
diff --git a/block/blk.h b/block/blk.h index d45be871329e..85f6ae42f7d3 100644 --- a/block/blk.h +++ b/block/blk.h | |||
@@ -23,7 +23,8 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq, | |||
23 | struct bio *bio); | 23 | struct bio *bio); |
24 | int blk_rq_append_bio(struct request_queue *q, struct request *rq, | 24 | int blk_rq_append_bio(struct request_queue *q, struct request *rq, |
25 | struct bio *bio); | 25 | struct bio *bio); |
26 | void blk_drain_queue(struct request_queue *q, bool drain_all); | 26 | void blk_queue_bypass_start(struct request_queue *q); |
27 | void blk_queue_bypass_end(struct request_queue *q); | ||
27 | void blk_dequeue_request(struct request *rq); | 28 | void blk_dequeue_request(struct request *rq); |
28 | void __blk_queue_free_tags(struct request_queue *q); | 29 | void __blk_queue_free_tags(struct request_queue *q); |
29 | bool __blk_end_bidi_request(struct request *rq, int error, | 30 | bool __blk_end_bidi_request(struct request *rq, int error, |
@@ -144,9 +145,6 @@ void blk_queue_congestion_threshold(struct request_queue *q); | |||
144 | 145 | ||
145 | int blk_dev_init(void); | 146 | int blk_dev_init(void); |
146 | 147 | ||
147 | void elv_quiesce_start(struct request_queue *q); | ||
148 | void elv_quiesce_end(struct request_queue *q); | ||
149 | |||
150 | 148 | ||
151 | /* | 149 | /* |
152 | * Return the threshold (number of used requests) at which the queue is | 150 | * Return the threshold (number of used requests) at which the queue is |
@@ -186,32 +184,30 @@ static inline int blk_do_io_stat(struct request *rq) | |||
186 | */ | 184 | */ |
187 | void get_io_context(struct io_context *ioc); | 185 | void get_io_context(struct io_context *ioc); |
188 | struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q); | 186 | struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q); |
189 | struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask); | 187 | struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, |
188 | gfp_t gfp_mask); | ||
190 | void ioc_clear_queue(struct request_queue *q); | 189 | void ioc_clear_queue(struct request_queue *q); |
191 | 190 | ||
192 | void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_mask, | 191 | int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node); |
193 | int node); | ||
194 | 192 | ||
195 | /** | 193 | /** |
196 | * create_io_context - try to create task->io_context | 194 | * create_io_context - try to create task->io_context |
197 | * @task: target task | ||
198 | * @gfp_mask: allocation mask | 195 | * @gfp_mask: allocation mask |
199 | * @node: allocation node | 196 | * @node: allocation node |
200 | * | 197 | * |
201 | * If @task->io_context is %NULL, allocate a new io_context and install it. | 198 | * If %current->io_context is %NULL, allocate a new io_context and install |
202 | * Returns the current @task->io_context which may be %NULL if allocation | 199 | * it. Returns the current %current->io_context which may be %NULL if |
203 | * failed. | 200 | * allocation failed. |
204 | * | 201 | * |
205 | * Note that this function can't be called with IRQ disabled because | 202 | * Note that this function can't be called with IRQ disabled because |
206 | * task_lock which protects @task->io_context is IRQ-unsafe. | 203 | * task_lock which protects %current->io_context is IRQ-unsafe. |
207 | */ | 204 | */ |
208 | static inline struct io_context *create_io_context(struct task_struct *task, | 205 | static inline struct io_context *create_io_context(gfp_t gfp_mask, int node) |
209 | gfp_t gfp_mask, int node) | ||
210 | { | 206 | { |
211 | WARN_ON_ONCE(irqs_disabled()); | 207 | WARN_ON_ONCE(irqs_disabled()); |
212 | if (unlikely(!task->io_context)) | 208 | if (unlikely(!current->io_context)) |
213 | create_io_context_slowpath(task, gfp_mask, node); | 209 | create_task_io_context(current, gfp_mask, node); |
214 | return task->io_context; | 210 | return current->io_context; |
215 | } | 211 | } |
216 | 212 | ||
217 | /* | 213 | /* |
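The reworked create_io_context() drops the @task argument and only ever lazily initialises %current->io_context, returning whatever is there afterwards, NULL included. The lazy-initialise-and-return idiom in isolation (a static pointer stands in for the per-task field):

#include <stdio.h>
#include <stdlib.h>

struct io_context { int nr_tasks; };

/* Stands in for current->io_context of the calling task. */
static struct io_context *cur_ioc;

static void create_slowpath(void)
{
    struct io_context *ioc = calloc(1, sizeof(*ioc));

    if (ioc) {
        ioc->nr_tasks = 1;
        cur_ioc = ioc;
    }
    /* on allocation failure cur_ioc simply stays NULL */
}

static struct io_context *create_io_context(void)
{
    if (!cur_ioc)
        create_slowpath();
    return cur_ioc;        /* may still be NULL, exactly like the kernel helper */
}

int main(void)
{
    struct io_context *ioc = create_io_context();

    printf("io_context %s\n", ioc ? "present" : "missing");
    free(ioc);
    return 0;
}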
@@ -222,7 +218,6 @@ extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio); | |||
222 | extern void blk_throtl_drain(struct request_queue *q); | 218 | extern void blk_throtl_drain(struct request_queue *q); |
223 | extern int blk_throtl_init(struct request_queue *q); | 219 | extern int blk_throtl_init(struct request_queue *q); |
224 | extern void blk_throtl_exit(struct request_queue *q); | 220 | extern void blk_throtl_exit(struct request_queue *q); |
225 | extern void blk_throtl_release(struct request_queue *q); | ||
226 | #else /* CONFIG_BLK_DEV_THROTTLING */ | 221 | #else /* CONFIG_BLK_DEV_THROTTLING */ |
227 | static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio) | 222 | static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio) |
228 | { | 223 | { |
@@ -231,7 +226,6 @@ static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio) | |||
231 | static inline void blk_throtl_drain(struct request_queue *q) { } | 226 | static inline void blk_throtl_drain(struct request_queue *q) { } |
232 | static inline int blk_throtl_init(struct request_queue *q) { return 0; } | 227 | static inline int blk_throtl_init(struct request_queue *q) { return 0; } |
233 | static inline void blk_throtl_exit(struct request_queue *q) { } | 228 | static inline void blk_throtl_exit(struct request_queue *q) { } |
234 | static inline void blk_throtl_release(struct request_queue *q) { } | ||
235 | #endif /* CONFIG_BLK_DEV_THROTTLING */ | 229 | #endif /* CONFIG_BLK_DEV_THROTTLING */ |
236 | 230 | ||
237 | #endif /* BLK_INTERNAL_H */ | 231 | #endif /* BLK_INTERNAL_H */ |
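The #else half keeps callers compiling when CONFIG_BLK_DEV_THROTTLING is off by supplying static inline no-ops, and blk_throtl_release() is now gone from both halves. The stub idiom in miniature (the config macro and names are stand-ins):

#include <stdio.h>

#define CONFIG_THROTTLING 1            /* flip to 0 to build the no-op stubs */

#if CONFIG_THROTTLING
static int throttle_init(void) { printf("throttling enabled\n"); return 0; }
static void throttle_exit(void) { printf("throttling disabled\n"); }
#else
static inline int throttle_init(void) { return 0; }
static inline void throttle_exit(void) { }
#endif

int main(void)
{
    int ret = throttle_init();

    throttle_exit();
    return ret;
}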
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 3c38536bd52c..673c977cc2bf 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c | |||
@@ -15,7 +15,9 @@ | |||
15 | #include <linux/ioprio.h> | 15 | #include <linux/ioprio.h> |
16 | #include <linux/blktrace_api.h> | 16 | #include <linux/blktrace_api.h> |
17 | #include "blk.h" | 17 | #include "blk.h" |
18 | #include "cfq.h" | 18 | #include "blk-cgroup.h" |
19 | |||
20 | static struct blkcg_policy blkcg_policy_cfq __maybe_unused; | ||
19 | 21 | ||
20 | /* | 22 | /* |
21 | * tunables | 23 | * tunables |
@@ -171,8 +173,53 @@ enum wl_type_t { | |||
171 | SYNC_WORKLOAD = 2 | 173 | SYNC_WORKLOAD = 2 |
172 | }; | 174 | }; |
173 | 175 | ||
176 | struct cfqg_stats { | ||
177 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
178 | /* total bytes transferred */ | ||
179 | struct blkg_rwstat service_bytes; | ||
180 | /* total IOs serviced, post merge */ | ||
181 | struct blkg_rwstat serviced; | ||
182 | /* number of ios merged */ | ||
183 | struct blkg_rwstat merged; | ||
184 | /* total time spent on device in ns, may not be accurate w/ queueing */ | ||
185 | struct blkg_rwstat service_time; | ||
186 | /* total time spent waiting in scheduler queue in ns */ | ||
187 | struct blkg_rwstat wait_time; | ||
188 | /* number of IOs queued up */ | ||
189 | struct blkg_rwstat queued; | ||
190 | /* total sectors transferred */ | ||
191 | struct blkg_stat sectors; | ||
192 | /* total disk time and nr sectors dispatched by this group */ | ||
193 | struct blkg_stat time; | ||
194 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
195 | /* time not charged to this cgroup */ | ||
196 | struct blkg_stat unaccounted_time; | ||
197 | /* sum of number of ios queued across all samples */ | ||
198 | struct blkg_stat avg_queue_size_sum; | ||
199 | /* count of samples taken for average */ | ||
200 | struct blkg_stat avg_queue_size_samples; | ||
201 | /* how many times this group has been removed from service tree */ | ||
202 | struct blkg_stat dequeue; | ||
203 | /* total time spent waiting for it to be assigned a timeslice. */ | ||
204 | struct blkg_stat group_wait_time; | ||
205 | /* time spent idling for this blkcg_gq */ | ||
206 | struct blkg_stat idle_time; | ||
207 | /* total time with empty current active q with other requests queued */ | ||
208 | struct blkg_stat empty_time; | ||
209 | /* fields after this shouldn't be cleared on stat reset */ | ||
210 | uint64_t start_group_wait_time; | ||
211 | uint64_t start_idle_time; | ||
212 | uint64_t start_empty_time; | ||
213 | uint16_t flags; | ||
214 | #endif /* CONFIG_DEBUG_BLK_CGROUP */ | ||
215 | #endif /* CONFIG_CFQ_GROUP_IOSCHED */ | ||
216 | }; | ||
217 | |||
174 | /* This is per cgroup per device grouping structure */ | 218 | /* This is per cgroup per device grouping structure */ |
175 | struct cfq_group { | 219 | struct cfq_group { |
220 | /* must be the first member */ | ||
221 | struct blkg_policy_data pd; | ||
222 | |||
176 | /* group service_tree member */ | 223 | /* group service_tree member */ |
177 | struct rb_node rb_node; | 224 | struct rb_node rb_node; |
178 | 225 | ||
@@ -180,7 +227,7 @@ struct cfq_group { | |||
180 | u64 vdisktime; | 227 | u64 vdisktime; |
181 | unsigned int weight; | 228 | unsigned int weight; |
182 | unsigned int new_weight; | 229 | unsigned int new_weight; |
183 | bool needs_update; | 230 | unsigned int dev_weight; |
184 | 231 | ||
185 | /* number of cfqq currently on this group */ | 232 | /* number of cfqq currently on this group */ |
186 | int nr_cfqq; | 233 | int nr_cfqq; |
@@ -206,20 +253,21 @@ struct cfq_group { | |||
206 | unsigned long saved_workload_slice; | 253 | unsigned long saved_workload_slice; |
207 | enum wl_type_t saved_workload; | 254 | enum wl_type_t saved_workload; |
208 | enum wl_prio_t saved_serving_prio; | 255 | enum wl_prio_t saved_serving_prio; |
209 | struct blkio_group blkg; | 256 | |
210 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
211 | struct hlist_node cfqd_node; | ||
212 | int ref; | ||
213 | #endif | ||
214 | /* number of requests that are on the dispatch list or inside driver */ | 257 | /* number of requests that are on the dispatch list or inside driver */ |
215 | int dispatched; | 258 | int dispatched; |
216 | struct cfq_ttime ttime; | 259 | struct cfq_ttime ttime; |
260 | struct cfqg_stats stats; | ||
217 | }; | 261 | }; |
218 | 262 | ||
219 | struct cfq_io_cq { | 263 | struct cfq_io_cq { |
220 | struct io_cq icq; /* must be the first member */ | 264 | struct io_cq icq; /* must be the first member */ |
221 | struct cfq_queue *cfqq[2]; | 265 | struct cfq_queue *cfqq[2]; |
222 | struct cfq_ttime ttime; | 266 | struct cfq_ttime ttime; |
267 | int ioprio; /* the current ioprio */ | ||
268 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
269 | uint64_t blkcg_id; /* the current blkcg ID */ | ||
270 | #endif | ||
223 | }; | 271 | }; |
224 | 272 | ||
225 | /* | 273 | /* |
@@ -229,7 +277,7 @@ struct cfq_data { | |||
229 | struct request_queue *queue; | 277 | struct request_queue *queue; |
230 | /* Root service tree for cfq_groups */ | 278 | /* Root service tree for cfq_groups */ |
231 | struct cfq_rb_root grp_service_tree; | 279 | struct cfq_rb_root grp_service_tree; |
232 | struct cfq_group root_group; | 280 | struct cfq_group *root_group; |
233 | 281 | ||
234 | /* | 282 | /* |
235 | * The priority currently being served | 283 | * The priority currently being served |
@@ -303,12 +351,6 @@ struct cfq_data { | |||
303 | struct cfq_queue oom_cfqq; | 351 | struct cfq_queue oom_cfqq; |
304 | 352 | ||
305 | unsigned long last_delayed_sync; | 353 | unsigned long last_delayed_sync; |
306 | |||
307 | /* List of cfq groups being managed on this device*/ | ||
308 | struct hlist_head cfqg_list; | ||
309 | |||
310 | /* Number of groups which are on blkcg->blkg_list */ | ||
311 | unsigned int nr_blkcg_linked_grps; | ||
312 | }; | 354 | }; |
313 | 355 | ||
314 | static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); | 356 | static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); |
@@ -371,21 +413,284 @@ CFQ_CFQQ_FNS(deep); | |||
371 | CFQ_CFQQ_FNS(wait_busy); | 413 | CFQ_CFQQ_FNS(wait_busy); |
372 | #undef CFQ_CFQQ_FNS | 414 | #undef CFQ_CFQQ_FNS |
373 | 415 | ||
416 | static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd) | ||
417 | { | ||
418 | return pd ? container_of(pd, struct cfq_group, pd) : NULL; | ||
419 | } | ||
420 | |||
421 | static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg) | ||
422 | { | ||
423 | return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq)); | ||
424 | } | ||
425 | |||
426 | static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg) | ||
427 | { | ||
428 | return pd_to_blkg(&cfqg->pd); | ||
429 | } | ||
430 | |||
431 | #if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) | ||
432 | |||
433 | /* cfqg stats flags */ | ||
434 | enum cfqg_stats_flags { | ||
435 | CFQG_stats_waiting = 0, | ||
436 | CFQG_stats_idling, | ||
437 | CFQG_stats_empty, | ||
438 | }; | ||
439 | |||
440 | #define CFQG_FLAG_FNS(name) \ | ||
441 | static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats) \ | ||
442 | { \ | ||
443 | stats->flags |= (1 << CFQG_stats_##name); \ | ||
444 | } \ | ||
445 | static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats) \ | ||
446 | { \ | ||
447 | stats->flags &= ~(1 << CFQG_stats_##name); \ | ||
448 | } \ | ||
449 | static inline int cfqg_stats_##name(struct cfqg_stats *stats) \ | ||
450 | { \ | ||
451 | return (stats->flags & (1 << CFQG_stats_##name)) != 0; \ | ||
452 | } \ | ||
453 | |||
454 | CFQG_FLAG_FNS(waiting) | ||
455 | CFQG_FLAG_FNS(idling) | ||
456 | CFQG_FLAG_FNS(empty) | ||
457 | #undef CFQG_FLAG_FNS | ||
458 | |||
459 | /* This should be called with the queue_lock held. */ | ||
460 | static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats) | ||
461 | { | ||
462 | unsigned long long now; | ||
463 | |||
464 | if (!cfqg_stats_waiting(stats)) | ||
465 | return; | ||
466 | |||
467 | now = sched_clock(); | ||
468 | if (time_after64(now, stats->start_group_wait_time)) | ||
469 | blkg_stat_add(&stats->group_wait_time, | ||
470 | now - stats->start_group_wait_time); | ||
471 | cfqg_stats_clear_waiting(stats); | ||
472 | } | ||
473 | |||
474 | /* This should be called with the queue_lock held. */ | ||
475 | static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, | ||
476 | struct cfq_group *curr_cfqg) | ||
477 | { | ||
478 | struct cfqg_stats *stats = &cfqg->stats; | ||
479 | |||
480 | if (cfqg_stats_waiting(stats)) | ||
481 | return; | ||
482 | if (cfqg == curr_cfqg) | ||
483 | return; | ||
484 | stats->start_group_wait_time = sched_clock(); | ||
485 | cfqg_stats_mark_waiting(stats); | ||
486 | } | ||
487 | |||
488 | /* This should be called with the queue_lock held. */ | ||
489 | static void cfqg_stats_end_empty_time(struct cfqg_stats *stats) | ||
490 | { | ||
491 | unsigned long long now; | ||
492 | |||
493 | if (!cfqg_stats_empty(stats)) | ||
494 | return; | ||
495 | |||
496 | now = sched_clock(); | ||
497 | if (time_after64(now, stats->start_empty_time)) | ||
498 | blkg_stat_add(&stats->empty_time, | ||
499 | now - stats->start_empty_time); | ||
500 | cfqg_stats_clear_empty(stats); | ||
501 | } | ||
502 | |||
503 | static void cfqg_stats_update_dequeue(struct cfq_group *cfqg) | ||
504 | { | ||
505 | blkg_stat_add(&cfqg->stats.dequeue, 1); | ||
506 | } | ||
507 | |||
508 | static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) | ||
509 | { | ||
510 | struct cfqg_stats *stats = &cfqg->stats; | ||
511 | |||
512 | if (blkg_rwstat_sum(&stats->queued)) | ||
513 | return; | ||
514 | |||
515 | /* | ||
516 | * group is already marked empty. This can happen if cfqq got new | ||
517 | * request in parent group and moved to this group while being added | ||
518 | * to service tree. Just ignore the event and move on. | ||
519 | */ | ||
520 | if (cfqg_stats_empty(stats)) | ||
521 | return; | ||
522 | |||
523 | stats->start_empty_time = sched_clock(); | ||
524 | cfqg_stats_mark_empty(stats); | ||
525 | } | ||
526 | |||
527 | static void cfqg_stats_update_idle_time(struct cfq_group *cfqg) | ||
528 | { | ||
529 | struct cfqg_stats *stats = &cfqg->stats; | ||
530 | |||
531 | if (cfqg_stats_idling(stats)) { | ||
532 | unsigned long long now = sched_clock(); | ||
533 | |||
534 | if (time_after64(now, stats->start_idle_time)) | ||
535 | blkg_stat_add(&stats->idle_time, | ||
536 | now - stats->start_idle_time); | ||
537 | cfqg_stats_clear_idling(stats); | ||
538 | } | ||
539 | } | ||
540 | |||
541 | static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) | ||
542 | { | ||
543 | struct cfqg_stats *stats = &cfqg->stats; | ||
544 | |||
545 | BUG_ON(cfqg_stats_idling(stats)); | ||
546 | |||
547 | stats->start_idle_time = sched_clock(); | ||
548 | cfqg_stats_mark_idling(stats); | ||
549 | } | ||
550 | |||
551 | static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) | ||
552 | { | ||
553 | struct cfqg_stats *stats = &cfqg->stats; | ||
554 | |||
555 | blkg_stat_add(&stats->avg_queue_size_sum, | ||
556 | blkg_rwstat_sum(&stats->queued)); | ||
557 | blkg_stat_add(&stats->avg_queue_size_samples, 1); | ||
558 | cfqg_stats_update_group_wait_time(stats); | ||
559 | } | ||
560 | |||
561 | #else /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ | ||
562 | |||
563 | static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, struct cfq_group *curr_cfqg) { } | ||
564 | static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { } | ||
565 | static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { } | ||
566 | static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { } | ||
567 | static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { } | ||
568 | static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { } | ||
569 | static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { } | ||
570 | |||
571 | #endif /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ | ||
572 | |||
374 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | 573 | #ifdef CONFIG_CFQ_GROUP_IOSCHED |
375 | #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ | 574 | |
575 | static inline void cfqg_get(struct cfq_group *cfqg) | ||
576 | { | ||
577 | return blkg_get(cfqg_to_blkg(cfqg)); | ||
578 | } | ||
579 | |||
580 | static inline void cfqg_put(struct cfq_group *cfqg) | ||
581 | { | ||
582 | return blkg_put(cfqg_to_blkg(cfqg)); | ||
583 | } | ||
584 | |||
585 | #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) do { \ | ||
586 | char __pbuf[128]; \ | ||
587 | \ | ||
588 | blkg_path(cfqg_to_blkg((cfqq)->cfqg), __pbuf, sizeof(__pbuf)); \ | ||
376 | blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ | 589 | blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ |
377 | cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ | 590 | cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ |
378 | blkg_path(&(cfqq)->cfqg->blkg), ##args) | 591 | __pbuf, ##args); \ |
592 | } while (0) | ||
379 | 593 | ||
380 | #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \ | 594 | #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do { \ |
381 | blk_add_trace_msg((cfqd)->queue, "%s " fmt, \ | 595 | char __pbuf[128]; \ |
382 | blkg_path(&(cfqg)->blkg), ##args) \ | 596 | \ |
597 | blkg_path(cfqg_to_blkg(cfqg), __pbuf, sizeof(__pbuf)); \ | ||
598 | blk_add_trace_msg((cfqd)->queue, "%s " fmt, __pbuf, ##args); \ | ||
599 | } while (0) | ||
600 | |||
601 | static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, | ||
602 | struct cfq_group *curr_cfqg, int rw) | ||
603 | { | ||
604 | blkg_rwstat_add(&cfqg->stats.queued, rw, 1); | ||
605 | cfqg_stats_end_empty_time(&cfqg->stats); | ||
606 | cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg); | ||
607 | } | ||
608 | |||
609 | static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, | ||
610 | unsigned long time, unsigned long unaccounted_time) | ||
611 | { | ||
612 | blkg_stat_add(&cfqg->stats.time, time); | ||
613 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
614 | blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time); | ||
615 | #endif | ||
616 | } | ||
617 | |||
618 | static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) | ||
619 | { | ||
620 | blkg_rwstat_add(&cfqg->stats.queued, rw, -1); | ||
621 | } | ||
622 | |||
623 | static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) | ||
624 | { | ||
625 | blkg_rwstat_add(&cfqg->stats.merged, rw, 1); | ||
626 | } | ||
627 | |||
628 | static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg, | ||
629 | uint64_t bytes, int rw) | ||
630 | { | ||
631 | blkg_stat_add(&cfqg->stats.sectors, bytes >> 9); | ||
632 | blkg_rwstat_add(&cfqg->stats.serviced, rw, 1); | ||
633 | blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes); | ||
634 | } | ||
635 | |||
636 | static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, | ||
637 | uint64_t start_time, uint64_t io_start_time, int rw) | ||
638 | { | ||
639 | struct cfqg_stats *stats = &cfqg->stats; | ||
640 | unsigned long long now = sched_clock(); | ||
641 | |||
642 | if (time_after64(now, io_start_time)) | ||
643 | blkg_rwstat_add(&stats->service_time, rw, now - io_start_time); | ||
644 | if (time_after64(io_start_time, start_time)) | ||
645 | blkg_rwstat_add(&stats->wait_time, rw, | ||
646 | io_start_time - start_time); | ||
647 | } | ||
648 | |||
649 | static void cfq_pd_reset_stats(struct blkcg_gq *blkg) | ||
650 | { | ||
651 | struct cfq_group *cfqg = blkg_to_cfqg(blkg); | ||
652 | struct cfqg_stats *stats = &cfqg->stats; | ||
653 | |||
654 | /* queued stats shouldn't be cleared */ | ||
655 | blkg_rwstat_reset(&stats->service_bytes); | ||
656 | blkg_rwstat_reset(&stats->serviced); | ||
657 | blkg_rwstat_reset(&stats->merged); | ||
658 | blkg_rwstat_reset(&stats->service_time); | ||
659 | blkg_rwstat_reset(&stats->wait_time); | ||
660 | blkg_stat_reset(&stats->time); | ||
661 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
662 | blkg_stat_reset(&stats->unaccounted_time); | ||
663 | blkg_stat_reset(&stats->avg_queue_size_sum); | ||
664 | blkg_stat_reset(&stats->avg_queue_size_samples); | ||
665 | blkg_stat_reset(&stats->dequeue); | ||
666 | blkg_stat_reset(&stats->group_wait_time); | ||
667 | blkg_stat_reset(&stats->idle_time); | ||
668 | blkg_stat_reset(&stats->empty_time); | ||
669 | #endif | ||
670 | } | ||
671 | |||
672 | #else /* CONFIG_CFQ_GROUP_IOSCHED */ | ||
673 | |||
674 | static inline void cfqg_get(struct cfq_group *cfqg) { } | ||
675 | static inline void cfqg_put(struct cfq_group *cfqg) { } | ||
383 | 676 | ||
384 | #else | ||
385 | #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ | 677 | #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ |
386 | blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) | 678 | blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) |
387 | #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0) | 679 | #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0) |
388 | #endif | 680 | |
681 | static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, | ||
682 | struct cfq_group *curr_cfqg, int rw) { } | ||
683 | static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, | ||
684 | unsigned long time, unsigned long unaccounted_time) { } | ||
685 | static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { } | ||
686 | static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { } | ||
687 | static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg, | ||
688 | uint64_t bytes, int rw) { } | ||
689 | static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, | ||
690 | uint64_t start_time, uint64_t io_start_time, int rw) { } | ||
691 | |||
692 | #endif /* CONFIG_CFQ_GROUP_IOSCHED */ | ||
693 | |||
389 | #define cfq_log(cfqd, fmt, args...) \ | 694 | #define cfq_log(cfqd, fmt, args...) \ |
390 | blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) | 695 | blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) |
391 | 696 | ||
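CFQG_FLAG_FNS() in the hunk above follows the same generator trick as the existing CFQ_CFQQ_FNS(): one macro stamps out matching mark/clear/test helpers per flag bit. A standalone example of the idiom:

#include <stdio.h>

struct stats { unsigned int flags; };

enum { FLAG_waiting, FLAG_idling, FLAG_empty };

#define FLAG_FNS(name)                                          \
static inline void mark_##name(struct stats *s)                 \
{ s->flags |= 1u << FLAG_##name; }                              \
static inline void clear_##name(struct stats *s)                \
{ s->flags &= ~(1u << FLAG_##name); }                           \
static inline int is_##name(const struct stats *s)              \
{ return (s->flags & (1u << FLAG_##name)) != 0; }

FLAG_FNS(waiting)
FLAG_FNS(idling)
FLAG_FNS(empty)
#undef FLAG_FNS

int main(void)
{
    struct stats st = { 0 };

    mark_waiting(&st);
    printf("waiting=%d idling=%d\n", is_waiting(&st), is_idling(&st));
    clear_waiting(&st);
    printf("waiting=%d\n", is_waiting(&st));
    return 0;
}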
@@ -466,8 +771,9 @@ static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, | |||
466 | } | 771 | } |
467 | 772 | ||
468 | static void cfq_dispatch_insert(struct request_queue *, struct request *); | 773 | static void cfq_dispatch_insert(struct request_queue *, struct request *); |
469 | static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool, | 774 | static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync, |
470 | struct io_context *, gfp_t); | 775 | struct cfq_io_cq *cic, struct bio *bio, |
776 | gfp_t gfp_mask); | ||
471 | 777 | ||
472 | static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq) | 778 | static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq) |
473 | { | 779 | { |
@@ -545,7 +851,7 @@ static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg) | |||
545 | { | 851 | { |
546 | u64 d = delta << CFQ_SERVICE_SHIFT; | 852 | u64 d = delta << CFQ_SERVICE_SHIFT; |
547 | 853 | ||
548 | d = d * BLKIO_WEIGHT_DEFAULT; | 854 | d = d * CFQ_WEIGHT_DEFAULT; |
549 | do_div(d, cfqg->weight); | 855 | do_div(d, cfqg->weight); |
550 | return d; | 856 | return d; |
551 | } | 857 | } |
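cfq_scale_slice() now scales against CFQ_WEIGHT_DEFAULT instead of BLKIO_WEIGHT_DEFAULT, but the arithmetic is unchanged: the used slice is shifted up, multiplied by the default weight and divided by the group's weight, so a heavier group accrues vdisktime more slowly. A worked example with stand-in constants (the real values live in blk-cgroup.h):

#include <stdint.h>
#include <stdio.h>

#define SERVICE_SHIFT  12     /* stand-in for CFQ_SERVICE_SHIFT */
#define WEIGHT_DEFAULT 500    /* stand-in for CFQ_WEIGHT_DEFAULT */

static uint64_t scale_slice(unsigned long delta, unsigned int weight)
{
    uint64_t d = (uint64_t)delta << SERVICE_SHIFT;

    d *= WEIGHT_DEFAULT;
    return d / weight;        /* plain division stands in for do_div() */
}

int main(void)
{
    /* The same 20-unit slice charged to a default-weight group and to a
     * group with twice the weight: the latter's vdisktime grows half as fast. */
    printf("weight 500:  %llu\n", (unsigned long long)scale_slice(20, 500));
    printf("weight 1000: %llu\n", (unsigned long long)scale_slice(20, 1000));
    return 0;
}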
@@ -872,9 +1178,9 @@ static void | |||
872 | cfq_update_group_weight(struct cfq_group *cfqg) | 1178 | cfq_update_group_weight(struct cfq_group *cfqg) |
873 | { | 1179 | { |
874 | BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); | 1180 | BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); |
875 | if (cfqg->needs_update) { | 1181 | if (cfqg->new_weight) { |
876 | cfqg->weight = cfqg->new_weight; | 1182 | cfqg->weight = cfqg->new_weight; |
877 | cfqg->needs_update = false; | 1183 | cfqg->new_weight = 0; |
878 | } | 1184 | } |
879 | } | 1185 | } |
880 | 1186 | ||
@@ -936,7 +1242,7 @@ cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg) | |||
936 | cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); | 1242 | cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); |
937 | cfq_group_service_tree_del(st, cfqg); | 1243 | cfq_group_service_tree_del(st, cfqg); |
938 | cfqg->saved_workload_slice = 0; | 1244 | cfqg->saved_workload_slice = 0; |
939 | cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1); | 1245 | cfqg_stats_update_dequeue(cfqg); |
940 | } | 1246 | } |
941 | 1247 | ||
942 | static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq, | 1248 | static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq, |
@@ -1008,178 +1314,59 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, | |||
1008 | "sl_used=%u disp=%u charge=%u iops=%u sect=%lu", | 1314 | "sl_used=%u disp=%u charge=%u iops=%u sect=%lu", |
1009 | used_sl, cfqq->slice_dispatch, charge, | 1315 | used_sl, cfqq->slice_dispatch, charge, |
1010 | iops_mode(cfqd), cfqq->nr_sectors); | 1316 | iops_mode(cfqd), cfqq->nr_sectors); |
1011 | cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl, | 1317 | cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl); |
1012 | unaccounted_sl); | 1318 | cfqg_stats_set_start_empty_time(cfqg); |
1013 | cfq_blkiocg_set_start_empty_time(&cfqg->blkg); | ||
1014 | } | 1319 | } |
1015 | 1320 | ||
1016 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | 1321 | /** |
1017 | static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) | 1322 | * cfq_init_cfqg_base - initialize base part of a cfq_group |
1018 | { | 1323 | * @cfqg: cfq_group to initialize |
1019 | if (blkg) | 1324 | * |
1020 | return container_of(blkg, struct cfq_group, blkg); | 1325 | * Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED |
1021 | return NULL; | 1326 | * is enabled or not. |
1022 | } | ||
1023 | |||
1024 | static void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, | ||
1025 | unsigned int weight) | ||
1026 | { | ||
1027 | struct cfq_group *cfqg = cfqg_of_blkg(blkg); | ||
1028 | cfqg->new_weight = weight; | ||
1029 | cfqg->needs_update = true; | ||
1030 | } | ||
1031 | |||
1032 | static void cfq_init_add_cfqg_lists(struct cfq_data *cfqd, | ||
1033 | struct cfq_group *cfqg, struct blkio_cgroup *blkcg) | ||
1034 | { | ||
1035 | struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; | ||
1036 | unsigned int major, minor; | ||
1037 | |||
1038 | /* | ||
1039 | * Add group onto cgroup list. It might happen that bdi->dev is | ||
1040 | * not initialized yet. Initialize this new group without major | ||
1041 | * and minor info and this info will be filled in once a new thread | ||
1042 | * comes for IO. | ||
1043 | */ | ||
1044 | if (bdi->dev) { | ||
1045 | sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); | ||
1046 | cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, | ||
1047 | (void *)cfqd, MKDEV(major, minor)); | ||
1048 | } else | ||
1049 | cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, | ||
1050 | (void *)cfqd, 0); | ||
1051 | |||
1052 | cfqd->nr_blkcg_linked_grps++; | ||
1053 | cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); | ||
1054 | |||
1055 | /* Add group on cfqd list */ | ||
1056 | hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); | ||
1057 | } | ||
1058 | |||
1059 | /* | ||
1060 | * Should be called from sleepable context. No request queue lock as per | ||
1061 | * cpu stats are allocated dynamically and alloc_percpu needs to be called | ||
1062 | * from sleepable context. | ||
1063 | */ | 1327 | */ |
1064 | static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd) | 1328 | static void cfq_init_cfqg_base(struct cfq_group *cfqg) |
1065 | { | 1329 | { |
1066 | struct cfq_group *cfqg = NULL; | ||
1067 | int i, j, ret; | ||
1068 | struct cfq_rb_root *st; | 1330 | struct cfq_rb_root *st; |
1069 | 1331 | int i, j; | |
1070 | cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); | ||
1071 | if (!cfqg) | ||
1072 | return NULL; | ||
1073 | 1332 | ||
1074 | for_each_cfqg_st(cfqg, i, j, st) | 1333 | for_each_cfqg_st(cfqg, i, j, st) |
1075 | *st = CFQ_RB_ROOT; | 1334 | *st = CFQ_RB_ROOT; |
1076 | RB_CLEAR_NODE(&cfqg->rb_node); | 1335 | RB_CLEAR_NODE(&cfqg->rb_node); |
1077 | 1336 | ||
1078 | cfqg->ttime.last_end_request = jiffies; | 1337 | cfqg->ttime.last_end_request = jiffies; |
1079 | |||
1080 | /* | ||
1081 | * Take the initial reference that will be released on destroy | ||
1082 | * This can be thought of a joint reference by cgroup and | ||
1083 | * elevator which will be dropped by either elevator exit | ||
1084 | * or cgroup deletion path depending on who is exiting first. | ||
1085 | */ | ||
1086 | cfqg->ref = 1; | ||
1087 | |||
1088 | ret = blkio_alloc_blkg_stats(&cfqg->blkg); | ||
1089 | if (ret) { | ||
1090 | kfree(cfqg); | ||
1091 | return NULL; | ||
1092 | } | ||
1093 | |||
1094 | return cfqg; | ||
1095 | } | 1338 | } |
1096 | 1339 | ||
1097 | static struct cfq_group * | 1340 | #ifdef CONFIG_CFQ_GROUP_IOSCHED |
1098 | cfq_find_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg) | 1341 | static void cfq_pd_init(struct blkcg_gq *blkg) |
1099 | { | 1342 | { |
1100 | struct cfq_group *cfqg = NULL; | 1343 | struct cfq_group *cfqg = blkg_to_cfqg(blkg); |
1101 | void *key = cfqd; | ||
1102 | struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; | ||
1103 | unsigned int major, minor; | ||
1104 | |||
1105 | /* | ||
1106 | * This is the common case when there are no blkio cgroups. | ||
1107 | * Avoid lookup in this case | ||
1108 | */ | ||
1109 | if (blkcg == &blkio_root_cgroup) | ||
1110 | cfqg = &cfqd->root_group; | ||
1111 | else | ||
1112 | cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); | ||
1113 | |||
1114 | if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { | ||
1115 | sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); | ||
1116 | cfqg->blkg.dev = MKDEV(major, minor); | ||
1117 | } | ||
1118 | 1344 | ||
1119 | return cfqg; | 1345 | cfq_init_cfqg_base(cfqg); |
1346 | cfqg->weight = blkg->blkcg->cfq_weight; | ||
1120 | } | 1347 | } |
1121 | 1348 | ||
1122 | /* | 1349 | /* |
1123 | * Search for the cfq group current task belongs to. request_queue lock must | 1350 | * Search for the cfq group current task belongs to. request_queue lock must |
1124 | * be held. | 1351 | * be held. |
1125 | */ | 1352 | */ |
1126 | static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd) | 1353 | static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, |
1354 | struct blkcg *blkcg) | ||
1127 | { | 1355 | { |
1128 | struct blkio_cgroup *blkcg; | ||
1129 | struct cfq_group *cfqg = NULL, *__cfqg = NULL; | ||
1130 | struct request_queue *q = cfqd->queue; | 1356 | struct request_queue *q = cfqd->queue; |
1357 | struct cfq_group *cfqg = NULL; | ||
1131 | 1358 | ||
1132 | rcu_read_lock(); | 1359 | /* avoid lookup for the common case where there's no blkcg */ |
1133 | blkcg = task_blkio_cgroup(current); | 1360 | if (blkcg == &blkcg_root) { |
1134 | cfqg = cfq_find_cfqg(cfqd, blkcg); | 1361 | cfqg = cfqd->root_group; |
1135 | if (cfqg) { | 1362 | } else { |
1136 | rcu_read_unlock(); | 1363 | struct blkcg_gq *blkg; |
1137 | return cfqg; | ||
1138 | } | ||
1139 | |||
1140 | /* | ||
1141 | * Need to allocate a group. Allocation of group also needs allocation | ||
1142 | * of per cpu stats which in-turn takes a mutex() and can block. Hence | ||
1143 | * we need to drop rcu lock and queue_lock before we call alloc. | ||
1144 | * | ||
1145 | * Not taking any queue reference here and assuming that queue is | ||
1146 | * around by the time we return. CFQ queue allocation code does | ||
1147 | * the same. It might be racy though. | ||
1148 | */ | ||
1149 | |||
1150 | rcu_read_unlock(); | ||
1151 | spin_unlock_irq(q->queue_lock); | ||
1152 | |||
1153 | cfqg = cfq_alloc_cfqg(cfqd); | ||
1154 | |||
1155 | spin_lock_irq(q->queue_lock); | ||
1156 | |||
1157 | rcu_read_lock(); | ||
1158 | blkcg = task_blkio_cgroup(current); | ||
1159 | |||
1160 | /* | ||
1161 | * If some other thread already allocated the group while we were | ||
1162 | * not holding queue lock, free up the group | ||
1163 | */ | ||
1164 | __cfqg = cfq_find_cfqg(cfqd, blkcg); | ||
1165 | 1364 | ||
1166 | if (__cfqg) { | 1365 | blkg = blkg_lookup_create(blkcg, q); |
1167 | kfree(cfqg); | 1366 | if (!IS_ERR(blkg)) |
1168 | rcu_read_unlock(); | 1367 | cfqg = blkg_to_cfqg(blkg); |
1169 | return __cfqg; | ||
1170 | } | 1368 | } |
1171 | 1369 | ||
1172 | if (!cfqg) | ||
1173 | cfqg = &cfqd->root_group; | ||
1174 | |||
1175 | cfq_init_add_cfqg_lists(cfqd, cfqg, blkcg); | ||
1176 | rcu_read_unlock(); | ||
1177 | return cfqg; | ||
1178 | } | ||
1179 | |||
1180 | static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) | ||
1181 | { | ||
1182 | cfqg->ref++; | ||
1183 | return cfqg; | 1370 | return cfqg; |
1184 | } | 1371 | } |
1185 | 1372 | ||
@@ -1187,94 +1374,224 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) | |||
1187 | { | 1374 | { |
1188 | /* Currently, all async queues are mapped to root group */ | 1375 | /* Currently, all async queues are mapped to root group */ |
1189 | if (!cfq_cfqq_sync(cfqq)) | 1376 | if (!cfq_cfqq_sync(cfqq)) |
1190 | cfqg = &cfqq->cfqd->root_group; | 1377 | cfqg = cfqq->cfqd->root_group; |
1191 | 1378 | ||
1192 | cfqq->cfqg = cfqg; | 1379 | cfqq->cfqg = cfqg; |
1193 | /* cfqq reference on cfqg */ | 1380 | /* cfqq reference on cfqg */ |
1194 | cfqq->cfqg->ref++; | 1381 | cfqg_get(cfqg); |
1195 | } | 1382 | } |
1196 | 1383 | ||
1197 | static void cfq_put_cfqg(struct cfq_group *cfqg) | 1384 | static u64 cfqg_prfill_weight_device(struct seq_file *sf, |
1385 | struct blkg_policy_data *pd, int off) | ||
1198 | { | 1386 | { |
1199 | struct cfq_rb_root *st; | 1387 | struct cfq_group *cfqg = pd_to_cfqg(pd); |
1200 | int i, j; | ||
1201 | 1388 | ||
1202 | BUG_ON(cfqg->ref <= 0); | 1389 | if (!cfqg->dev_weight) |
1203 | cfqg->ref--; | 1390 | return 0; |
1204 | if (cfqg->ref) | 1391 | return __blkg_prfill_u64(sf, pd, cfqg->dev_weight); |
1205 | return; | ||
1206 | for_each_cfqg_st(cfqg, i, j, st) | ||
1207 | BUG_ON(!RB_EMPTY_ROOT(&st->rb)); | ||
1208 | free_percpu(cfqg->blkg.stats_cpu); | ||
1209 | kfree(cfqg); | ||
1210 | } | 1392 | } |
1211 | 1393 | ||
1212 | static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg) | 1394 | static int cfqg_print_weight_device(struct cgroup *cgrp, struct cftype *cft, |
1395 | struct seq_file *sf) | ||
1213 | { | 1396 | { |
1214 | /* Something wrong if we are trying to remove same group twice */ | 1397 | blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), |
1215 | BUG_ON(hlist_unhashed(&cfqg->cfqd_node)); | 1398 | cfqg_prfill_weight_device, &blkcg_policy_cfq, 0, |
1399 | false); | ||
1400 | return 0; | ||
1401 | } | ||
1216 | 1402 | ||
1217 | hlist_del_init(&cfqg->cfqd_node); | 1403 | static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft, |
1404 | struct seq_file *sf) | ||
1405 | { | ||
1406 | seq_printf(sf, "%u\n", cgroup_to_blkcg(cgrp)->cfq_weight); | ||
1407 | return 0; | ||
1408 | } | ||
1218 | 1409 | ||
1219 | BUG_ON(cfqd->nr_blkcg_linked_grps <= 0); | 1410 | static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, |
1220 | cfqd->nr_blkcg_linked_grps--; | 1411 | const char *buf) |
1412 | { | ||
1413 | struct blkcg *blkcg = cgroup_to_blkcg(cgrp); | ||
1414 | struct blkg_conf_ctx ctx; | ||
1415 | struct cfq_group *cfqg; | ||
1416 | int ret; | ||
1221 | 1417 | ||
1222 | /* | 1418 | ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx); |
1223 | * Put the reference taken at the time of creation so that when all | 1419 | if (ret) |
1224 | * queues are gone, group can be destroyed. | 1420 | return ret; |
1225 | */ | 1421 | |
1226 | cfq_put_cfqg(cfqg); | 1422 | ret = -EINVAL; |
1423 | cfqg = blkg_to_cfqg(ctx.blkg); | ||
1424 | if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) { | ||
1425 | cfqg->dev_weight = ctx.v; | ||
1426 | cfqg->new_weight = cfqg->dev_weight ?: blkcg->cfq_weight; | ||
1427 | ret = 0; | ||
1428 | } | ||
1429 | |||
1430 | blkg_conf_finish(&ctx); | ||
1431 | return ret; | ||
1227 | } | 1432 | } |
1228 | 1433 | ||
1229 | static void cfq_release_cfq_groups(struct cfq_data *cfqd) | 1434 | static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val) |
1230 | { | 1435 | { |
1231 | struct hlist_node *pos, *n; | 1436 | struct blkcg *blkcg = cgroup_to_blkcg(cgrp); |
1232 | struct cfq_group *cfqg; | 1437 | struct blkcg_gq *blkg; |
1438 | struct hlist_node *n; | ||
1233 | 1439 | ||
1234 | hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) { | 1440 | if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX) |
1235 | /* | 1441 | return -EINVAL; |
1236 | * If cgroup removal path got to blk_group first and removed | 1442 | |
1237 | * it from cgroup list, then it will take care of destroying | 1443 | spin_lock_irq(&blkcg->lock); |
1238 | * cfqg also. | 1444 | blkcg->cfq_weight = (unsigned int)val; |
1239 | */ | 1445 | |
1240 | if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg)) | 1446 | hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { |
1241 | cfq_destroy_cfqg(cfqd, cfqg); | 1447 | struct cfq_group *cfqg = blkg_to_cfqg(blkg); |
1448 | |||
1449 | if (cfqg && !cfqg->dev_weight) | ||
1450 | cfqg->new_weight = blkcg->cfq_weight; | ||
1242 | } | 1451 | } |
1452 | |||
1453 | spin_unlock_irq(&blkcg->lock); | ||
1454 | return 0; | ||
1243 | } | 1455 | } |
1244 | 1456 | ||
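The weight handling above boils down to one precedence rule: a non-zero per-device weight (cfq.weight_device) overrides the cgroup-wide default (cfq.weight), and writes are range-checked before being accepted. A small self-contained sketch of that precedence; the limits mirror CFQ_WEIGHT_MIN/MAX/DEFAULT but their exact values are an assumption here:

#include <stdio.h>

#define WEIGHT_MIN      10      /* assumed lower bound */
#define WEIGHT_MAX      1000    /* assumed upper bound */
#define WEIGHT_DEFAULT  500     /* assumed cgroup-wide default */

struct group_cfg {
        unsigned int dev_weight;        /* 0 means "no per-device override" */
};

/* Effective weight: the device override if set, else the cgroup default. */
static unsigned int effective_weight(const struct group_cfg *g,
                                     unsigned int cgroup_weight)
{
        return g->dev_weight ? g->dev_weight : cgroup_weight;
}

/* Per-device writes also accept 0, which clears the override. */
static int dev_weight_is_valid(unsigned long long v)
{
        return v == 0 || (v >= WEIGHT_MIN && v <= WEIGHT_MAX);
}

int main(void)
{
        struct group_cfg sda = { .dev_weight = 0 };
        struct group_cfg sdb = { .dev_weight = 300 };

        printf("sda: %u\n", effective_weight(&sda, WEIGHT_DEFAULT)); /* 500 */
        printf("sdb: %u\n", effective_weight(&sdb, WEIGHT_DEFAULT)); /* 300 */
        printf("valid(2000): %d\n", dev_weight_is_valid(2000));      /* 0 */
        return 0;
}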
1245 | /* | 1457 | static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft, |
1246 | * Blk cgroup controller notification saying that blkio_group object is being | 1458 | struct seq_file *sf) |
1247 | * delinked as associated cgroup object is going away. That also means that | ||
1248 | * no new IO will come in this group. So get rid of this group as soon as | ||
1249 | * any pending IO in the group is finished. | ||
1250 | * | ||
1251 | * This function is called under rcu_read_lock(). key is the rcu protected | ||
1252 | * pointer. That means "key" is a valid cfq_data pointer as long as we hold | ||
1253 | * the rcu read lock. | ||
1254 | * | ||
1255 | * "key" was fetched from blkio_group under blkio_cgroup->lock. That means | ||
1256 | * it should not be NULL as even if elevator was exiting, cgroup deletion | ||
1257 | * path got to it first. | ||
1258 | */ | ||
1259 | static void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg) | ||
1260 | { | 1459 | { |
1261 | unsigned long flags; | 1460 | struct blkcg *blkcg = cgroup_to_blkcg(cgrp); |
1262 | struct cfq_data *cfqd = key; | ||
1263 | 1461 | ||
1264 | spin_lock_irqsave(cfqd->queue->queue_lock, flags); | 1462 | blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkcg_policy_cfq, |
1265 | cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg)); | 1463 | cft->private, false); |
1266 | spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); | 1464 | return 0; |
1267 | } | 1465 | } |
1268 | 1466 | ||
1269 | #else /* GROUP_IOSCHED */ | 1467 | static int cfqg_print_rwstat(struct cgroup *cgrp, struct cftype *cft, |
1270 | static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd) | 1468 | struct seq_file *sf) |
1271 | { | 1469 | { |
1272 | return &cfqd->root_group; | 1470 | struct blkcg *blkcg = cgroup_to_blkcg(cgrp); |
1471 | |||
1472 | blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkcg_policy_cfq, | ||
1473 | cft->private, true); | ||
1474 | return 0; | ||
1273 | } | 1475 | } |
1274 | 1476 | ||
1275 | static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) | 1477 | #ifdef CONFIG_DEBUG_BLK_CGROUP |
1478 | static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, | ||
1479 | struct blkg_policy_data *pd, int off) | ||
1276 | { | 1480 | { |
1277 | return cfqg; | 1481 | struct cfq_group *cfqg = pd_to_cfqg(pd); |
1482 | u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples); | ||
1483 | u64 v = 0; | ||
1484 | |||
1485 | if (samples) { | ||
1486 | v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum); | ||
1487 | do_div(v, samples); | ||
1488 | } | ||
1489 | __blkg_prfill_u64(sf, pd, v); | ||
1490 | return 0; | ||
1491 | } | ||
1492 | |||
1493 | /* print avg_queue_size */ | ||
1494 | static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft, | ||
1495 | struct seq_file *sf) | ||
1496 | { | ||
1497 | struct blkcg *blkcg = cgroup_to_blkcg(cgrp); | ||
1498 | |||
1499 | blkcg_print_blkgs(sf, blkcg, cfqg_prfill_avg_queue_size, | ||
1500 | &blkcg_policy_cfq, 0, false); | ||
1501 | return 0; | ||
1502 | } | ||
1503 | #endif /* CONFIG_DEBUG_BLK_CGROUP */ | ||
1504 | |||
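cfqg_prfill_avg_queue_size() above reports a plain arithmetic mean kept as two counters, a running sum and a sample count, so no per-sample history needs to be stored. A minimal sketch of the same bookkeeping (field and function names are illustrative):

#include <stdint.h>
#include <stdio.h>

struct queue_size_stat {
        uint64_t sum;           /* total of all sampled queue depths */
        uint64_t samples;       /* number of samples taken */
};

/* Taken whenever a queue becomes active, one sample per activation. */
static void sample_queue_size(struct queue_size_stat *s, unsigned int depth)
{
        s->sum += depth;
        s->samples++;
}

/* Reported value: sum / samples, or 0 when nothing was sampled yet. */
static uint64_t avg_queue_size(const struct queue_size_stat *s)
{
        return s->samples ? s->sum / s->samples : 0;
}

int main(void)
{
        struct queue_size_stat st = { 0, 0 };

        sample_queue_size(&st, 4);
        sample_queue_size(&st, 8);
        printf("avg_queue_size: %llu\n",
               (unsigned long long)avg_queue_size(&st));        /* 6 */
        return 0;
}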
1505 | static struct cftype cfq_blkcg_files[] = { | ||
1506 | { | ||
1507 | .name = "weight_device", | ||
1508 | .read_seq_string = cfqg_print_weight_device, | ||
1509 | .write_string = cfqg_set_weight_device, | ||
1510 | .max_write_len = 256, | ||
1511 | }, | ||
1512 | { | ||
1513 | .name = "weight", | ||
1514 | .read_seq_string = cfq_print_weight, | ||
1515 | .write_u64 = cfq_set_weight, | ||
1516 | }, | ||
1517 | { | ||
1518 | .name = "time", | ||
1519 | .private = offsetof(struct cfq_group, stats.time), | ||
1520 | .read_seq_string = cfqg_print_stat, | ||
1521 | }, | ||
1522 | { | ||
1523 | .name = "sectors", | ||
1524 | .private = offsetof(struct cfq_group, stats.sectors), | ||
1525 | .read_seq_string = cfqg_print_stat, | ||
1526 | }, | ||
1527 | { | ||
1528 | .name = "io_service_bytes", | ||
1529 | .private = offsetof(struct cfq_group, stats.service_bytes), | ||
1530 | .read_seq_string = cfqg_print_rwstat, | ||
1531 | }, | ||
1532 | { | ||
1533 | .name = "io_serviced", | ||
1534 | .private = offsetof(struct cfq_group, stats.serviced), | ||
1535 | .read_seq_string = cfqg_print_rwstat, | ||
1536 | }, | ||
1537 | { | ||
1538 | .name = "io_service_time", | ||
1539 | .private = offsetof(struct cfq_group, stats.service_time), | ||
1540 | .read_seq_string = cfqg_print_rwstat, | ||
1541 | }, | ||
1542 | { | ||
1543 | .name = "io_wait_time", | ||
1544 | .private = offsetof(struct cfq_group, stats.wait_time), | ||
1545 | .read_seq_string = cfqg_print_rwstat, | ||
1546 | }, | ||
1547 | { | ||
1548 | .name = "io_merged", | ||
1549 | .private = offsetof(struct cfq_group, stats.merged), | ||
1550 | .read_seq_string = cfqg_print_rwstat, | ||
1551 | }, | ||
1552 | { | ||
1553 | .name = "io_queued", | ||
1554 | .private = offsetof(struct cfq_group, stats.queued), | ||
1555 | .read_seq_string = cfqg_print_rwstat, | ||
1556 | }, | ||
1557 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
1558 | { | ||
1559 | .name = "avg_queue_size", | ||
1560 | .read_seq_string = cfqg_print_avg_queue_size, | ||
1561 | }, | ||
1562 | { | ||
1563 | .name = "group_wait_time", | ||
1564 | .private = offsetof(struct cfq_group, stats.group_wait_time), | ||
1565 | .read_seq_string = cfqg_print_stat, | ||
1566 | }, | ||
1567 | { | ||
1568 | .name = "idle_time", | ||
1569 | .private = offsetof(struct cfq_group, stats.idle_time), | ||
1570 | .read_seq_string = cfqg_print_stat, | ||
1571 | }, | ||
1572 | { | ||
1573 | .name = "empty_time", | ||
1574 | .private = offsetof(struct cfq_group, stats.empty_time), | ||
1575 | .read_seq_string = cfqg_print_stat, | ||
1576 | }, | ||
1577 | { | ||
1578 | .name = "dequeue", | ||
1579 | .private = offsetof(struct cfq_group, stats.dequeue), | ||
1580 | .read_seq_string = cfqg_print_stat, | ||
1581 | }, | ||
1582 | { | ||
1583 | .name = "unaccounted_time", | ||
1584 | .private = offsetof(struct cfq_group, stats.unaccounted_time), | ||
1585 | .read_seq_string = cfqg_print_stat, | ||
1586 | }, | ||
1587 | #endif /* CONFIG_DEBUG_BLK_CGROUP */ | ||
1588 | { } /* terminate */ | ||
1589 | }; | ||
1590 | #else /* GROUP_IOSCHED */ | ||
1591 | static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, | ||
1592 | struct blkcg *blkcg) | ||
1593 | { | ||
1594 | return cfqd->root_group; | ||
1278 | } | 1595 | } |
1279 | 1596 | ||
1280 | static inline void | 1597 | static inline void |
@@ -1282,9 +1599,6 @@ cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { | |||
1282 | cfqq->cfqg = cfqg; | 1599 | cfqq->cfqg = cfqg; |
1283 | } | 1600 | } |
1284 | 1601 | ||
1285 | static void cfq_release_cfq_groups(struct cfq_data *cfqd) {} | ||
1286 | static inline void cfq_put_cfqg(struct cfq_group *cfqg) {} | ||
1287 | |||
1288 | #endif /* GROUP_IOSCHED */ | 1602 | #endif /* GROUP_IOSCHED */ |
1289 | 1603 | ||
1290 | /* | 1604 | /* |
@@ -1551,12 +1865,10 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) | |||
1551 | { | 1865 | { |
1552 | elv_rb_del(&cfqq->sort_list, rq); | 1866 | elv_rb_del(&cfqq->sort_list, rq); |
1553 | cfqq->queued[rq_is_sync(rq)]--; | 1867 | cfqq->queued[rq_is_sync(rq)]--; |
1554 | cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, | 1868 | cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags); |
1555 | rq_data_dir(rq), rq_is_sync(rq)); | ||
1556 | cfq_add_rq_rb(rq); | 1869 | cfq_add_rq_rb(rq); |
1557 | cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg, | 1870 | cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group, |
1558 | &cfqq->cfqd->serving_group->blkg, rq_data_dir(rq), | 1871 | rq->cmd_flags); |
1559 | rq_is_sync(rq)); | ||
1560 | } | 1872 | } |
1561 | 1873 | ||
1562 | static struct request * | 1874 | static struct request * |
@@ -1612,8 +1924,7 @@ static void cfq_remove_request(struct request *rq) | |||
1612 | cfq_del_rq_rb(rq); | 1924 | cfq_del_rq_rb(rq); |
1613 | 1925 | ||
1614 | cfqq->cfqd->rq_queued--; | 1926 | cfqq->cfqd->rq_queued--; |
1615 | cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, | 1927 | cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags); |
1616 | rq_data_dir(rq), rq_is_sync(rq)); | ||
1617 | if (rq->cmd_flags & REQ_PRIO) { | 1928 | if (rq->cmd_flags & REQ_PRIO) { |
1618 | WARN_ON(!cfqq->prio_pending); | 1929 | WARN_ON(!cfqq->prio_pending); |
1619 | cfqq->prio_pending--; | 1930 | cfqq->prio_pending--; |
@@ -1648,8 +1959,7 @@ static void cfq_merged_request(struct request_queue *q, struct request *req, | |||
1648 | static void cfq_bio_merged(struct request_queue *q, struct request *req, | 1959 | static void cfq_bio_merged(struct request_queue *q, struct request *req, |
1649 | struct bio *bio) | 1960 | struct bio *bio) |
1650 | { | 1961 | { |
1651 | cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg, | 1962 | cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_rw); |
1652 | bio_data_dir(bio), cfq_bio_sync(bio)); | ||
1653 | } | 1963 | } |
1654 | 1964 | ||
1655 | static void | 1965 | static void |
@@ -1671,8 +1981,7 @@ cfq_merged_requests(struct request_queue *q, struct request *rq, | |||
1671 | if (cfqq->next_rq == next) | 1981 | if (cfqq->next_rq == next) |
1672 | cfqq->next_rq = rq; | 1982 | cfqq->next_rq = rq; |
1673 | cfq_remove_request(next); | 1983 | cfq_remove_request(next); |
1674 | cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg, | 1984 | cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags); |
1675 | rq_data_dir(next), rq_is_sync(next)); | ||
1676 | 1985 | ||
1677 | cfqq = RQ_CFQQ(next); | 1986 | cfqq = RQ_CFQQ(next); |
1678 | /* | 1987 | /* |
@@ -1713,7 +2022,7 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq, | |||
1713 | static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) | 2022 | static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) |
1714 | { | 2023 | { |
1715 | del_timer(&cfqd->idle_slice_timer); | 2024 | del_timer(&cfqd->idle_slice_timer); |
1716 | cfq_blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg); | 2025 | cfqg_stats_update_idle_time(cfqq->cfqg); |
1717 | } | 2026 | } |
1718 | 2027 | ||
1719 | static void __cfq_set_active_queue(struct cfq_data *cfqd, | 2028 | static void __cfq_set_active_queue(struct cfq_data *cfqd, |
@@ -1722,7 +2031,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd, | |||
1722 | if (cfqq) { | 2031 | if (cfqq) { |
1723 | cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d", | 2032 | cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d", |
1724 | cfqd->serving_prio, cfqd->serving_type); | 2033 | cfqd->serving_prio, cfqd->serving_type); |
1725 | cfq_blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg); | 2034 | cfqg_stats_update_avg_queue_size(cfqq->cfqg); |
1726 | cfqq->slice_start = 0; | 2035 | cfqq->slice_start = 0; |
1727 | cfqq->dispatch_start = jiffies; | 2036 | cfqq->dispatch_start = jiffies; |
1728 | cfqq->allocated_slice = 0; | 2037 | cfqq->allocated_slice = 0; |
@@ -2043,7 +2352,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) | |||
2043 | * task has exited, don't wait | 2352 | * task has exited, don't wait |
2044 | */ | 2353 | */ |
2045 | cic = cfqd->active_cic; | 2354 | cic = cfqd->active_cic; |
2046 | if (!cic || !atomic_read(&cic->icq.ioc->nr_tasks)) | 2355 | if (!cic || !atomic_read(&cic->icq.ioc->active_ref)) |
2047 | return; | 2356 | return; |
2048 | 2357 | ||
2049 | /* | 2358 | /* |
@@ -2070,7 +2379,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) | |||
2070 | sl = cfqd->cfq_slice_idle; | 2379 | sl = cfqd->cfq_slice_idle; |
2071 | 2380 | ||
2072 | mod_timer(&cfqd->idle_slice_timer, jiffies + sl); | 2381 | mod_timer(&cfqd->idle_slice_timer, jiffies + sl); |
2073 | cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg); | 2382 | cfqg_stats_set_start_idle_time(cfqq->cfqg); |
2074 | cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl, | 2383 | cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl, |
2075 | group_idle ? 1 : 0); | 2384 | group_idle ? 1 : 0); |
2076 | } | 2385 | } |
@@ -2093,8 +2402,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) | |||
2093 | 2402 | ||
2094 | cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; | 2403 | cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; |
2095 | cfqq->nr_sectors += blk_rq_sectors(rq); | 2404 | cfqq->nr_sectors += blk_rq_sectors(rq); |
2096 | cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq), | 2405 | cfqg_stats_update_dispatch(cfqq->cfqg, blk_rq_bytes(rq), rq->cmd_flags); |
2097 | rq_data_dir(rq), rq_is_sync(rq)); | ||
2098 | } | 2406 | } |
2099 | 2407 | ||
2100 | /* | 2408 | /* |
@@ -2677,7 +2985,7 @@ static void cfq_put_queue(struct cfq_queue *cfqq) | |||
2677 | 2985 | ||
2678 | BUG_ON(cfq_cfqq_on_rr(cfqq)); | 2986 | BUG_ON(cfq_cfqq_on_rr(cfqq)); |
2679 | kmem_cache_free(cfq_pool, cfqq); | 2987 | kmem_cache_free(cfq_pool, cfqq); |
2680 | cfq_put_cfqg(cfqg); | 2988 | cfqg_put(cfqg); |
2681 | } | 2989 | } |
2682 | 2990 | ||
2683 | static void cfq_put_cooperator(struct cfq_queue *cfqq) | 2991 | static void cfq_put_cooperator(struct cfq_queue *cfqq) |
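cfqg_put() in cfq_put_queue() above takes over from the removed open-coded cfq_put_cfqg(), but the contract is unchanged: every holder (a cfqq, a request, the creator) owns one reference and the group is torn down only when the last one is dropped. A minimal sketch of that per-holder reference counting, with a plain counter since the real code is serialized by the queue lock (names are illustrative):

#include <assert.h>
#include <stdlib.h>

struct group {
        int ref;                /* number of holders: queues, requests, ... */
};

static struct group *group_alloc(void)
{
        struct group *g = calloc(1, sizeof(*g));

        if (g)
                g->ref = 1;     /* creator's reference */
        return g;
}

static void group_get(struct group *g)
{
        assert(g->ref > 0);     /* must already be held */
        g->ref++;
}

static void group_put(struct group *g)
{
        assert(g->ref > 0);
        if (--g->ref == 0)
                free(g);        /* last holder tears the group down */
}

int main(void)
{
        struct group *g = group_alloc();

        group_get(g);           /* e.g. a queue links itself to the group */
        group_put(g);           /* the queue goes away */
        group_put(g);           /* creator drops its ref; group is freed */
        return 0;
}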
@@ -2736,7 +3044,7 @@ static void cfq_exit_icq(struct io_cq *icq) | |||
2736 | } | 3044 | } |
2737 | } | 3045 | } |
2738 | 3046 | ||
2739 | static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) | 3047 | static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic) |
2740 | { | 3048 | { |
2741 | struct task_struct *tsk = current; | 3049 | struct task_struct *tsk = current; |
2742 | int ioprio_class; | 3050 | int ioprio_class; |
@@ -2744,7 +3052,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) | |||
2744 | if (!cfq_cfqq_prio_changed(cfqq)) | 3052 | if (!cfq_cfqq_prio_changed(cfqq)) |
2745 | return; | 3053 | return; |
2746 | 3054 | ||
2747 | ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio); | 3055 | ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); |
2748 | switch (ioprio_class) { | 3056 | switch (ioprio_class) { |
2749 | default: | 3057 | default: |
2750 | printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); | 3058 | printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); |
@@ -2756,11 +3064,11 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) | |||
2756 | cfqq->ioprio_class = task_nice_ioclass(tsk); | 3064 | cfqq->ioprio_class = task_nice_ioclass(tsk); |
2757 | break; | 3065 | break; |
2758 | case IOPRIO_CLASS_RT: | 3066 | case IOPRIO_CLASS_RT: |
2759 | cfqq->ioprio = task_ioprio(ioc); | 3067 | cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio); |
2760 | cfqq->ioprio_class = IOPRIO_CLASS_RT; | 3068 | cfqq->ioprio_class = IOPRIO_CLASS_RT; |
2761 | break; | 3069 | break; |
2762 | case IOPRIO_CLASS_BE: | 3070 | case IOPRIO_CLASS_BE: |
2763 | cfqq->ioprio = task_ioprio(ioc); | 3071 | cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio); |
2764 | cfqq->ioprio_class = IOPRIO_CLASS_BE; | 3072 | cfqq->ioprio_class = IOPRIO_CLASS_BE; |
2765 | break; | 3073 | break; |
2766 | case IOPRIO_CLASS_IDLE: | 3074 | case IOPRIO_CLASS_IDLE: |
@@ -2778,19 +3086,24 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) | |||
2778 | cfq_clear_cfqq_prio_changed(cfqq); | 3086 | cfq_clear_cfqq_prio_changed(cfqq); |
2779 | } | 3087 | } |
2780 | 3088 | ||
2781 | static void changed_ioprio(struct cfq_io_cq *cic) | 3089 | static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio) |
2782 | { | 3090 | { |
3091 | int ioprio = cic->icq.ioc->ioprio; | ||
2783 | struct cfq_data *cfqd = cic_to_cfqd(cic); | 3092 | struct cfq_data *cfqd = cic_to_cfqd(cic); |
2784 | struct cfq_queue *cfqq; | 3093 | struct cfq_queue *cfqq; |
2785 | 3094 | ||
2786 | if (unlikely(!cfqd)) | 3095 | /* |
3096 | * Check whether ioprio has changed. The condition may trigger | ||
3097 | * spuriously on a newly created cic but there's no harm. | ||
3098 | */ | ||
3099 | if (unlikely(!cfqd) || likely(cic->ioprio == ioprio)) | ||
2787 | return; | 3100 | return; |
2788 | 3101 | ||
2789 | cfqq = cic->cfqq[BLK_RW_ASYNC]; | 3102 | cfqq = cic->cfqq[BLK_RW_ASYNC]; |
2790 | if (cfqq) { | 3103 | if (cfqq) { |
2791 | struct cfq_queue *new_cfqq; | 3104 | struct cfq_queue *new_cfqq; |
2792 | new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->icq.ioc, | 3105 | new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio, |
2793 | GFP_ATOMIC); | 3106 | GFP_ATOMIC); |
2794 | if (new_cfqq) { | 3107 | if (new_cfqq) { |
2795 | cic->cfqq[BLK_RW_ASYNC] = new_cfqq; | 3108 | cic->cfqq[BLK_RW_ASYNC] = new_cfqq; |
2796 | cfq_put_queue(cfqq); | 3109 | cfq_put_queue(cfqq); |
@@ -2800,6 +3113,8 @@ static void changed_ioprio(struct cfq_io_cq *cic) | |||
2800 | cfqq = cic->cfqq[BLK_RW_SYNC]; | 3113 | cfqq = cic->cfqq[BLK_RW_SYNC]; |
2801 | if (cfqq) | 3114 | if (cfqq) |
2802 | cfq_mark_cfqq_prio_changed(cfqq); | 3115 | cfq_mark_cfqq_prio_changed(cfqq); |
3116 | |||
3117 | cic->ioprio = ioprio; | ||
2803 | } | 3118 | } |
2804 | 3119 | ||
2805 | static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, | 3120 | static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, |
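check_ioprio_changed() above, and check_blkcg_changed() just below, replace the ICQ_*_CHANGED notification bits (dropped from cfq_set_request() further down) with a simpler idiom: cache the last value acted upon in the cic and compare it on every request. A self-contained sketch of that compare-and-refresh idiom (types and names are illustrative):

#include <stdbool.h>
#include <stdio.h>

struct io_ctx {
        int ioprio;             /* authoritative value, may change at any time */
};

struct cached_ctx {
        int last_ioprio;        /* last value this consumer acted on */
};

/*
 * Returns true and refreshes the cache when the value changed since the
 * last call; no notifier is needed, and a spurious miss on a freshly
 * created cache entry is harmless.
 */
static bool ioprio_changed(struct cached_ctx *c, const struct io_ctx *ioc)
{
        if (c->last_ioprio == ioc->ioprio)
                return false;
        c->last_ioprio = ioc->ioprio;
        return true;
}

int main(void)
{
        struct io_ctx ioc = { .ioprio = 4 };
        struct cached_ctx cache = { .last_ioprio = 4 };

        printf("%d\n", ioprio_changed(&cache, &ioc));   /* 0: unchanged  */
        ioc.ioprio = 6;
        printf("%d\n", ioprio_changed(&cache, &ioc));   /* 1: changed    */
        printf("%d\n", ioprio_changed(&cache, &ioc));   /* 0: now cached */
        return 0;
}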
@@ -2823,17 +3138,24 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
2823 | } | 3138 | } |
2824 | 3139 | ||
2825 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | 3140 | #ifdef CONFIG_CFQ_GROUP_IOSCHED |
2826 | static void changed_cgroup(struct cfq_io_cq *cic) | 3141 | static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) |
2827 | { | 3142 | { |
2828 | struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1); | ||
2829 | struct cfq_data *cfqd = cic_to_cfqd(cic); | 3143 | struct cfq_data *cfqd = cic_to_cfqd(cic); |
2830 | struct request_queue *q; | 3144 | struct cfq_queue *sync_cfqq; |
3145 | uint64_t id; | ||
2831 | 3146 | ||
2832 | if (unlikely(!cfqd)) | 3147 | rcu_read_lock(); |
2833 | return; | 3148 | id = bio_blkcg(bio)->id; |
3149 | rcu_read_unlock(); | ||
2834 | 3150 | ||
2835 | q = cfqd->queue; | 3151 | /* |
3152 | * Check whether blkcg has changed. The condition may trigger | ||
3153 | * spuriously on a newly created cic but there's no harm. | ||
3154 | */ | ||
3155 | if (unlikely(!cfqd) || likely(cic->blkcg_id == id)) | ||
3156 | return; | ||
2836 | 3157 | ||
3158 | sync_cfqq = cic_to_cfqq(cic, 1); | ||
2837 | if (sync_cfqq) { | 3159 | if (sync_cfqq) { |
2838 | /* | 3160 | /* |
2839 | * Drop reference to sync queue. A new sync queue will be | 3161 | * Drop reference to sync queue. A new sync queue will be |
@@ -2843,21 +3165,26 @@ static void changed_cgroup(struct cfq_io_cq *cic) | |||
2843 | cic_set_cfqq(cic, NULL, 1); | 3165 | cic_set_cfqq(cic, NULL, 1); |
2844 | cfq_put_queue(sync_cfqq); | 3166 | cfq_put_queue(sync_cfqq); |
2845 | } | 3167 | } |
3168 | |||
3169 | cic->blkcg_id = id; | ||
2846 | } | 3170 | } |
3171 | #else | ||
3172 | static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { } | ||
2847 | #endif /* CONFIG_CFQ_GROUP_IOSCHED */ | 3173 | #endif /* CONFIG_CFQ_GROUP_IOSCHED */ |
2848 | 3174 | ||
2849 | static struct cfq_queue * | 3175 | static struct cfq_queue * |
2850 | cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, | 3176 | cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, |
2851 | struct io_context *ioc, gfp_t gfp_mask) | 3177 | struct bio *bio, gfp_t gfp_mask) |
2852 | { | 3178 | { |
3179 | struct blkcg *blkcg; | ||
2853 | struct cfq_queue *cfqq, *new_cfqq = NULL; | 3180 | struct cfq_queue *cfqq, *new_cfqq = NULL; |
2854 | struct cfq_io_cq *cic; | ||
2855 | struct cfq_group *cfqg; | 3181 | struct cfq_group *cfqg; |
2856 | 3182 | ||
2857 | retry: | 3183 | retry: |
2858 | cfqg = cfq_get_cfqg(cfqd); | 3184 | rcu_read_lock(); |
2859 | cic = cfq_cic_lookup(cfqd, ioc); | 3185 | |
2860 | /* cic always exists here */ | 3186 | blkcg = bio_blkcg(bio); |
3187 | cfqg = cfq_lookup_create_cfqg(cfqd, blkcg); | ||
2861 | cfqq = cic_to_cfqq(cic, is_sync); | 3188 | cfqq = cic_to_cfqq(cic, is_sync); |
2862 | 3189 | ||
2863 | /* | 3190 | /* |
@@ -2870,6 +3197,7 @@ retry: | |||
2870 | cfqq = new_cfqq; | 3197 | cfqq = new_cfqq; |
2871 | new_cfqq = NULL; | 3198 | new_cfqq = NULL; |
2872 | } else if (gfp_mask & __GFP_WAIT) { | 3199 | } else if (gfp_mask & __GFP_WAIT) { |
3200 | rcu_read_unlock(); | ||
2873 | spin_unlock_irq(cfqd->queue->queue_lock); | 3201 | spin_unlock_irq(cfqd->queue->queue_lock); |
2874 | new_cfqq = kmem_cache_alloc_node(cfq_pool, | 3202 | new_cfqq = kmem_cache_alloc_node(cfq_pool, |
2875 | gfp_mask | __GFP_ZERO, | 3203 | gfp_mask | __GFP_ZERO, |
@@ -2885,7 +3213,7 @@ retry: | |||
2885 | 3213 | ||
2886 | if (cfqq) { | 3214 | if (cfqq) { |
2887 | cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); | 3215 | cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); |
2888 | cfq_init_prio_data(cfqq, ioc); | 3216 | cfq_init_prio_data(cfqq, cic); |
2889 | cfq_link_cfqq_cfqg(cfqq, cfqg); | 3217 | cfq_link_cfqq_cfqg(cfqq, cfqg); |
2890 | cfq_log_cfqq(cfqd, cfqq, "alloced"); | 3218 | cfq_log_cfqq(cfqd, cfqq, "alloced"); |
2891 | } else | 3219 | } else |
@@ -2895,6 +3223,7 @@ retry: | |||
2895 | if (new_cfqq) | 3223 | if (new_cfqq) |
2896 | kmem_cache_free(cfq_pool, new_cfqq); | 3224 | kmem_cache_free(cfq_pool, new_cfqq); |
2897 | 3225 | ||
3226 | rcu_read_unlock(); | ||
2898 | return cfqq; | 3227 | return cfqq; |
2899 | } | 3228 | } |
2900 | 3229 | ||
@@ -2904,6 +3233,9 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) | |||
2904 | switch (ioprio_class) { | 3233 | switch (ioprio_class) { |
2905 | case IOPRIO_CLASS_RT: | 3234 | case IOPRIO_CLASS_RT: |
2906 | return &cfqd->async_cfqq[0][ioprio]; | 3235 | return &cfqd->async_cfqq[0][ioprio]; |
3236 | case IOPRIO_CLASS_NONE: | ||
3237 | ioprio = IOPRIO_NORM; | ||
3238 | /* fall through */ | ||
2907 | case IOPRIO_CLASS_BE: | 3239 | case IOPRIO_CLASS_BE: |
2908 | return &cfqd->async_cfqq[1][ioprio]; | 3240 | return &cfqd->async_cfqq[1][ioprio]; |
2909 | case IOPRIO_CLASS_IDLE: | 3241 | case IOPRIO_CLASS_IDLE: |
@@ -2914,11 +3246,11 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) | |||
2914 | } | 3246 | } |
2915 | 3247 | ||
2916 | static struct cfq_queue * | 3248 | static struct cfq_queue * |
2917 | cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc, | 3249 | cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, |
2918 | gfp_t gfp_mask) | 3250 | struct bio *bio, gfp_t gfp_mask) |
2919 | { | 3251 | { |
2920 | const int ioprio = task_ioprio(ioc); | 3252 | const int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); |
2921 | const int ioprio_class = task_ioprio_class(ioc); | 3253 | const int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); |
2922 | struct cfq_queue **async_cfqq = NULL; | 3254 | struct cfq_queue **async_cfqq = NULL; |
2923 | struct cfq_queue *cfqq = NULL; | 3255 | struct cfq_queue *cfqq = NULL; |
2924 | 3256 | ||
@@ -2928,7 +3260,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc, | |||
2928 | } | 3260 | } |
2929 | 3261 | ||
2930 | if (!cfqq) | 3262 | if (!cfqq) |
2931 | cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask); | 3263 | cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask); |
2932 | 3264 | ||
2933 | /* | 3265 | /* |
2934 | * pin the queue now that it's allocated, scheduler exit will prune it | 3266 | * pin the queue now that it's allocated, scheduler exit will prune it |
@@ -3010,7 +3342,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
3010 | 3342 | ||
3011 | if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE)) | 3343 | if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE)) |
3012 | enable_idle = 0; | 3344 | enable_idle = 0; |
3013 | else if (!atomic_read(&cic->icq.ioc->nr_tasks) || | 3345 | else if (!atomic_read(&cic->icq.ioc->active_ref) || |
3014 | !cfqd->cfq_slice_idle || | 3346 | !cfqd->cfq_slice_idle || |
3015 | (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) | 3347 | (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) |
3016 | enable_idle = 0; | 3348 | enable_idle = 0; |
@@ -3174,8 +3506,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
3174 | cfq_clear_cfqq_wait_request(cfqq); | 3506 | cfq_clear_cfqq_wait_request(cfqq); |
3175 | __blk_run_queue(cfqd->queue); | 3507 | __blk_run_queue(cfqd->queue); |
3176 | } else { | 3508 | } else { |
3177 | cfq_blkiocg_update_idle_time_stats( | 3509 | cfqg_stats_update_idle_time(cfqq->cfqg); |
3178 | &cfqq->cfqg->blkg); | ||
3179 | cfq_mark_cfqq_must_dispatch(cfqq); | 3510 | cfq_mark_cfqq_must_dispatch(cfqq); |
3180 | } | 3511 | } |
3181 | } | 3512 | } |
@@ -3197,14 +3528,13 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq) | |||
3197 | struct cfq_queue *cfqq = RQ_CFQQ(rq); | 3528 | struct cfq_queue *cfqq = RQ_CFQQ(rq); |
3198 | 3529 | ||
3199 | cfq_log_cfqq(cfqd, cfqq, "insert_request"); | 3530 | cfq_log_cfqq(cfqd, cfqq, "insert_request"); |
3200 | cfq_init_prio_data(cfqq, RQ_CIC(rq)->icq.ioc); | 3531 | cfq_init_prio_data(cfqq, RQ_CIC(rq)); |
3201 | 3532 | ||
3202 | rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); | 3533 | rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); |
3203 | list_add_tail(&rq->queuelist, &cfqq->fifo); | 3534 | list_add_tail(&rq->queuelist, &cfqq->fifo); |
3204 | cfq_add_rq_rb(rq); | 3535 | cfq_add_rq_rb(rq); |
3205 | cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg, | 3536 | cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group, |
3206 | &cfqd->serving_group->blkg, rq_data_dir(rq), | 3537 | rq->cmd_flags); |
3207 | rq_is_sync(rq)); | ||
3208 | cfq_rq_enqueued(cfqd, cfqq, rq); | 3538 | cfq_rq_enqueued(cfqd, cfqq, rq); |
3209 | } | 3539 | } |
3210 | 3540 | ||
@@ -3300,9 +3630,8 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) | |||
3300 | cfqd->rq_in_driver--; | 3630 | cfqd->rq_in_driver--; |
3301 | cfqq->dispatched--; | 3631 | cfqq->dispatched--; |
3302 | (RQ_CFQG(rq))->dispatched--; | 3632 | (RQ_CFQG(rq))->dispatched--; |
3303 | cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg, | 3633 | cfqg_stats_update_completion(cfqq->cfqg, rq_start_time_ns(rq), |
3304 | rq_start_time_ns(rq), rq_io_start_time_ns(rq), | 3634 | rq_io_start_time_ns(rq), rq->cmd_flags); |
3305 | rq_data_dir(rq), rq_is_sync(rq)); | ||
3306 | 3635 | ||
3307 | cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; | 3636 | cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; |
3308 | 3637 | ||
@@ -3399,7 +3728,7 @@ static int cfq_may_queue(struct request_queue *q, int rw) | |||
3399 | 3728 | ||
3400 | cfqq = cic_to_cfqq(cic, rw_is_sync(rw)); | 3729 | cfqq = cic_to_cfqq(cic, rw_is_sync(rw)); |
3401 | if (cfqq) { | 3730 | if (cfqq) { |
3402 | cfq_init_prio_data(cfqq, cic->icq.ioc); | 3731 | cfq_init_prio_data(cfqq, cic); |
3403 | 3732 | ||
3404 | return __cfq_may_queue(cfqq); | 3733 | return __cfq_may_queue(cfqq); |
3405 | } | 3734 | } |
@@ -3421,7 +3750,7 @@ static void cfq_put_request(struct request *rq) | |||
3421 | cfqq->allocated[rw]--; | 3750 | cfqq->allocated[rw]--; |
3422 | 3751 | ||
3423 | /* Put down rq reference on cfqg */ | 3752 | /* Put down rq reference on cfqg */ |
3424 | cfq_put_cfqg(RQ_CFQG(rq)); | 3753 | cfqg_put(RQ_CFQG(rq)); |
3425 | rq->elv.priv[0] = NULL; | 3754 | rq->elv.priv[0] = NULL; |
3426 | rq->elv.priv[1] = NULL; | 3755 | rq->elv.priv[1] = NULL; |
3427 | 3756 | ||
@@ -3465,32 +3794,25 @@ split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq) | |||
3465 | * Allocate cfq data structures associated with this request. | 3794 | * Allocate cfq data structures associated with this request. |
3466 | */ | 3795 | */ |
3467 | static int | 3796 | static int |
3468 | cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) | 3797 | cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, |
3798 | gfp_t gfp_mask) | ||
3469 | { | 3799 | { |
3470 | struct cfq_data *cfqd = q->elevator->elevator_data; | 3800 | struct cfq_data *cfqd = q->elevator->elevator_data; |
3471 | struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq); | 3801 | struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq); |
3472 | const int rw = rq_data_dir(rq); | 3802 | const int rw = rq_data_dir(rq); |
3473 | const bool is_sync = rq_is_sync(rq); | 3803 | const bool is_sync = rq_is_sync(rq); |
3474 | struct cfq_queue *cfqq; | 3804 | struct cfq_queue *cfqq; |
3475 | unsigned int changed; | ||
3476 | 3805 | ||
3477 | might_sleep_if(gfp_mask & __GFP_WAIT); | 3806 | might_sleep_if(gfp_mask & __GFP_WAIT); |
3478 | 3807 | ||
3479 | spin_lock_irq(q->queue_lock); | 3808 | spin_lock_irq(q->queue_lock); |
3480 | 3809 | ||
3481 | /* handle changed notifications */ | 3810 | check_ioprio_changed(cic, bio); |
3482 | changed = icq_get_changed(&cic->icq); | 3811 | check_blkcg_changed(cic, bio); |
3483 | if (unlikely(changed & ICQ_IOPRIO_CHANGED)) | ||
3484 | changed_ioprio(cic); | ||
3485 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
3486 | if (unlikely(changed & ICQ_CGROUP_CHANGED)) | ||
3487 | changed_cgroup(cic); | ||
3488 | #endif | ||
3489 | |||
3490 | new_queue: | 3812 | new_queue: |
3491 | cfqq = cic_to_cfqq(cic, is_sync); | 3813 | cfqq = cic_to_cfqq(cic, is_sync); |
3492 | if (!cfqq || cfqq == &cfqd->oom_cfqq) { | 3814 | if (!cfqq || cfqq == &cfqd->oom_cfqq) { |
3493 | cfqq = cfq_get_queue(cfqd, is_sync, cic->icq.ioc, gfp_mask); | 3815 | cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask); |
3494 | cic_set_cfqq(cic, cfqq, is_sync); | 3816 | cic_set_cfqq(cic, cfqq, is_sync); |
3495 | } else { | 3817 | } else { |
3496 | /* | 3818 | /* |
@@ -3516,8 +3838,9 @@ new_queue: | |||
3516 | cfqq->allocated[rw]++; | 3838 | cfqq->allocated[rw]++; |
3517 | 3839 | ||
3518 | cfqq->ref++; | 3840 | cfqq->ref++; |
3841 | cfqg_get(cfqq->cfqg); | ||
3519 | rq->elv.priv[0] = cfqq; | 3842 | rq->elv.priv[0] = cfqq; |
3520 | rq->elv.priv[1] = cfq_ref_get_cfqg(cfqq->cfqg); | 3843 | rq->elv.priv[1] = cfqq->cfqg; |
3521 | spin_unlock_irq(q->queue_lock); | 3844 | spin_unlock_irq(q->queue_lock); |
3522 | return 0; | 3845 | return 0; |
3523 | } | 3846 | } |
@@ -3614,7 +3937,6 @@ static void cfq_exit_queue(struct elevator_queue *e) | |||
3614 | { | 3937 | { |
3615 | struct cfq_data *cfqd = e->elevator_data; | 3938 | struct cfq_data *cfqd = e->elevator_data; |
3616 | struct request_queue *q = cfqd->queue; | 3939 | struct request_queue *q = cfqd->queue; |
3617 | bool wait = false; | ||
3618 | 3940 | ||
3619 | cfq_shutdown_timer_wq(cfqd); | 3941 | cfq_shutdown_timer_wq(cfqd); |
3620 | 3942 | ||
@@ -3624,89 +3946,52 @@ static void cfq_exit_queue(struct elevator_queue *e) | |||
3624 | __cfq_slice_expired(cfqd, cfqd->active_queue, 0); | 3946 | __cfq_slice_expired(cfqd, cfqd->active_queue, 0); |
3625 | 3947 | ||
3626 | cfq_put_async_queues(cfqd); | 3948 | cfq_put_async_queues(cfqd); |
3627 | cfq_release_cfq_groups(cfqd); | ||
3628 | |||
3629 | /* | ||
3630 | * If there are groups which we could not unlink from blkcg list, | ||
3631 | * wait for an RCU period for them to be freed. | ||
3632 | */ | ||
3633 | if (cfqd->nr_blkcg_linked_grps) | ||
3634 | wait = true; | ||
3635 | 3949 | ||
3636 | spin_unlock_irq(q->queue_lock); | 3950 | spin_unlock_irq(q->queue_lock); |
3637 | 3951 | ||
3638 | cfq_shutdown_timer_wq(cfqd); | 3952 | cfq_shutdown_timer_wq(cfqd); |
3639 | 3953 | ||
3640 | /* | 3954 | #ifndef CONFIG_CFQ_GROUP_IOSCHED |
3641 | * Wait for cfqg->blkg->key accessors to exit their grace periods. | 3955 | kfree(cfqd->root_group); |
3642 | * Do this wait only if there are other unlinked groups out | ||
3643 | * there. This can happen if cgroup deletion path claimed the | ||
3644 | * responsibility of cleaning up a group before queue cleanup code | ||
3645 | * gets to the group. | ||
3646 | * | ||
3647 | * Do not call synchronize_rcu() unconditionally as there are drivers | ||
3648 | * which create/delete request queue hundreds of times during scan/boot | ||
3649 | * and synchronize_rcu() can take significant time and slow down boot. | ||
3650 | */ | ||
3651 | if (wait) | ||
3652 | synchronize_rcu(); | ||
3653 | |||
3654 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
3655 | /* Free up per cpu stats for root group */ | ||
3656 | free_percpu(cfqd->root_group.blkg.stats_cpu); | ||
3657 | #endif | 3956 | #endif |
3957 | blkcg_deactivate_policy(q, &blkcg_policy_cfq); | ||
3658 | kfree(cfqd); | 3958 | kfree(cfqd); |
3659 | } | 3959 | } |
3660 | 3960 | ||
3661 | static void *cfq_init_queue(struct request_queue *q) | 3961 | static int cfq_init_queue(struct request_queue *q) |
3662 | { | 3962 | { |
3663 | struct cfq_data *cfqd; | 3963 | struct cfq_data *cfqd; |
3664 | int i, j; | 3964 | struct blkcg_gq *blkg __maybe_unused; |
3665 | struct cfq_group *cfqg; | 3965 | int i, ret; |
3666 | struct cfq_rb_root *st; | ||
3667 | 3966 | ||
3668 | cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); | 3967 | cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); |
3669 | if (!cfqd) | 3968 | if (!cfqd) |
3670 | return NULL; | 3969 | return -ENOMEM; |
3970 | |||
3971 | cfqd->queue = q; | ||
3972 | q->elevator->elevator_data = cfqd; | ||
3671 | 3973 | ||
3672 | /* Init root service tree */ | 3974 | /* Init root service tree */ |
3673 | cfqd->grp_service_tree = CFQ_RB_ROOT; | 3975 | cfqd->grp_service_tree = CFQ_RB_ROOT; |
3674 | 3976 | ||
3675 | /* Init root group */ | 3977 | /* Init root group and prefer root group over other groups by default */ |
3676 | cfqg = &cfqd->root_group; | ||
3677 | for_each_cfqg_st(cfqg, i, j, st) | ||
3678 | *st = CFQ_RB_ROOT; | ||
3679 | RB_CLEAR_NODE(&cfqg->rb_node); | ||
3680 | |||
3681 | /* Give preference to root group over other groups */ | ||
3682 | cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT; | ||
3683 | |||
3684 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | 3978 | #ifdef CONFIG_CFQ_GROUP_IOSCHED |
3685 | /* | 3979 | ret = blkcg_activate_policy(q, &blkcg_policy_cfq); |
3686 | * Set root group reference to 2. One reference will be dropped when | 3980 | if (ret) |
3687 | * all groups on cfqd->cfqg_list are being deleted during queue exit. | 3981 | goto out_free; |
3688 | * Other reference will remain there as we don't want to delete this | ||
3689 | * group as it is statically allocated and gets destroyed when | ||
3690 | * cfq_data goes away. | ||
3691 | */ | ||
3692 | cfqg->ref = 2; | ||
3693 | |||
3694 | if (blkio_alloc_blkg_stats(&cfqg->blkg)) { | ||
3695 | kfree(cfqg); | ||
3696 | kfree(cfqd); | ||
3697 | return NULL; | ||
3698 | } | ||
3699 | |||
3700 | rcu_read_lock(); | ||
3701 | 3982 | ||
3702 | cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, | 3983 | cfqd->root_group = blkg_to_cfqg(q->root_blkg); |
3703 | (void *)cfqd, 0); | 3984 | #else |
3704 | rcu_read_unlock(); | 3985 | ret = -ENOMEM; |
3705 | cfqd->nr_blkcg_linked_grps++; | 3986 | cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group), |
3987 | GFP_KERNEL, cfqd->queue->node); | ||
3988 | if (!cfqd->root_group) | ||
3989 | goto out_free; | ||
3706 | 3990 | ||
3707 | /* Add group on cfqd->cfqg_list */ | 3991 | cfq_init_cfqg_base(cfqd->root_group); |
3708 | hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); | ||
3709 | #endif | 3992 | #endif |
3993 | cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT; | ||
3994 | |||
3710 | /* | 3995 | /* |
3711 | * Not strictly needed (since RB_ROOT just clears the node and we | 3996 | * Not strictly needed (since RB_ROOT just clears the node and we |
3712 | * zeroed cfqd on alloc), but better be safe in case someone decides | 3997 | * zeroed cfqd on alloc), but better be safe in case someone decides |
@@ -3718,13 +4003,17 @@ static void *cfq_init_queue(struct request_queue *q) | |||
3718 | /* | 4003 | /* |
3719 | * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues. | 4004 | * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues. |
3720 | * Grab a permanent reference to it, so that the normal code flow | 4005 | * Grab a permanent reference to it, so that the normal code flow |
3721 | * will not attempt to free it. | 4006 | * will not attempt to free it. oom_cfqq is linked to root_group |
4007 | * but shouldn't hold a reference as it'll never be unlinked. Lose | ||
4008 | * the reference from linking right away. | ||
3722 | */ | 4009 | */ |
3723 | cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); | 4010 | cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); |
3724 | cfqd->oom_cfqq.ref++; | 4011 | cfqd->oom_cfqq.ref++; |
3725 | cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group); | ||
3726 | 4012 | ||
3727 | cfqd->queue = q; | 4013 | spin_lock_irq(q->queue_lock); |
4014 | cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group); | ||
4015 | cfqg_put(cfqd->root_group); | ||
4016 | spin_unlock_irq(q->queue_lock); | ||
3728 | 4017 | ||
3729 | init_timer(&cfqd->idle_slice_timer); | 4018 | init_timer(&cfqd->idle_slice_timer); |
3730 | cfqd->idle_slice_timer.function = cfq_idle_slice_timer; | 4019 | cfqd->idle_slice_timer.function = cfq_idle_slice_timer; |
@@ -3750,7 +4039,11 @@ static void *cfq_init_queue(struct request_queue *q) | |||
3750 | * second, in order to have larger depth for async operations. | 4039 | * second, in order to have larger depth for async operations. |
3751 | */ | 4040 | */ |
3752 | cfqd->last_delayed_sync = jiffies - HZ; | 4041 | cfqd->last_delayed_sync = jiffies - HZ; |
3753 | return cfqd; | 4042 | return 0; |
4043 | |||
4044 | out_free: | ||
4045 | kfree(cfqd); | ||
4046 | return ret; | ||
3754 | } | 4047 | } |
3755 | 4048 | ||
3756 | /* | 4049 | /* |
@@ -3877,15 +4170,13 @@ static struct elevator_type iosched_cfq = { | |||
3877 | }; | 4170 | }; |
3878 | 4171 | ||
3879 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | 4172 | #ifdef CONFIG_CFQ_GROUP_IOSCHED |
3880 | static struct blkio_policy_type blkio_policy_cfq = { | 4173 | static struct blkcg_policy blkcg_policy_cfq = { |
3881 | .ops = { | 4174 | .pd_size = sizeof(struct cfq_group), |
3882 | .blkio_unlink_group_fn = cfq_unlink_blkio_group, | 4175 | .cftypes = cfq_blkcg_files, |
3883 | .blkio_update_group_weight_fn = cfq_update_blkio_group_weight, | 4176 | |
3884 | }, | 4177 | .pd_init_fn = cfq_pd_init, |
3885 | .plid = BLKIO_POLICY_PROP, | 4178 | .pd_reset_stats_fn = cfq_pd_reset_stats, |
3886 | }; | 4179 | }; |
3887 | #else | ||
3888 | static struct blkio_policy_type blkio_policy_cfq; | ||
3889 | #endif | 4180 | #endif |
3890 | 4181 | ||
3891 | static int __init cfq_init(void) | 4182 | static int __init cfq_init(void) |
@@ -3906,24 +4197,31 @@ static int __init cfq_init(void) | |||
3906 | #else | 4197 | #else |
3907 | cfq_group_idle = 0; | 4198 | cfq_group_idle = 0; |
3908 | #endif | 4199 | #endif |
4200 | |||
4201 | ret = blkcg_policy_register(&blkcg_policy_cfq); | ||
4202 | if (ret) | ||
4203 | return ret; | ||
4204 | |||
3909 | cfq_pool = KMEM_CACHE(cfq_queue, 0); | 4205 | cfq_pool = KMEM_CACHE(cfq_queue, 0); |
3910 | if (!cfq_pool) | 4206 | if (!cfq_pool) |
3911 | return -ENOMEM; | 4207 | goto err_pol_unreg; |
3912 | 4208 | ||
3913 | ret = elv_register(&iosched_cfq); | 4209 | ret = elv_register(&iosched_cfq); |
3914 | if (ret) { | 4210 | if (ret) |
3915 | kmem_cache_destroy(cfq_pool); | 4211 | goto err_free_pool; |
3916 | return ret; | ||
3917 | } | ||
3918 | |||
3919 | blkio_policy_register(&blkio_policy_cfq); | ||
3920 | 4212 | ||
3921 | return 0; | 4213 | return 0; |
4214 | |||
4215 | err_free_pool: | ||
4216 | kmem_cache_destroy(cfq_pool); | ||
4217 | err_pol_unreg: | ||
4218 | blkcg_policy_unregister(&blkcg_policy_cfq); | ||
4219 | return ret; | ||
3922 | } | 4220 | } |
3923 | 4221 | ||
3924 | static void __exit cfq_exit(void) | 4222 | static void __exit cfq_exit(void) |
3925 | { | 4223 | { |
3926 | blkio_policy_unregister(&blkio_policy_cfq); | 4224 | blkcg_policy_unregister(&blkcg_policy_cfq); |
3927 | elv_unregister(&iosched_cfq); | 4225 | elv_unregister(&iosched_cfq); |
3928 | kmem_cache_destroy(cfq_pool); | 4226 | kmem_cache_destroy(cfq_pool); |
3929 | } | 4227 | } |
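cfq_init() above now acquires its resources in order (register the blkcg policy, create the slab cache, register the elevator) and unwinds them in reverse on failure via the usual goto ladder. A small runnable sketch of the same structure with stand-in resources (all names are illustrative):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int policy_registered;
static void *pool;

static int register_policy(void)        { policy_registered = 1; return 0; }
static void unregister_policy(void)     { policy_registered = 0; }
static void *create_pool(void)          { return malloc(64); }
static void destroy_pool(void *p)       { free(p); }
static int register_elevator(void)      { return 0; }

/* Acquire in order; on failure, release what was acquired, in reverse. */
static int module_init_sketch(void)
{
        int ret;

        ret = register_policy();
        if (ret)
                return ret;

        pool = create_pool();
        if (!pool) {
                ret = -ENOMEM;
                goto err_pol_unreg;
        }

        ret = register_elevator();
        if (ret)
                goto err_free_pool;

        return 0;

err_free_pool:
        destroy_pool(pool);
err_pol_unreg:
        unregister_policy();
        return ret;
}

int main(void)
{
        printf("init: %d\n", module_init_sketch());
        return 0;
}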
diff --git a/block/cfq.h b/block/cfq.h deleted file mode 100644 index 2a155927e37c..000000000000 --- a/block/cfq.h +++ /dev/null | |||
@@ -1,115 +0,0 @@ | |||
1 | #ifndef _CFQ_H | ||
2 | #define _CFQ_H | ||
3 | #include "blk-cgroup.h" | ||
4 | |||
5 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
6 | static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg, | ||
7 | struct blkio_group *curr_blkg, bool direction, bool sync) | ||
8 | { | ||
9 | blkiocg_update_io_add_stats(blkg, curr_blkg, direction, sync); | ||
10 | } | ||
11 | |||
12 | static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg, | ||
13 | unsigned long dequeue) | ||
14 | { | ||
15 | blkiocg_update_dequeue_stats(blkg, dequeue); | ||
16 | } | ||
17 | |||
18 | static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, | ||
19 | unsigned long time, unsigned long unaccounted_time) | ||
20 | { | ||
21 | blkiocg_update_timeslice_used(blkg, time, unaccounted_time); | ||
22 | } | ||
23 | |||
24 | static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) | ||
25 | { | ||
26 | blkiocg_set_start_empty_time(blkg); | ||
27 | } | ||
28 | |||
29 | static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg, | ||
30 | bool direction, bool sync) | ||
31 | { | ||
32 | blkiocg_update_io_remove_stats(blkg, direction, sync); | ||
33 | } | ||
34 | |||
35 | static inline void cfq_blkiocg_update_io_merged_stats(struct blkio_group *blkg, | ||
36 | bool direction, bool sync) | ||
37 | { | ||
38 | blkiocg_update_io_merged_stats(blkg, direction, sync); | ||
39 | } | ||
40 | |||
41 | static inline void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg) | ||
42 | { | ||
43 | blkiocg_update_idle_time_stats(blkg); | ||
44 | } | ||
45 | |||
46 | static inline void | ||
47 | cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) | ||
48 | { | ||
49 | blkiocg_update_avg_queue_size_stats(blkg); | ||
50 | } | ||
51 | |||
52 | static inline void | ||
53 | cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) | ||
54 | { | ||
55 | blkiocg_update_set_idle_time_stats(blkg); | ||
56 | } | ||
57 | |||
58 | static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg, | ||
59 | uint64_t bytes, bool direction, bool sync) | ||
60 | { | ||
61 | blkiocg_update_dispatch_stats(blkg, bytes, direction, sync); | ||
62 | } | ||
63 | |||
64 | static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) | ||
65 | { | ||
66 | blkiocg_update_completion_stats(blkg, start_time, io_start_time, | ||
67 | direction, sync); | ||
68 | } | ||
69 | |||
70 | static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, | ||
71 | struct blkio_group *blkg, void *key, dev_t dev) { | ||
72 | blkiocg_add_blkio_group(blkcg, blkg, key, dev, BLKIO_POLICY_PROP); | ||
73 | } | ||
74 | |||
75 | static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg) | ||
76 | { | ||
77 | return blkiocg_del_blkio_group(blkg); | ||
78 | } | ||
79 | |||
80 | #else /* CFQ_GROUP_IOSCHED */ | ||
81 | static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg, | ||
82 | struct blkio_group *curr_blkg, bool direction, bool sync) {} | ||
83 | |||
84 | static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg, | ||
85 | unsigned long dequeue) {} | ||
86 | |||
87 | static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, | ||
88 | unsigned long time, unsigned long unaccounted_time) {} | ||
89 | static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) {} | ||
90 | static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg, | ||
91 | bool direction, bool sync) {} | ||
92 | static inline void cfq_blkiocg_update_io_merged_stats(struct blkio_group *blkg, | ||
93 | bool direction, bool sync) {} | ||
94 | static inline void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg) | ||
95 | { | ||
96 | } | ||
97 | static inline void | ||
98 | cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) {} | ||
99 | |||
100 | static inline void | ||
101 | cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) {} | ||
102 | |||
103 | static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg, | ||
104 | uint64_t bytes, bool direction, bool sync) {} | ||
105 | static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) {} | ||
106 | |||
107 | static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, | ||
108 | struct blkio_group *blkg, void *key, dev_t dev) {} | ||
109 | static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg) | ||
110 | { | ||
111 | return 0; | ||
112 | } | ||
113 | |||
114 | #endif /* CFQ_GROUP_IOSCHED */ | ||
115 | #endif | ||
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index 7bf12d793fcd..599b12e5380f 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c | |||
@@ -337,13 +337,13 @@ static void deadline_exit_queue(struct elevator_queue *e) | |||
337 | /* | 337 | /* |
338 | * initialize elevator private data (deadline_data). | 338 | * initialize elevator private data (deadline_data). |
339 | */ | 339 | */ |
340 | static void *deadline_init_queue(struct request_queue *q) | 340 | static int deadline_init_queue(struct request_queue *q) |
341 | { | 341 | { |
342 | struct deadline_data *dd; | 342 | struct deadline_data *dd; |
343 | 343 | ||
344 | dd = kmalloc_node(sizeof(*dd), GFP_KERNEL | __GFP_ZERO, q->node); | 344 | dd = kmalloc_node(sizeof(*dd), GFP_KERNEL | __GFP_ZERO, q->node); |
345 | if (!dd) | 345 | if (!dd) |
346 | return NULL; | 346 | return -ENOMEM; |
347 | 347 | ||
348 | INIT_LIST_HEAD(&dd->fifo_list[READ]); | 348 | INIT_LIST_HEAD(&dd->fifo_list[READ]); |
349 | INIT_LIST_HEAD(&dd->fifo_list[WRITE]); | 349 | INIT_LIST_HEAD(&dd->fifo_list[WRITE]); |
@@ -354,7 +354,9 @@ static void *deadline_init_queue(struct request_queue *q) | |||
354 | dd->writes_starved = writes_starved; | 354 | dd->writes_starved = writes_starved; |
355 | dd->front_merges = 1; | 355 | dd->front_merges = 1; |
356 | dd->fifo_batch = fifo_batch; | 356 | dd->fifo_batch = fifo_batch; |
357 | return dd; | 357 | |
358 | q->elevator->elevator_data = dd; | ||
359 | return 0; | ||
358 | } | 360 | } |
359 | 361 | ||
360 | /* | 362 | /* |
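Deadline above, and noop further below, get the same mechanical conversion: the elevator init hook now returns 0 or a negative errno and publishes its private data through the pre-allocated elevator queue, instead of returning a pointer and letting NULL stand for -ENOMEM. A minimal sketch of the old versus new convention (structure names are stand-ins):

#include <errno.h>
#include <stdlib.h>

struct sched_queue {
        void *private_data;     /* stands in for elevator_queue::elevator_data */
};

struct sched_data {
        int some_tunable;
};

/* Old convention: return the private data, NULL on allocation failure. */
static void *init_old(struct sched_queue *q)
{
        (void)q;                /* caller stored the result, mapped NULL to -ENOMEM */
        return calloc(1, sizeof(struct sched_data));
}

/* New convention: return 0/-errno and publish the data ourselves. */
static int init_new(struct sched_queue *q)
{
        struct sched_data *d = calloc(1, sizeof(*d));

        if (!d)
                return -ENOMEM;
        q->private_data = d;
        return 0;
}

int main(void)
{
        struct sched_queue q = { 0 };

        return (init_old(&q) && init_new(&q) == 0) ? 0 : 1;
}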
diff --git a/block/elevator.c b/block/elevator.c index f016855a46b0..6a55d418896f 100644 --- a/block/elevator.c +++ b/block/elevator.c | |||
@@ -38,6 +38,7 @@ | |||
38 | #include <trace/events/block.h> | 38 | #include <trace/events/block.h> |
39 | 39 | ||
40 | #include "blk.h" | 40 | #include "blk.h" |
41 | #include "blk-cgroup.h" | ||
41 | 42 | ||
42 | static DEFINE_SPINLOCK(elv_list_lock); | 43 | static DEFINE_SPINLOCK(elv_list_lock); |
43 | static LIST_HEAD(elv_list); | 44 | static LIST_HEAD(elv_list); |
@@ -121,15 +122,6 @@ static struct elevator_type *elevator_get(const char *name) | |||
121 | return e; | 122 | return e; |
122 | } | 123 | } |
123 | 124 | ||
124 | static int elevator_init_queue(struct request_queue *q, | ||
125 | struct elevator_queue *eq) | ||
126 | { | ||
127 | eq->elevator_data = eq->type->ops.elevator_init_fn(q); | ||
128 | if (eq->elevator_data) | ||
129 | return 0; | ||
130 | return -ENOMEM; | ||
131 | } | ||
132 | |||
133 | static char chosen_elevator[ELV_NAME_MAX]; | 125 | static char chosen_elevator[ELV_NAME_MAX]; |
134 | 126 | ||
135 | static int __init elevator_setup(char *str) | 127 | static int __init elevator_setup(char *str) |
@@ -188,7 +180,6 @@ static void elevator_release(struct kobject *kobj) | |||
188 | int elevator_init(struct request_queue *q, char *name) | 180 | int elevator_init(struct request_queue *q, char *name) |
189 | { | 181 | { |
190 | struct elevator_type *e = NULL; | 182 | struct elevator_type *e = NULL; |
191 | struct elevator_queue *eq; | ||
192 | int err; | 183 | int err; |
193 | 184 | ||
194 | if (unlikely(q->elevator)) | 185 | if (unlikely(q->elevator)) |
@@ -222,17 +213,16 @@ int elevator_init(struct request_queue *q, char *name) | |||
222 | } | 213 | } |
223 | } | 214 | } |
224 | 215 | ||
225 | eq = elevator_alloc(q, e); | 216 | q->elevator = elevator_alloc(q, e); |
226 | if (!eq) | 217 | if (!q->elevator) |
227 | return -ENOMEM; | 218 | return -ENOMEM; |
228 | 219 | ||
229 | err = elevator_init_queue(q, eq); | 220 | err = e->ops.elevator_init_fn(q); |
230 | if (err) { | 221 | if (err) { |
231 | kobject_put(&eq->kobj); | 222 | kobject_put(&q->elevator->kobj); |
232 | return err; | 223 | return err; |
233 | } | 224 | } |
234 | 225 | ||
235 | q->elevator = eq; | ||
236 | return 0; | 226 | return 0; |
237 | } | 227 | } |
238 | EXPORT_SYMBOL(elevator_init); | 228 | EXPORT_SYMBOL(elevator_init); |
@@ -564,25 +554,6 @@ void elv_drain_elevator(struct request_queue *q) | |||
564 | } | 554 | } |
565 | } | 555 | } |
566 | 556 | ||
567 | void elv_quiesce_start(struct request_queue *q) | ||
568 | { | ||
569 | if (!q->elevator) | ||
570 | return; | ||
571 | |||
572 | spin_lock_irq(q->queue_lock); | ||
573 | queue_flag_set(QUEUE_FLAG_ELVSWITCH, q); | ||
574 | spin_unlock_irq(q->queue_lock); | ||
575 | |||
576 | blk_drain_queue(q, false); | ||
577 | } | ||
578 | |||
579 | void elv_quiesce_end(struct request_queue *q) | ||
580 | { | ||
581 | spin_lock_irq(q->queue_lock); | ||
582 | queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); | ||
583 | spin_unlock_irq(q->queue_lock); | ||
584 | } | ||
585 | |||
586 | void __elv_add_request(struct request_queue *q, struct request *rq, int where) | 557 | void __elv_add_request(struct request_queue *q, struct request *rq, int where) |
587 | { | 558 | { |
588 | trace_block_rq_insert(q, rq); | 559 | trace_block_rq_insert(q, rq); |
@@ -692,12 +663,13 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq) | |||
692 | return NULL; | 663 | return NULL; |
693 | } | 664 | } |
694 | 665 | ||
695 | int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) | 666 | int elv_set_request(struct request_queue *q, struct request *rq, |
667 | struct bio *bio, gfp_t gfp_mask) | ||
696 | { | 668 | { |
697 | struct elevator_queue *e = q->elevator; | 669 | struct elevator_queue *e = q->elevator; |
698 | 670 | ||
699 | if (e->type->ops.elevator_set_req_fn) | 671 | if (e->type->ops.elevator_set_req_fn) |
700 | return e->type->ops.elevator_set_req_fn(q, rq, gfp_mask); | 672 | return e->type->ops.elevator_set_req_fn(q, rq, bio, gfp_mask); |
701 | return 0; | 673 | return 0; |
702 | } | 674 | } |
703 | 675 | ||
@@ -801,8 +773,9 @@ static struct kobj_type elv_ktype = { | |||
801 | .release = elevator_release, | 773 | .release = elevator_release, |
802 | }; | 774 | }; |
803 | 775 | ||
804 | int __elv_register_queue(struct request_queue *q, struct elevator_queue *e) | 776 | int elv_register_queue(struct request_queue *q) |
805 | { | 777 | { |
778 | struct elevator_queue *e = q->elevator; | ||
806 | int error; | 779 | int error; |
807 | 780 | ||
808 | error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched"); | 781 | error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched"); |
@@ -820,11 +793,6 @@ int __elv_register_queue(struct request_queue *q, struct elevator_queue *e) | |||
820 | } | 793 | } |
821 | return error; | 794 | return error; |
822 | } | 795 | } |
823 | |||
824 | int elv_register_queue(struct request_queue *q) | ||
825 | { | ||
826 | return __elv_register_queue(q, q->elevator); | ||
827 | } | ||
828 | EXPORT_SYMBOL(elv_register_queue); | 796 | EXPORT_SYMBOL(elv_register_queue); |
829 | 797 | ||
830 | void elv_unregister_queue(struct request_queue *q) | 798 | void elv_unregister_queue(struct request_queue *q) |
@@ -907,53 +875,60 @@ EXPORT_SYMBOL_GPL(elv_unregister); | |||
907 | */ | 875 | */ |
908 | static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) | 876 | static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) |
909 | { | 877 | { |
910 | struct elevator_queue *old_elevator, *e; | 878 | struct elevator_queue *old = q->elevator; |
879 | bool registered = old->registered; | ||
911 | int err; | 880 | int err; |
912 | 881 | ||
913 | /* allocate new elevator */ | 882 | /* |
914 | e = elevator_alloc(q, new_e); | 883 | * Turn on BYPASS and drain all requests w/ elevator private data. |
915 | if (!e) | 884 | * Block layer doesn't call into a quiesced elevator - all requests |
916 | return -ENOMEM; | 885 | * are directly put on the dispatch list without elevator data |
886 | * using INSERT_BACK. All requests have SOFTBARRIER set and no | ||
887 | * merge happens either. | ||
888 | */ | ||
889 | blk_queue_bypass_start(q); | ||
890 | |||
891 | /* unregister and clear all auxiliary data of the old elevator */ | ||
892 | if (registered) | ||
893 | elv_unregister_queue(q); | ||
894 | |||
895 | spin_lock_irq(q->queue_lock); | ||
896 | ioc_clear_queue(q); | ||
897 | spin_unlock_irq(q->queue_lock); | ||
917 | 898 | ||
918 | err = elevator_init_queue(q, e); | 899 | /* allocate, init and register new elevator */ |
900 | err = -ENOMEM; | ||
901 | q->elevator = elevator_alloc(q, new_e); | ||
902 | if (!q->elevator) | ||
903 | goto fail_init; | ||
904 | |||
905 | err = new_e->ops.elevator_init_fn(q); | ||
919 | if (err) { | 906 | if (err) { |
920 | kobject_put(&e->kobj); | 907 | kobject_put(&q->elevator->kobj); |
921 | return err; | 908 | goto fail_init; |
922 | } | 909 | } |
923 | 910 | ||
924 | /* turn on BYPASS and drain all requests w/ elevator private data */ | 911 | if (registered) { |
925 | elv_quiesce_start(q); | 912 | err = elv_register_queue(q); |
926 | |||
927 | /* unregister old queue, register new one and kill old elevator */ | ||
928 | if (q->elevator->registered) { | ||
929 | elv_unregister_queue(q); | ||
930 | err = __elv_register_queue(q, e); | ||
931 | if (err) | 913 | if (err) |
932 | goto fail_register; | 914 | goto fail_register; |
933 | } | 915 | } |
934 | 916 | ||
935 | /* done, clear io_cq's, switch elevators and turn off BYPASS */ | 917 | /* done, kill the old one and finish */ |
936 | spin_lock_irq(q->queue_lock); | 918 | elevator_exit(old); |
937 | ioc_clear_queue(q); | 919 | blk_queue_bypass_end(q); |
938 | old_elevator = q->elevator; | ||
939 | q->elevator = e; | ||
940 | spin_unlock_irq(q->queue_lock); | ||
941 | |||
942 | elevator_exit(old_elevator); | ||
943 | elv_quiesce_end(q); | ||
944 | 920 | ||
945 | blk_add_trace_msg(q, "elv switch: %s", e->type->elevator_name); | 921 | blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); |
946 | 922 | ||
947 | return 0; | 923 | return 0; |
948 | 924 | ||
949 | fail_register: | 925 | fail_register: |
950 | /* | 926 | elevator_exit(q->elevator); |
951 | * switch failed, exit the new io scheduler and reattach the old | 927 | fail_init: |
952 | * one again (along with re-adding the sysfs dir) | 928 | /* switch failed, restore and re-register old elevator */ |
953 | */ | 929 | q->elevator = old; |
954 | elevator_exit(e); | ||
955 | elv_register_queue(q); | 930 | elv_register_queue(q); |
956 | elv_quiesce_end(q); | 931 | blk_queue_bypass_end(q); |
957 | 932 | ||
958 | return err; | 933 | return err; |
959 | } | 934 | } |
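Reviewer note: the side-by-side cells above are hard to read, so here is the new-side elevator_switch() flow condensed into one listing. The _outline suffix is invented; everything inside restates the hunk above (the blk_add_trace_msg() call is omitted) rather than adding behaviour. The key ordering change is that the queue is bypassed and the old elevator torn down before the replacement is allocated, with the fail_init/fail_register labels restoring and re-registering the old one.

/* Condensed restatement of the new elevator_switch() shown above. */
static int elevator_switch_outline(struct request_queue *q,
				   struct elevator_type *new_e)
{
	struct elevator_queue *old = q->elevator;
	bool registered = old->registered;
	int err;

	blk_queue_bypass_start(q);		/* drain, bypass the elevator */
	if (registered)
		elv_unregister_queue(q);	/* drop the old sysfs dir */
	spin_lock_irq(q->queue_lock);
	ioc_clear_queue(q);			/* invalidate cached icqs */
	spin_unlock_irq(q->queue_lock);

	err = -ENOMEM;
	q->elevator = elevator_alloc(q, new_e);
	if (!q->elevator)
		goto fail_init;

	err = new_e->ops.elevator_init_fn(q);
	if (err) {
		kobject_put(&q->elevator->kobj);
		goto fail_init;
	}

	if (registered) {
		err = elv_register_queue(q);
		if (err)
			goto fail_register;
	}

	elevator_exit(old);			/* old elevator dies last */
	blk_queue_bypass_end(q);
	return 0;

fail_register:
	elevator_exit(q->elevator);
fail_init:
	q->elevator = old;			/* restore and re-register */
	elv_register_queue(q);
	blk_queue_bypass_end(q);
	return err;
}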
diff --git a/block/noop-iosched.c b/block/noop-iosched.c index 413a0b1d788c..5d1bf70e33d5 100644 --- a/block/noop-iosched.c +++ b/block/noop-iosched.c | |||
@@ -59,15 +59,17 @@ noop_latter_request(struct request_queue *q, struct request *rq) | |||
59 | return list_entry(rq->queuelist.next, struct request, queuelist); | 59 | return list_entry(rq->queuelist.next, struct request, queuelist); |
60 | } | 60 | } |
61 | 61 | ||
62 | static void *noop_init_queue(struct request_queue *q) | 62 | static int noop_init_queue(struct request_queue *q) |
63 | { | 63 | { |
64 | struct noop_data *nd; | 64 | struct noop_data *nd; |
65 | 65 | ||
66 | nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node); | 66 | nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node); |
67 | if (!nd) | 67 | if (!nd) |
68 | return NULL; | 68 | return -ENOMEM; |
69 | |||
69 | INIT_LIST_HEAD(&nd->queue); | 70 | INIT_LIST_HEAD(&nd->queue); |
70 | return nd; | 71 | q->elevator->elevator_data = nd; |
72 | return 0; | ||
71 | } | 73 | } |
72 | 74 | ||
73 | static void noop_exit_queue(struct elevator_queue *e) | 75 | static void noop_exit_queue(struct elevator_queue *e) |
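The noop conversion above captures the new elevator_init_fn contract: return 0 or a negative errno and store the private data in q->elevator->elevator_data yourself, instead of returning the pointer. A minimal sketch of the same conversion for a hypothetical scheduler (the mysched_* names are invented; assumes <linux/blkdev.h>, <linux/elevator.h> and <linux/slab.h>):

struct mysched_data {
	struct list_head queue;
};

/* New-style init: report errors via the return value, stash data ourselves. */
static int mysched_init_queue(struct request_queue *q)
{
	struct mysched_data *md;

	md = kmalloc_node(sizeof(*md), GFP_KERNEL, q->node);
	if (!md)
		return -ENOMEM;

	INIT_LIST_HEAD(&md->queue);
	q->elevator->elevator_data = md;
	return 0;
}

static void mysched_exit_queue(struct elevator_queue *e)
{
	struct mysched_data *md = e->elevator_data;

	WARN_ON(!list_empty(&md->queue));
	kfree(md);
}

The exit side is unchanged by this series; it still receives the elevator_queue and frees whatever init stored in elevator_data.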
diff --git a/fs/bio.c b/fs/bio.c --- a/fs/bio.c +++ b/fs/bio.c | |||
@@ -19,12 +19,14 @@ | |||
19 | #include <linux/swap.h> | 19 | #include <linux/swap.h> |
20 | #include <linux/bio.h> | 20 | #include <linux/bio.h> |
21 | #include <linux/blkdev.h> | 21 | #include <linux/blkdev.h> |
22 | #include <linux/iocontext.h> | ||
22 | #include <linux/slab.h> | 23 | #include <linux/slab.h> |
23 | #include <linux/init.h> | 24 | #include <linux/init.h> |
24 | #include <linux/kernel.h> | 25 | #include <linux/kernel.h> |
25 | #include <linux/export.h> | 26 | #include <linux/export.h> |
26 | #include <linux/mempool.h> | 27 | #include <linux/mempool.h> |
27 | #include <linux/workqueue.h> | 28 | #include <linux/workqueue.h> |
29 | #include <linux/cgroup.h> | ||
28 | #include <scsi/sg.h> /* for struct sg_iovec */ | 30 | #include <scsi/sg.h> /* for struct sg_iovec */ |
29 | 31 | ||
30 | #include <trace/events/block.h> | 32 | #include <trace/events/block.h> |
@@ -418,6 +420,7 @@ void bio_put(struct bio *bio) | |||
418 | * last put frees it | 420 | * last put frees it |
419 | */ | 421 | */ |
420 | if (atomic_dec_and_test(&bio->bi_cnt)) { | 422 | if (atomic_dec_and_test(&bio->bi_cnt)) { |
423 | bio_disassociate_task(bio); | ||
421 | bio->bi_next = NULL; | 424 | bio->bi_next = NULL; |
422 | bio->bi_destructor(bio); | 425 | bio->bi_destructor(bio); |
423 | } | 426 | } |
@@ -1646,6 +1649,64 @@ bad: | |||
1646 | } | 1649 | } |
1647 | EXPORT_SYMBOL(bioset_create); | 1650 | EXPORT_SYMBOL(bioset_create); |
1648 | 1651 | ||
1652 | #ifdef CONFIG_BLK_CGROUP | ||
1653 | /** | ||
1654 | * bio_associate_current - associate a bio with %current | ||
1655 | * @bio: target bio | ||
1656 | * | ||
1657 | * Associate @bio with %current if it hasn't been associated yet. Block | ||
1658 | * layer will treat @bio as if it were issued by %current no matter which | ||
1659 | * task actually issues it. | ||
1660 | * | ||
1661 | * This function takes an extra reference of @task's io_context and blkcg | ||
1662 | * which will be put when @bio is released. The caller must own @bio, | ||
1663 | * ensure %current->io_context exists, and is responsible for synchronizing | ||
1664 | * calls to this function. | ||
1665 | */ | ||
1666 | int bio_associate_current(struct bio *bio) | ||
1667 | { | ||
1668 | struct io_context *ioc; | ||
1669 | struct cgroup_subsys_state *css; | ||
1670 | |||
1671 | if (bio->bi_ioc) | ||
1672 | return -EBUSY; | ||
1673 | |||
1674 | ioc = current->io_context; | ||
1675 | if (!ioc) | ||
1676 | return -ENOENT; | ||
1677 | |||
1678 | /* acquire active ref on @ioc and associate */ | ||
1679 | get_io_context_active(ioc); | ||
1680 | bio->bi_ioc = ioc; | ||
1681 | |||
1682 | /* associate blkcg if exists */ | ||
1683 | rcu_read_lock(); | ||
1684 | css = task_subsys_state(current, blkio_subsys_id); | ||
1685 | if (css && css_tryget(css)) | ||
1686 | bio->bi_css = css; | ||
1687 | rcu_read_unlock(); | ||
1688 | |||
1689 | return 0; | ||
1690 | } | ||
1691 | |||
1692 | /** | ||
1693 | * bio_disassociate_task - undo bio_associate_current() | ||
1694 | * @bio: target bio | ||
1695 | */ | ||
1696 | void bio_disassociate_task(struct bio *bio) | ||
1697 | { | ||
1698 | if (bio->bi_ioc) { | ||
1699 | put_io_context(bio->bi_ioc); | ||
1700 | bio->bi_ioc = NULL; | ||
1701 | } | ||
1702 | if (bio->bi_css) { | ||
1703 | css_put(bio->bi_css); | ||
1704 | bio->bi_css = NULL; | ||
1705 | } | ||
1706 | } | ||
1707 | |||
1708 | #endif /* CONFIG_BLK_CGROUP */ | ||
1709 | |||
1649 | static void __init biovec_init_slabs(void) | 1710 | static void __init biovec_init_slabs(void) |
1650 | { | 1711 | { |
1651 | int i; | 1712 | int i; |
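bio_associate_current() exists for callers that build a bio in the issuing task's context but submit it later from another thread or workqueue; the block layer then charges the IO to the original io_context and blkcg. A usage sketch, assuming the caller owns the bio; the prepare_bio_for_worker name and the worker hand-off are invented, and it assumes <linux/bio.h>, <linux/iocontext.h> and <linux/sched.h>:

/* Tag a bio with the current task before handing it to a worker thread. */
static int prepare_bio_for_worker(struct bio *bio)
{
	struct io_context *ioc;
	int ret;

	/* bio_associate_current() requires current->io_context to exist */
	ioc = get_task_io_context(current, GFP_KERNEL, NUMA_NO_NODE);
	if (!ioc)
		return -ENOMEM;

	ret = bio_associate_current(bio);	/* takes its own ioc/css refs */
	put_io_context(ioc);			/* drop the lookup reference */
	if (ret && ret != -EBUSY)
		return ret;

	/* ...queue the bio to the driver's worker; it is now accounted to us */
	return 0;
}

No explicit undo is needed on the completion side: the bio_put() hunk above calls bio_disassociate_task() when the last reference is dropped.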
diff --git a/fs/ioprio.c b/fs/ioprio.c index 5e6dbe8958fc..e50170ca7c33 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c | |||
@@ -50,7 +50,7 @@ int set_task_ioprio(struct task_struct *task, int ioprio) | |||
50 | 50 | ||
51 | ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); | 51 | ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); |
52 | if (ioc) { | 52 | if (ioc) { |
53 | ioc_ioprio_changed(ioc, ioprio); | 53 | ioc->ioprio = ioprio; |
54 | put_io_context(ioc); | 54 | put_io_context(ioc); |
55 | } | 55 | } |
56 | 56 | ||
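With the ICQ_IOPRIO_CHANGED notification gone, set_task_ioprio() simply stores the value and schedulers read ioc->ioprio the next time they set up a request. For reference, the userspace call that ends up here; glibc has no wrapper, so this sketch uses the raw syscall and re-defines the encoding locally to mirror include/linux/ioprio.h (treat the locally defined constants as assumptions):

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#define IOPRIO_CLASS_SHIFT		13
#define IOPRIO_PRIO_VALUE(cl, data)	(((cl) << IOPRIO_CLASS_SHIFT) | (data))
#define IOPRIO_WHO_PROCESS		1
#define IOPRIO_CLASS_BE			2

int main(void)
{
	/* best-effort class, level 4 (IOPRIO_NORM), for the calling process */
	if (syscall(SYS_ioprio_set, IOPRIO_WHO_PROCESS, 0,
		    IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4)) < 0) {
		perror("ioprio_set");
		return 1;
	}
	return 0;
}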
diff --git a/fs/splice.c b/fs/splice.c index f8476841eb04..406ef2b792c2 100644 --- a/fs/splice.c +++ b/fs/splice.c | |||
@@ -1388,7 +1388,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, | |||
1388 | */ | 1388 | */ |
1389 | static int get_iovec_page_array(const struct iovec __user *iov, | 1389 | static int get_iovec_page_array(const struct iovec __user *iov, |
1390 | unsigned int nr_vecs, struct page **pages, | 1390 | unsigned int nr_vecs, struct page **pages, |
1391 | struct partial_page *partial, int aligned, | 1391 | struct partial_page *partial, bool aligned, |
1392 | unsigned int pipe_buffers) | 1392 | unsigned int pipe_buffers) |
1393 | { | 1393 | { |
1394 | int buffers = 0, error = 0; | 1394 | int buffers = 0, error = 0; |
@@ -1626,7 +1626,7 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, | |||
1626 | return -ENOMEM; | 1626 | return -ENOMEM; |
1627 | 1627 | ||
1628 | spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages, | 1628 | spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages, |
1629 | spd.partial, flags & SPLICE_F_GIFT, | 1629 | spd.partial, false, |
1630 | pipe->buffers); | 1630 | pipe->buffers); |
1631 | if (spd.nr_pages <= 0) | 1631 | if (spd.nr_pages <= 0) |
1632 | ret = spd.nr_pages; | 1632 | ret = spd.nr_pages; |
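These two splice.c hunks relax the SPLICE_F_GIFT restriction: get_iovec_page_array() is now always called with aligned=false, whereas the old code passed flags & SPLICE_F_GIFT as the aligned flag and returned EINVAL for unaligned iovecs when gifting. A userspace sketch of the call this affects; buffer sizes and offsets are arbitrary:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	char *buf = malloc(8192);
	struct iovec iov;
	int pfd[2];

	if (!buf || pipe(pfd) < 0)
		return 1;
	memset(buf, 'x', 8192);

	/* deliberately not page-aligned: rejected with EINVAL before this change */
	iov.iov_base = buf + 100;
	iov.iov_len  = 4000;

	if (vmsplice(pfd[1], &iov, 1, SPLICE_F_GIFT) < 0) {
		perror("vmsplice");
		return 1;
	}
	return 0;
}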
diff --git a/include/linux/bio.h b/include/linux/bio.h index 4d94eb8bcbcc..26435890dc87 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h | |||
@@ -269,6 +269,14 @@ extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set | |||
269 | extern void bvec_free_bs(struct bio_set *, struct bio_vec *, unsigned int); | 269 | extern void bvec_free_bs(struct bio_set *, struct bio_vec *, unsigned int); |
270 | extern unsigned int bvec_nr_vecs(unsigned short idx); | 270 | extern unsigned int bvec_nr_vecs(unsigned short idx); |
271 | 271 | ||
272 | #ifdef CONFIG_BLK_CGROUP | ||
273 | int bio_associate_current(struct bio *bio); | ||
274 | void bio_disassociate_task(struct bio *bio); | ||
275 | #else /* CONFIG_BLK_CGROUP */ | ||
276 | static inline int bio_associate_current(struct bio *bio) { return -ENOENT; } | ||
277 | static inline void bio_disassociate_task(struct bio *bio) { } | ||
278 | #endif /* CONFIG_BLK_CGROUP */ | ||
279 | |||
272 | /* | 280 | /* |
273 | * bio_set is used to allow other portions of the IO system to | 281 | * bio_set is used to allow other portions of the IO system to |
274 | * allocate their own private memory pools for bio and iovec structures. | 282 | * allocate their own private memory pools for bio and iovec structures. |
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 4053cbd4490e..0edb65dd8edd 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h | |||
@@ -14,6 +14,8 @@ struct bio; | |||
14 | struct bio_integrity_payload; | 14 | struct bio_integrity_payload; |
15 | struct page; | 15 | struct page; |
16 | struct block_device; | 16 | struct block_device; |
17 | struct io_context; | ||
18 | struct cgroup_subsys_state; | ||
17 | typedef void (bio_end_io_t) (struct bio *, int); | 19 | typedef void (bio_end_io_t) (struct bio *, int); |
18 | typedef void (bio_destructor_t) (struct bio *); | 20 | typedef void (bio_destructor_t) (struct bio *); |
19 | 21 | ||
@@ -66,6 +68,14 @@ struct bio { | |||
66 | bio_end_io_t *bi_end_io; | 68 | bio_end_io_t *bi_end_io; |
67 | 69 | ||
68 | void *bi_private; | 70 | void *bi_private; |
71 | #ifdef CONFIG_BLK_CGROUP | ||
72 | /* | ||
73 | * Optional ioc and css associated with this bio. Put on bio | ||
74 | * release. Read comment on top of bio_associate_current(). | ||
75 | */ | ||
76 | struct io_context *bi_ioc; | ||
77 | struct cgroup_subsys_state *bi_css; | ||
78 | #endif | ||
69 | #if defined(CONFIG_BLK_DEV_INTEGRITY) | 79 | #if defined(CONFIG_BLK_DEV_INTEGRITY) |
70 | struct bio_integrity_payload *bi_integrity; /* data integrity */ | 80 | struct bio_integrity_payload *bi_integrity; /* data integrity */ |
71 | #endif | 81 | #endif |
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4d4ac24a263e..ba43f408baa3 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h | |||
@@ -32,10 +32,17 @@ struct blk_trace; | |||
32 | struct request; | 32 | struct request; |
33 | struct sg_io_hdr; | 33 | struct sg_io_hdr; |
34 | struct bsg_job; | 34 | struct bsg_job; |
35 | struct blkcg_gq; | ||
35 | 36 | ||
36 | #define BLKDEV_MIN_RQ 4 | 37 | #define BLKDEV_MIN_RQ 4 |
37 | #define BLKDEV_MAX_RQ 128 /* Default maximum */ | 38 | #define BLKDEV_MAX_RQ 128 /* Default maximum */ |
38 | 39 | ||
40 | /* | ||
41 | * Maximum number of blkcg policies allowed to be registered concurrently. | ||
42 | * Defined here to simplify include dependency. | ||
43 | */ | ||
44 | #define BLKCG_MAX_POLS 2 | ||
45 | |||
39 | struct request; | 46 | struct request; |
40 | typedef void (rq_end_io_fn)(struct request *, int); | 47 | typedef void (rq_end_io_fn)(struct request *, int); |
41 | 48 | ||
@@ -363,6 +370,11 @@ struct request_queue { | |||
363 | struct list_head timeout_list; | 370 | struct list_head timeout_list; |
364 | 371 | ||
365 | struct list_head icq_list; | 372 | struct list_head icq_list; |
373 | #ifdef CONFIG_BLK_CGROUP | ||
374 | DECLARE_BITMAP (blkcg_pols, BLKCG_MAX_POLS); | ||
375 | struct blkcg_gq *root_blkg; | ||
376 | struct list_head blkg_list; | ||
377 | #endif | ||
366 | 378 | ||
367 | struct queue_limits limits; | 379 | struct queue_limits limits; |
368 | 380 | ||
@@ -390,12 +402,17 @@ struct request_queue { | |||
390 | 402 | ||
391 | struct mutex sysfs_lock; | 403 | struct mutex sysfs_lock; |
392 | 404 | ||
405 | int bypass_depth; | ||
406 | |||
393 | #if defined(CONFIG_BLK_DEV_BSG) | 407 | #if defined(CONFIG_BLK_DEV_BSG) |
394 | bsg_job_fn *bsg_job_fn; | 408 | bsg_job_fn *bsg_job_fn; |
395 | int bsg_job_size; | 409 | int bsg_job_size; |
396 | struct bsg_class_device bsg_dev; | 410 | struct bsg_class_device bsg_dev; |
397 | #endif | 411 | #endif |
398 | 412 | ||
413 | #ifdef CONFIG_BLK_CGROUP | ||
414 | struct list_head all_q_node; | ||
415 | #endif | ||
399 | #ifdef CONFIG_BLK_DEV_THROTTLING | 416 | #ifdef CONFIG_BLK_DEV_THROTTLING |
400 | /* Throttle data */ | 417 | /* Throttle data */ |
401 | struct throtl_data *td; | 418 | struct throtl_data *td; |
@@ -407,7 +424,7 @@ struct request_queue { | |||
407 | #define QUEUE_FLAG_SYNCFULL 3 /* read queue has been filled */ | 424 | #define QUEUE_FLAG_SYNCFULL 3 /* read queue has been filled */ |
408 | #define QUEUE_FLAG_ASYNCFULL 4 /* write queue has been filled */ | 425 | #define QUEUE_FLAG_ASYNCFULL 4 /* write queue has been filled */ |
409 | #define QUEUE_FLAG_DEAD 5 /* queue being torn down */ | 426 | #define QUEUE_FLAG_DEAD 5 /* queue being torn down */ |
410 | #define QUEUE_FLAG_ELVSWITCH 6 /* don't use elevator, just do FIFO */ | 427 | #define QUEUE_FLAG_BYPASS 6 /* act as dumb FIFO queue */ |
411 | #define QUEUE_FLAG_BIDI 7 /* queue supports bidi requests */ | 428 | #define QUEUE_FLAG_BIDI 7 /* queue supports bidi requests */ |
412 | #define QUEUE_FLAG_NOMERGES 8 /* disable merge attempts */ | 429 | #define QUEUE_FLAG_NOMERGES 8 /* disable merge attempts */ |
413 | #define QUEUE_FLAG_SAME_COMP 9 /* complete on same CPU-group */ | 430 | #define QUEUE_FLAG_SAME_COMP 9 /* complete on same CPU-group */ |
@@ -491,6 +508,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) | |||
491 | #define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags) | 508 | #define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags) |
492 | #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) | 509 | #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) |
493 | #define blk_queue_dead(q) test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags) | 510 | #define blk_queue_dead(q) test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags) |
511 | #define blk_queue_bypass(q) test_bit(QUEUE_FLAG_BYPASS, &(q)->queue_flags) | ||
494 | #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) | 512 | #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) |
495 | #define blk_queue_noxmerges(q) \ | 513 | #define blk_queue_noxmerges(q) \ |
496 | test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) | 514 | test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) |
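bypass_depth makes bypass mode nestable: QUEUE_FLAG_BYPASS stays set as long as any caller (the elevator switch above is one) still holds a bypass reference, and hot paths only test the flag via blk_queue_bypass(). The real helpers live in block/blk-core.c and also drain in-flight requests; the sketch below only illustrates the counting semantics implied by the new fields, and the *_sketch names are invented:

/* Illustrative only; the in-tree helpers also drain the queue. */
static void bypass_start_sketch(struct request_queue *q)
{
	spin_lock_irq(q->queue_lock);
	q->bypass_depth++;
	queue_flag_set(QUEUE_FLAG_BYPASS, q);
	spin_unlock_irq(q->queue_lock);
}

static void bypass_end_sketch(struct request_queue *q)
{
	spin_lock_irq(q->queue_lock);
	if (!--q->bypass_depth)
		queue_flag_clear(QUEUE_FLAG_BYPASS, q);
	WARN_ON_ONCE(q->bypass_depth < 0);
	spin_unlock_irq(q->queue_lock);
}

/* Hot-path users never look at the counter, only at the flag: */
static bool request_should_bypass(struct request_queue *q)
{
	return blk_queue_bypass(q);
}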
diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 7d4e0356f329..c03af7687bb4 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h | |||
@@ -28,12 +28,13 @@ typedef int (elevator_may_queue_fn) (struct request_queue *, int); | |||
28 | 28 | ||
29 | typedef void (elevator_init_icq_fn) (struct io_cq *); | 29 | typedef void (elevator_init_icq_fn) (struct io_cq *); |
30 | typedef void (elevator_exit_icq_fn) (struct io_cq *); | 30 | typedef void (elevator_exit_icq_fn) (struct io_cq *); |
31 | typedef int (elevator_set_req_fn) (struct request_queue *, struct request *, gfp_t); | 31 | typedef int (elevator_set_req_fn) (struct request_queue *, struct request *, |
32 | struct bio *, gfp_t); | ||
32 | typedef void (elevator_put_req_fn) (struct request *); | 33 | typedef void (elevator_put_req_fn) (struct request *); |
33 | typedef void (elevator_activate_req_fn) (struct request_queue *, struct request *); | 34 | typedef void (elevator_activate_req_fn) (struct request_queue *, struct request *); |
34 | typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct request *); | 35 | typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct request *); |
35 | 36 | ||
36 | typedef void *(elevator_init_fn) (struct request_queue *); | 37 | typedef int (elevator_init_fn) (struct request_queue *); |
37 | typedef void (elevator_exit_fn) (struct elevator_queue *); | 38 | typedef void (elevator_exit_fn) (struct elevator_queue *); |
38 | 39 | ||
39 | struct elevator_ops | 40 | struct elevator_ops |
@@ -129,7 +130,8 @@ extern void elv_unregister_queue(struct request_queue *q); | |||
129 | extern int elv_may_queue(struct request_queue *, int); | 130 | extern int elv_may_queue(struct request_queue *, int); |
130 | extern void elv_abort_queue(struct request_queue *); | 131 | extern void elv_abort_queue(struct request_queue *); |
131 | extern void elv_completed_request(struct request_queue *, struct request *); | 132 | extern void elv_completed_request(struct request_queue *, struct request *); |
132 | extern int elv_set_request(struct request_queue *, struct request *, gfp_t); | 133 | extern int elv_set_request(struct request_queue *q, struct request *rq, |
134 | struct bio *bio, gfp_t gfp_mask); | ||
133 | extern void elv_put_request(struct request_queue *, struct request *); | 135 | extern void elv_put_request(struct request_queue *, struct request *); |
134 | extern void elv_drain_elevator(struct request_queue *); | 136 | extern void elv_drain_elevator(struct request_queue *); |
135 | 137 | ||
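Both typedef changes ripple into every scheduler: init callbacks now return an errno (see the noop hunk earlier), and set_req callbacks receive the originating bio so group and priority state can be resolved while the request is being set up. A hedged skeleton under the new signatures, reusing the invented mysched_* names from above; fields unrelated to the signature change are omitted, and it assumes <linux/elevator.h> and <linux/module.h>:

static int mysched_set_request(struct request_queue *q, struct request *rq,
			       struct bio *bio, gfp_t gfp_mask)
{
	/* the bio (may be NULL) identifies the issuing context */
	rq->elv.priv[0] = NULL;		/* per-request scheduler data */
	return 0;
}

static struct elevator_type mysched_elevator = {
	.ops = {
		.elevator_init_fn	= mysched_init_queue,	/* int return */
		.elevator_exit_fn	= mysched_exit_queue,
		.elevator_set_req_fn	= mysched_set_request,	/* new bio arg */
	},
	.elevator_name	= "mysched",
	.elevator_owner	= THIS_MODULE,
};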
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 1a3018063034..df38db2ef45b 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h | |||
@@ -6,11 +6,7 @@ | |||
6 | #include <linux/workqueue.h> | 6 | #include <linux/workqueue.h> |
7 | 7 | ||
8 | enum { | 8 | enum { |
9 | ICQ_IOPRIO_CHANGED = 1 << 0, | ||
10 | ICQ_CGROUP_CHANGED = 1 << 1, | ||
11 | ICQ_EXITED = 1 << 2, | 9 | ICQ_EXITED = 1 << 2, |
12 | |||
13 | ICQ_CHANGED_MASK = ICQ_IOPRIO_CHANGED | ICQ_CGROUP_CHANGED, | ||
14 | }; | 10 | }; |
15 | 11 | ||
16 | /* | 12 | /* |
@@ -100,6 +96,7 @@ struct io_cq { | |||
100 | */ | 96 | */ |
101 | struct io_context { | 97 | struct io_context { |
102 | atomic_long_t refcount; | 98 | atomic_long_t refcount; |
99 | atomic_t active_ref; | ||
103 | atomic_t nr_tasks; | 100 | atomic_t nr_tasks; |
104 | 101 | ||
105 | /* all the fields below are protected by this lock */ | 102 | /* all the fields below are protected by this lock */ |
@@ -120,29 +117,37 @@ struct io_context { | |||
120 | struct work_struct release_work; | 117 | struct work_struct release_work; |
121 | }; | 118 | }; |
122 | 119 | ||
123 | static inline struct io_context *ioc_task_link(struct io_context *ioc) | 120 | /** |
121 | * get_io_context_active - get active reference on ioc | ||
122 | * @ioc: ioc of interest | ||
123 | * | ||
124 | * Only iocs with active reference can issue new IOs. This function | ||
125 | * acquires an active reference on @ioc. The caller must already have an | ||
126 | * active reference on @ioc. | ||
127 | */ | ||
128 | static inline void get_io_context_active(struct io_context *ioc) | ||
124 | { | 129 | { |
125 | /* | 130 | WARN_ON_ONCE(atomic_long_read(&ioc->refcount) <= 0); |
126 | * if ref count is zero, don't allow sharing (ioc is going away, it's | 131 | WARN_ON_ONCE(atomic_read(&ioc->active_ref) <= 0); |
127 | * a race). | 132 | atomic_long_inc(&ioc->refcount); |
128 | */ | 133 | atomic_inc(&ioc->active_ref); |
129 | if (ioc && atomic_long_inc_not_zero(&ioc->refcount)) { | 134 | } |
130 | atomic_inc(&ioc->nr_tasks); | 135 | |
131 | return ioc; | 136 | static inline void ioc_task_link(struct io_context *ioc) |
132 | } | 137 | { |
138 | get_io_context_active(ioc); | ||
133 | 139 | ||
134 | return NULL; | 140 | WARN_ON_ONCE(atomic_read(&ioc->nr_tasks) <= 0); |
141 | atomic_inc(&ioc->nr_tasks); | ||
135 | } | 142 | } |
136 | 143 | ||
137 | struct task_struct; | 144 | struct task_struct; |
138 | #ifdef CONFIG_BLOCK | 145 | #ifdef CONFIG_BLOCK |
139 | void put_io_context(struct io_context *ioc); | 146 | void put_io_context(struct io_context *ioc); |
147 | void put_io_context_active(struct io_context *ioc); | ||
140 | void exit_io_context(struct task_struct *task); | 148 | void exit_io_context(struct task_struct *task); |
141 | struct io_context *get_task_io_context(struct task_struct *task, | 149 | struct io_context *get_task_io_context(struct task_struct *task, |
142 | gfp_t gfp_flags, int node); | 150 | gfp_t gfp_flags, int node); |
143 | void ioc_ioprio_changed(struct io_context *ioc, int ioprio); | ||
144 | void ioc_cgroup_changed(struct io_context *ioc); | ||
145 | unsigned int icq_get_changed(struct io_cq *icq); | ||
146 | #else | 151 | #else |
147 | struct io_context; | 152 | struct io_context; |
148 | static inline void put_io_context(struct io_context *ioc) { } | 153 | static inline void put_io_context(struct io_context *ioc) { } |
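The io_context now carries two counters: refcount keeps the structure allocated, while the new active_ref tracks users that may still issue IO (tasks, plus bios via bio_associate_current()), so the block layer can tear down per-queue state once the last active user is gone. A short sketch of the intended pairing; the grab/drop helper names are invented and the code assumes <linux/iocontext.h> and <linux/sched.h>:

static struct io_context *grab_issuer_context(void)
{
	struct io_context *ioc = current->io_context;

	if (!ioc)
		return NULL;
	get_io_context_active(ioc);	/* + refcount, + active_ref */
	return ioc;
}

static void drop_issuer_context(struct io_context *ioc)
{
	put_io_context_active(ioc);	/* - active_ref, then - refcount */
}

ioc_task_link(), used by the CLONE_IO path in kernel/fork.c below, additionally bumps nr_tasks on top of the active reference.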
diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index 76dad4808847..beb9ce1c2c23 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h | |||
@@ -42,26 +42,14 @@ enum { | |||
42 | }; | 42 | }; |
43 | 43 | ||
44 | /* | 44 | /* |
45 | * if process has set io priority explicitly, use that. if not, convert | 45 | * Fallback BE priority |
46 | * the cpu scheduler nice value to an io priority | ||
47 | */ | 46 | */ |
48 | #define IOPRIO_NORM (4) | 47 | #define IOPRIO_NORM (4) |
49 | static inline int task_ioprio(struct io_context *ioc) | ||
50 | { | ||
51 | if (ioprio_valid(ioc->ioprio)) | ||
52 | return IOPRIO_PRIO_DATA(ioc->ioprio); | ||
53 | |||
54 | return IOPRIO_NORM; | ||
55 | } | ||
56 | |||
57 | static inline int task_ioprio_class(struct io_context *ioc) | ||
58 | { | ||
59 | if (ioprio_valid(ioc->ioprio)) | ||
60 | return IOPRIO_PRIO_CLASS(ioc->ioprio); | ||
61 | |||
62 | return IOPRIO_CLASS_BE; | ||
63 | } | ||
64 | 48 | ||
49 | /* | ||
50 | * if process has set io priority explicitly, use that. if not, convert | ||
51 | * the cpu scheduler nice value to an io priority | ||
52 | */ | ||
65 | static inline int task_nice_ioprio(struct task_struct *task) | 53 | static inline int task_nice_ioprio(struct task_struct *task) |
66 | { | 54 | { |
67 | return (task_nice(task) + 20) / 5; | 55 | return (task_nice(task) + 20) / 5; |
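With task_ioprio() and task_ioprio_class() removed from this header, a scheduler that wants an effective priority open-codes the fallback against ioc->ioprio itself. The sketch below mirrors, but does not copy, what cfq now does internally, combining the explicit ioprio with the nice-value conversion that stays in this header; the function name is invented:

static int effective_ioprio(struct task_struct *task, struct io_context *ioc)
{
	if (ioprio_valid(ioc->ioprio))
		return ioc->ioprio;	/* explicitly set via ioprio_set() */

	/* otherwise derive a best-effort priority from the nice value */
	return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, task_nice_ioprio(task));
}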
diff --git a/init/Kconfig b/init/Kconfig index 81816b82860b..1e004d057468 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
@@ -803,7 +803,7 @@ config RT_GROUP_SCHED | |||
803 | endif #CGROUP_SCHED | 803 | endif #CGROUP_SCHED |
804 | 804 | ||
805 | config BLK_CGROUP | 805 | config BLK_CGROUP |
806 | tristate "Block IO controller" | 806 | bool "Block IO controller" |
807 | depends on BLOCK | 807 | depends on BLOCK |
808 | default n | 808 | default n |
809 | ---help--- | 809 | ---help--- |
diff --git a/kernel/fork.c b/kernel/fork.c index 017fb23d5983..31a32c7dd169 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -976,9 +976,8 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) | |||
976 | * Share io context with parent, if CLONE_IO is set | 976 | * Share io context with parent, if CLONE_IO is set |
977 | */ | 977 | */ |
978 | if (clone_flags & CLONE_IO) { | 978 | if (clone_flags & CLONE_IO) { |
979 | tsk->io_context = ioc_task_link(ioc); | 979 | ioc_task_link(ioc); |
980 | if (unlikely(!tsk->io_context)) | 980 | tsk->io_context = ioc; |
981 | return -ENOMEM; | ||
982 | } else if (ioprio_valid(ioc->ioprio)) { | 981 | } else if (ioprio_valid(ioc->ioprio)) { |
983 | new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); | 982 | new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); |
984 | if (unlikely(!new_ioc)) | 983 | if (unlikely(!new_ioc)) |
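The copy_io() change is on the CLONE_IO path: since ioc_task_link() can no longer fail, sharing the parent's io_context is unconditional once the parent has one. For reference, the userspace side that exercises this path; stack size and child body are arbitrary, and CLONE_IO needs _GNU_SOURCE:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

static int child_fn(void *arg)
{
	/* shares the parent's io_context: same ioprio, same scheduler queues */
	(void)arg;
	return 0;
}

int main(void)
{
	const size_t stack_size = 64 * 1024;
	char *stack = malloc(stack_size);
	pid_t pid;

	if (!stack)
		return 1;

	/* stack grows down on the common architectures, hence stack + size */
	pid = clone(child_fn, stack + stack_size, CLONE_IO | SIGCHLD, NULL);
	if (pid < 0) {
		perror("clone");
		return 1;
	}
	waitpid(pid, NULL, 0);
	free(stack);
	return 0;
}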