author     Jonathan Herman <hermanjl@cs.unc.edu>  2013-01-17 16:15:55 -0500
committer  Jonathan Herman <hermanjl@cs.unc.edu>  2013-01-17 16:15:55 -0500
commit     8dea78da5cee153b8af9c07a2745f6c55057fe12 (patch)
tree       a8f4d49d63b1ecc92f2fddceba0655b2472c5bd9 /block
parent     406089d01562f1e2bf9f089fd7637009ebaad589 (diff)
Patched in Tegra support.
Diffstat (limited to 'block')
62 files changed, 4105 insertions, 10436 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 4a85ccf8d4c..e97934eecec 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -4,7 +4,6 @@
 menuconfig BLOCK
 	bool "Enable the block layer" if EXPERT
 	default y
-	select PERCPU_RWSEM
 	help
 	  Provide block layer support for the kernel.
 
@@ -90,7 +89,7 @@ config BLK_DEV_INTEGRITY
 
 config BLK_DEV_THROTTLING
 	bool "Block layer bio throttling support"
-	depends on BLK_CGROUP=y
+	depends on BLK_CGROUP=y && EXPERIMENTAL
 	default n
 	---help---
 	  Block layer bio throttling support. It can be used to limit
@@ -100,12 +99,6 @@ config BLK_DEV_THROTTLING
 
 	  See Documentation/cgroups/blkio-controller.txt for more information.
 
-menu "Partition Types"
-
-source "block/partitions/Kconfig"
-
-endmenu
-
 endif # BLOCK
 
 config BLOCK_COMPAT
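Aside (not part of the patch): BLK_DEV_THROTTLING is a bool symbol, so C code normally consumes it through preprocessor guards or the IS_ENABLED() helper from <linux/kconfig.h>. A minimal sketch under that assumption; the helper function below is invented for illustration only.

    #include <linux/kconfig.h>	/* IS_ENABLED(), IS_BUILTIN(), IS_MODULE() */

    /* Invented helper: shows how a Kconfig bool gates code at compile time. */
    static inline bool blk_throttling_compiled_in(void)
    {
    	/* true only when CONFIG_BLK_DEV_THROTTLING=y in the .config */
    	return IS_ENABLED(CONFIG_BLK_DEV_THROTTLING);
    }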
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 421bef9c4c4..3199b76f795 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -23,6 +23,8 @@ config IOSCHED_DEADLINE
 
 config IOSCHED_CFQ
 	tristate "CFQ I/O scheduler"
+	# If BLK_CGROUP is a module, CFQ has to be built as module.
+	depends on (BLK_CGROUP=m && m) || !BLK_CGROUP || BLK_CGROUP=y
 	default y
 	---help---
 	  The CFQ I/O scheduler tries to distribute bandwidth equally
@@ -32,6 +34,8 @@ config IOSCHED_CFQ
 
 	  This is the default I/O scheduler.
 
+	  Note: If BLK_CGROUP=m, then CFQ can be built only as module.
+
 config CFQ_GROUP_IOSCHED
 	bool "CFQ Group Scheduling support"
 	depends on IOSCHED_CFQ && BLK_CGROUP
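The dependency added above encodes a tristate constraint: when BLK_CGROUP=m, CFQ itself can only be built as a module, so CFQ code is never built in while blk-cgroup is modular. Illustrative only (not from this patch), the same invariant can be spelled out in C with the kconfig helpers:

    #include <linux/kconfig.h>

    /*
     * With "depends on (BLK_CGROUP=m && m) || !BLK_CGROUP || BLK_CGROUP=y"
     * in place, this combination should be impossible.
     */
    #if IS_BUILTIN(CONFIG_IOSCHED_CFQ) && IS_MODULE(CONFIG_BLK_CGROUP)
    #error "CFQ built in while blk-cgroup is a module; Kconfig should prevent this"
    #endif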
diff --git a/block/Makefile b/block/Makefile
index 39b76ba66ff..514c6e4f427 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,8 +5,7 @@
 obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
 			blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
 			blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
-			blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o \
-			partition-generic.o partitions/
+			blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o
 
 obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
 obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o
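The next file, block/blk-cgroup.c, reverts from the blkcg_gq/radix-tree design back to the older blkio_group interface (blkiocg_add_blkio_group(), blkiocg_lookup_group(), per-cpu stats via blkio_alloc_blkg_stats()). A hedged sketch of how a policy would typically drive that interface, using the signatures visible in the diff below; the wrapper function, its name, and the choice of the queue pointer as the opaque lookup key are illustrative assumptions, not code from this tree.

    #include <linux/blkdev.h>
    #include <linux/rcupdate.h>
    #include "blk-cgroup.h"

    /* Hypothetical caller, loosely modeled on how CFQ/throttle used this API. */
    static struct blkio_group *example_get_group(struct request_queue *q,
    					     struct blkio_cgroup *blkcg,
    					     struct blkio_group *new_blkg,
    					     dev_t dev)
    {
    	struct blkio_group *blkg;

    	/* Lookups walk an RCU-protected hlist keyed by an opaque pointer. */
    	rcu_read_lock();
    	blkg = blkiocg_lookup_group(blkcg, q);
    	rcu_read_unlock();
    	if (blkg)
    		return blkg;

    	/* Per-cpu stats must be allocated from sleepable context. */
    	if (blkio_alloc_blkg_stats(new_blkg))
    		return NULL;	/* allocation failed; caller frees new_blkg */

    	/* Hashes the group into the cgroup under blkcg->lock internally. */
    	blkiocg_add_blkio_group(blkcg, new_blkg, q, dev, BLKIO_POLICY_PROP);
    	return new_blkg;
    }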
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index b8858fb0caf..b596e54ddd7 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -11,959 +11,1656 @@
11 | * Nauman Rafique <nauman@google.com> | 11 | * Nauman Rafique <nauman@google.com> |
12 | */ | 12 | */ |
13 | #include <linux/ioprio.h> | 13 | #include <linux/ioprio.h> |
14 | #include <linux/seq_file.h> | ||
14 | #include <linux/kdev_t.h> | 15 | #include <linux/kdev_t.h> |
15 | #include <linux/module.h> | 16 | #include <linux/module.h> |
16 | #include <linux/err.h> | 17 | #include <linux/err.h> |
17 | #include <linux/blkdev.h> | 18 | #include <linux/blkdev.h> |
18 | #include <linux/slab.h> | 19 | #include <linux/slab.h> |
19 | #include <linux/genhd.h> | ||
20 | #include <linux/delay.h> | ||
21 | #include <linux/atomic.h> | ||
22 | #include "blk-cgroup.h" | 20 | #include "blk-cgroup.h" |
23 | #include "blk.h" | 21 | #include <linux/genhd.h> |
24 | 22 | ||
25 | #define MAX_KEY_LEN 100 | 23 | #define MAX_KEY_LEN 100 |
26 | 24 | ||
27 | static DEFINE_MUTEX(blkcg_pol_mutex); | 25 | static DEFINE_SPINLOCK(blkio_list_lock); |
26 | static LIST_HEAD(blkio_list); | ||
28 | 27 | ||
29 | struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT }; | 28 | struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT }; |
30 | EXPORT_SYMBOL_GPL(blkcg_root); | 29 | EXPORT_SYMBOL_GPL(blkio_root_cgroup); |
31 | 30 | ||
32 | static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; | 31 | static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *, |
32 | struct cgroup *); | ||
33 | static int blkiocg_can_attach_task(struct cgroup *, struct task_struct *); | ||
34 | static void blkiocg_attach_task(struct cgroup *, struct task_struct *); | ||
35 | static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *); | ||
36 | static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *); | ||
33 | 37 | ||
34 | static bool blkcg_policy_enabled(struct request_queue *q, | 38 | /* for encoding cft->private value on file */ |
35 | const struct blkcg_policy *pol) | 39 | #define BLKIOFILE_PRIVATE(x, val) (((x) << 16) | (val)) |
40 | /* What policy owns the file, proportional or throttle */ | ||
41 | #define BLKIOFILE_POLICY(val) (((val) >> 16) & 0xffff) | ||
42 | #define BLKIOFILE_ATTR(val) ((val) & 0xffff) | ||
43 | |||
44 | struct cgroup_subsys blkio_subsys = { | ||
45 | .name = "blkio", | ||
46 | .create = blkiocg_create, | ||
47 | .can_attach_task = blkiocg_can_attach_task, | ||
48 | .attach_task = blkiocg_attach_task, | ||
49 | .destroy = blkiocg_destroy, | ||
50 | .populate = blkiocg_populate, | ||
51 | #ifdef CONFIG_BLK_CGROUP | ||
52 | /* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */ | ||
53 | .subsys_id = blkio_subsys_id, | ||
54 | #endif | ||
55 | .use_id = 1, | ||
56 | .module = THIS_MODULE, | ||
57 | }; | ||
58 | EXPORT_SYMBOL_GPL(blkio_subsys); | ||
59 | |||
60 | static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg, | ||
61 | struct blkio_policy_node *pn) | ||
36 | { | 62 | { |
37 | return pol && test_bit(pol->plid, q->blkcg_pols); | 63 | list_add(&pn->node, &blkcg->policy_list); |
38 | } | 64 | } |
39 | 65 | ||
40 | /** | 66 | static inline bool cftype_blkg_same_policy(struct cftype *cft, |
41 | * blkg_free - free a blkg | 67 | struct blkio_group *blkg) |
42 | * @blkg: blkg to free | ||
43 | * | ||
44 | * Free @blkg which may be partially allocated. | ||
45 | */ | ||
46 | static void blkg_free(struct blkcg_gq *blkg) | ||
47 | { | 68 | { |
48 | int i; | 69 | enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); |
49 | 70 | ||
50 | if (!blkg) | 71 | if (blkg->plid == plid) |
51 | return; | 72 | return 1; |
73 | |||
74 | return 0; | ||
75 | } | ||
52 | 76 | ||
53 | for (i = 0; i < BLKCG_MAX_POLS; i++) { | 77 | /* Determines if policy node matches cgroup file being accessed */ |
54 | struct blkcg_policy *pol = blkcg_policy[i]; | 78 | static inline bool pn_matches_cftype(struct cftype *cft, |
55 | struct blkg_policy_data *pd = blkg->pd[i]; | 79 | struct blkio_policy_node *pn) |
80 | { | ||
81 | enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); | ||
82 | int fileid = BLKIOFILE_ATTR(cft->private); | ||
56 | 83 | ||
57 | if (!pd) | 84 | return (plid == pn->plid && fileid == pn->fileid); |
58 | continue; | 85 | } |
59 | 86 | ||
60 | if (pol && pol->pd_exit_fn) | 87 | /* Must be called with blkcg->lock held */ |
61 | pol->pd_exit_fn(blkg); | 88 | static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) |
89 | { | ||
90 | list_del(&pn->node); | ||
91 | } | ||
62 | 92 | ||
63 | kfree(pd); | 93 | /* Must be called with blkcg->lock held */ |
94 | static struct blkio_policy_node * | ||
95 | blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev, | ||
96 | enum blkio_policy_id plid, int fileid) | ||
97 | { | ||
98 | struct blkio_policy_node *pn; | ||
99 | |||
100 | list_for_each_entry(pn, &blkcg->policy_list, node) { | ||
101 | if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid) | ||
102 | return pn; | ||
64 | } | 103 | } |
65 | 104 | ||
66 | blk_exit_rl(&blkg->rl); | 105 | return NULL; |
67 | kfree(blkg); | ||
68 | } | 106 | } |
69 | 107 | ||
70 | /** | 108 | struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) |
71 | * blkg_alloc - allocate a blkg | ||
72 | * @blkcg: block cgroup the new blkg is associated with | ||
73 | * @q: request_queue the new blkg is associated with | ||
74 | * @gfp_mask: allocation mask to use | ||
75 | * | ||
76 | * Allocate a new blkg assocating @blkcg and @q. | ||
77 | */ | ||
78 | static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, | ||
79 | gfp_t gfp_mask) | ||
80 | { | 109 | { |
81 | struct blkcg_gq *blkg; | 110 | return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), |
82 | int i; | 111 | struct blkio_cgroup, css); |
112 | } | ||
113 | EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); | ||
83 | 114 | ||
84 | /* alloc and init base part */ | 115 | struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk) |
85 | blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node); | 116 | { |
86 | if (!blkg) | 117 | return container_of(task_subsys_state(tsk, blkio_subsys_id), |
87 | return NULL; | 118 | struct blkio_cgroup, css); |
119 | } | ||
120 | EXPORT_SYMBOL_GPL(task_blkio_cgroup); | ||
88 | 121 | ||
89 | blkg->q = q; | 122 | static inline void |
90 | INIT_LIST_HEAD(&blkg->q_node); | 123 | blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight) |
91 | blkg->blkcg = blkcg; | 124 | { |
92 | blkg->refcnt = 1; | 125 | struct blkio_policy_type *blkiop; |
93 | 126 | ||
94 | /* root blkg uses @q->root_rl, init rl only for !root blkgs */ | 127 | list_for_each_entry(blkiop, &blkio_list, list) { |
95 | if (blkcg != &blkcg_root) { | 128 | /* If this policy does not own the blkg, do not send updates */ |
96 | if (blk_init_rl(&blkg->rl, q, gfp_mask)) | 129 | if (blkiop->plid != blkg->plid) |
97 | goto err_free; | 130 | continue; |
98 | blkg->rl.blkg = blkg; | 131 | if (blkiop->ops.blkio_update_group_weight_fn) |
132 | blkiop->ops.blkio_update_group_weight_fn(blkg->key, | ||
133 | blkg, weight); | ||
99 | } | 134 | } |
135 | } | ||
100 | 136 | ||
101 | for (i = 0; i < BLKCG_MAX_POLS; i++) { | 137 | static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps, |
102 | struct blkcg_policy *pol = blkcg_policy[i]; | 138 | int fileid) |
103 | struct blkg_policy_data *pd; | 139 | { |
140 | struct blkio_policy_type *blkiop; | ||
104 | 141 | ||
105 | if (!blkcg_policy_enabled(q, pol)) | 142 | list_for_each_entry(blkiop, &blkio_list, list) { |
106 | continue; | ||
107 | 143 | ||
108 | /* alloc per-policy data and attach it to blkg */ | 144 | /* If this policy does not own the blkg, do not send updates */ |
109 | pd = kzalloc_node(pol->pd_size, gfp_mask, q->node); | 145 | if (blkiop->plid != blkg->plid) |
110 | if (!pd) | 146 | continue; |
111 | goto err_free; | ||
112 | 147 | ||
113 | blkg->pd[i] = pd; | 148 | if (fileid == BLKIO_THROTL_read_bps_device |
114 | pd->blkg = blkg; | 149 | && blkiop->ops.blkio_update_group_read_bps_fn) |
150 | blkiop->ops.blkio_update_group_read_bps_fn(blkg->key, | ||
151 | blkg, bps); | ||
115 | 152 | ||
116 | /* invoke per-policy init */ | 153 | if (fileid == BLKIO_THROTL_write_bps_device |
117 | if (blkcg_policy_enabled(blkg->q, pol)) | 154 | && blkiop->ops.blkio_update_group_write_bps_fn) |
118 | pol->pd_init_fn(blkg); | 155 | blkiop->ops.blkio_update_group_write_bps_fn(blkg->key, |
156 | blkg, bps); | ||
119 | } | 157 | } |
120 | |||
121 | return blkg; | ||
122 | |||
123 | err_free: | ||
124 | blkg_free(blkg); | ||
125 | return NULL; | ||
126 | } | 158 | } |
127 | 159 | ||
128 | static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, | 160 | static inline void blkio_update_group_iops(struct blkio_group *blkg, |
129 | struct request_queue *q) | 161 | unsigned int iops, int fileid) |
130 | { | 162 | { |
131 | struct blkcg_gq *blkg; | 163 | struct blkio_policy_type *blkiop; |
132 | 164 | ||
133 | blkg = rcu_dereference(blkcg->blkg_hint); | 165 | list_for_each_entry(blkiop, &blkio_list, list) { |
134 | if (blkg && blkg->q == q) | ||
135 | return blkg; | ||
136 | 166 | ||
137 | /* | 167 | /* If this policy does not own the blkg, do not send updates */ |
138 | * Hint didn't match. Look up from the radix tree. Note that we | 168 | if (blkiop->plid != blkg->plid) |
139 | * may not be holding queue_lock and thus are not sure whether | 169 | continue; |
140 | * @blkg from blkg_tree has already been removed or not, so we | ||
141 | * can't update hint to the lookup result. Leave it to the caller. | ||
142 | */ | ||
143 | blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); | ||
144 | if (blkg && blkg->q == q) | ||
145 | return blkg; | ||
146 | 170 | ||
147 | return NULL; | 171 | if (fileid == BLKIO_THROTL_read_iops_device |
172 | && blkiop->ops.blkio_update_group_read_iops_fn) | ||
173 | blkiop->ops.blkio_update_group_read_iops_fn(blkg->key, | ||
174 | blkg, iops); | ||
175 | |||
176 | if (fileid == BLKIO_THROTL_write_iops_device | ||
177 | && blkiop->ops.blkio_update_group_write_iops_fn) | ||
178 | blkiop->ops.blkio_update_group_write_iops_fn(blkg->key, | ||
179 | blkg,iops); | ||
180 | } | ||
148 | } | 181 | } |
149 | 182 | ||
150 | /** | 183 | /* |
151 | * blkg_lookup - lookup blkg for the specified blkcg - q pair | 184 | * Add to the appropriate stat variable depending on the request type. |
152 | * @blkcg: blkcg of interest | 185 | * This should be called with the blkg->stats_lock held. |
153 | * @q: request_queue of interest | ||
154 | * | ||
155 | * Lookup blkg for the @blkcg - @q pair. This function should be called | ||
156 | * under RCU read lock and is guaranteed to return %NULL if @q is bypassing | ||
157 | * - see blk_queue_bypass_start() for details. | ||
158 | */ | 186 | */ |
159 | struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) | 187 | static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction, |
188 | bool sync) | ||
160 | { | 189 | { |
161 | WARN_ON_ONCE(!rcu_read_lock_held()); | 190 | if (direction) |
162 | 191 | stat[BLKIO_STAT_WRITE] += add; | |
163 | if (unlikely(blk_queue_bypass(q))) | 192 | else |
164 | return NULL; | 193 | stat[BLKIO_STAT_READ] += add; |
165 | return __blkg_lookup(blkcg, q); | 194 | if (sync) |
195 | stat[BLKIO_STAT_SYNC] += add; | ||
196 | else | ||
197 | stat[BLKIO_STAT_ASYNC] += add; | ||
166 | } | 198 | } |
167 | EXPORT_SYMBOL_GPL(blkg_lookup); | ||
168 | 199 | ||
169 | /* | 200 | /* |
170 | * If @new_blkg is %NULL, this function tries to allocate a new one as | 201 | * Decrements the appropriate stat variable if non-zero depending on the |
171 | * necessary using %GFP_ATOMIC. @new_blkg is always consumed on return. | 202 | * request type. Panics on value being zero. |
203 | * This should be called with the blkg->stats_lock held. | ||
172 | */ | 204 | */ |
173 | static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, | 205 | static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync) |
174 | struct request_queue *q, | ||
175 | struct blkcg_gq *new_blkg) | ||
176 | { | 206 | { |
177 | struct blkcg_gq *blkg; | 207 | if (direction) { |
178 | int ret; | 208 | BUG_ON(stat[BLKIO_STAT_WRITE] == 0); |
209 | stat[BLKIO_STAT_WRITE]--; | ||
210 | } else { | ||
211 | BUG_ON(stat[BLKIO_STAT_READ] == 0); | ||
212 | stat[BLKIO_STAT_READ]--; | ||
213 | } | ||
214 | if (sync) { | ||
215 | BUG_ON(stat[BLKIO_STAT_SYNC] == 0); | ||
216 | stat[BLKIO_STAT_SYNC]--; | ||
217 | } else { | ||
218 | BUG_ON(stat[BLKIO_STAT_ASYNC] == 0); | ||
219 | stat[BLKIO_STAT_ASYNC]--; | ||
220 | } | ||
221 | } | ||
179 | 222 | ||
180 | WARN_ON_ONCE(!rcu_read_lock_held()); | 223 | #ifdef CONFIG_DEBUG_BLK_CGROUP |
181 | lockdep_assert_held(q->queue_lock); | 224 | /* This should be called with the blkg->stats_lock held. */ |
225 | static void blkio_set_start_group_wait_time(struct blkio_group *blkg, | ||
226 | struct blkio_group *curr_blkg) | ||
227 | { | ||
228 | if (blkio_blkg_waiting(&blkg->stats)) | ||
229 | return; | ||
230 | if (blkg == curr_blkg) | ||
231 | return; | ||
232 | blkg->stats.start_group_wait_time = sched_clock(); | ||
233 | blkio_mark_blkg_waiting(&blkg->stats); | ||
234 | } | ||
182 | 235 | ||
183 | /* lookup and update hint on success, see __blkg_lookup() for details */ | 236 | /* This should be called with the blkg->stats_lock held. */ |
184 | blkg = __blkg_lookup(blkcg, q); | 237 | static void blkio_update_group_wait_time(struct blkio_group_stats *stats) |
185 | if (blkg) { | 238 | { |
186 | rcu_assign_pointer(blkcg->blkg_hint, blkg); | 239 | unsigned long long now; |
187 | goto out_free; | ||
188 | } | ||
189 | 240 | ||
190 | /* blkg holds a reference to blkcg */ | 241 | if (!blkio_blkg_waiting(stats)) |
191 | if (!css_tryget(&blkcg->css)) { | 242 | return; |
192 | blkg = ERR_PTR(-EINVAL); | ||
193 | goto out_free; | ||
194 | } | ||
195 | 243 | ||
196 | /* allocate */ | 244 | now = sched_clock(); |
197 | if (!new_blkg) { | 245 | if (time_after64(now, stats->start_group_wait_time)) |
198 | new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC); | 246 | stats->group_wait_time += now - stats->start_group_wait_time; |
199 | if (unlikely(!new_blkg)) { | 247 | blkio_clear_blkg_waiting(stats); |
200 | blkg = ERR_PTR(-ENOMEM); | 248 | } |
201 | goto out_put; | ||
202 | } | ||
203 | } | ||
204 | blkg = new_blkg; | ||
205 | 249 | ||
206 | /* insert */ | 250 | /* This should be called with the blkg->stats_lock held. */ |
207 | spin_lock(&blkcg->lock); | 251 | static void blkio_end_empty_time(struct blkio_group_stats *stats) |
208 | ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); | 252 | { |
209 | if (likely(!ret)) { | 253 | unsigned long long now; |
210 | hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); | ||
211 | list_add(&blkg->q_node, &q->blkg_list); | ||
212 | } | ||
213 | spin_unlock(&blkcg->lock); | ||
214 | 254 | ||
215 | if (!ret) | 255 | if (!blkio_blkg_empty(stats)) |
216 | return blkg; | 256 | return; |
217 | 257 | ||
218 | blkg = ERR_PTR(ret); | 258 | now = sched_clock(); |
219 | out_put: | 259 | if (time_after64(now, stats->start_empty_time)) |
220 | css_put(&blkcg->css); | 260 | stats->empty_time += now - stats->start_empty_time; |
221 | out_free: | 261 | blkio_clear_blkg_empty(stats); |
222 | blkg_free(new_blkg); | ||
223 | return blkg; | ||
224 | } | 262 | } |
225 | 263 | ||
226 | struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, | 264 | void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) |
227 | struct request_queue *q) | ||
228 | { | 265 | { |
229 | /* | 266 | unsigned long flags; |
230 | * This could be the first entry point of blkcg implementation and | 267 | |
231 | * we shouldn't allow anything to go through for a bypassing queue. | 268 | spin_lock_irqsave(&blkg->stats_lock, flags); |
232 | */ | 269 | BUG_ON(blkio_blkg_idling(&blkg->stats)); |
233 | if (unlikely(blk_queue_bypass(q))) | 270 | blkg->stats.start_idle_time = sched_clock(); |
234 | return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY); | 271 | blkio_mark_blkg_idling(&blkg->stats); |
235 | return __blkg_lookup_create(blkcg, q, NULL); | 272 | spin_unlock_irqrestore(&blkg->stats_lock, flags); |
236 | } | 273 | } |
237 | EXPORT_SYMBOL_GPL(blkg_lookup_create); | 274 | EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats); |
238 | 275 | ||
239 | static void blkg_destroy(struct blkcg_gq *blkg) | 276 | void blkiocg_update_idle_time_stats(struct blkio_group *blkg) |
240 | { | 277 | { |
241 | struct blkcg *blkcg = blkg->blkcg; | 278 | unsigned long flags; |
279 | unsigned long long now; | ||
280 | struct blkio_group_stats *stats; | ||
281 | |||
282 | spin_lock_irqsave(&blkg->stats_lock, flags); | ||
283 | stats = &blkg->stats; | ||
284 | if (blkio_blkg_idling(stats)) { | ||
285 | now = sched_clock(); | ||
286 | if (time_after64(now, stats->start_idle_time)) | ||
287 | stats->idle_time += now - stats->start_idle_time; | ||
288 | blkio_clear_blkg_idling(stats); | ||
289 | } | ||
290 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
291 | } | ||
292 | EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats); | ||
242 | 293 | ||
243 | lockdep_assert_held(blkg->q->queue_lock); | 294 | void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) |
244 | lockdep_assert_held(&blkcg->lock); | 295 | { |
296 | unsigned long flags; | ||
297 | struct blkio_group_stats *stats; | ||
298 | |||
299 | spin_lock_irqsave(&blkg->stats_lock, flags); | ||
300 | stats = &blkg->stats; | ||
301 | stats->avg_queue_size_sum += | ||
302 | stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] + | ||
303 | stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]; | ||
304 | stats->avg_queue_size_samples++; | ||
305 | blkio_update_group_wait_time(stats); | ||
306 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
307 | } | ||
308 | EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats); | ||
245 | 309 | ||
246 | /* Something wrong if we are trying to remove same group twice */ | 310 | void blkiocg_set_start_empty_time(struct blkio_group *blkg) |
247 | WARN_ON_ONCE(list_empty(&blkg->q_node)); | 311 | { |
248 | WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node)); | 312 | unsigned long flags; |
313 | struct blkio_group_stats *stats; | ||
249 | 314 | ||
250 | radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); | 315 | spin_lock_irqsave(&blkg->stats_lock, flags); |
251 | list_del_init(&blkg->q_node); | 316 | stats = &blkg->stats; |
252 | hlist_del_init_rcu(&blkg->blkcg_node); | ||
253 | 317 | ||
254 | /* | 318 | if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] || |
255 | * Both setting lookup hint to and clearing it from @blkg are done | 319 | stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) { |
256 | * under queue_lock. If it's not pointing to @blkg now, it never | 320 | spin_unlock_irqrestore(&blkg->stats_lock, flags); |
257 | * will. Hint assignment itself can race safely. | 321 | return; |
258 | */ | 322 | } |
259 | if (rcu_dereference_raw(blkcg->blkg_hint) == blkg) | ||
260 | rcu_assign_pointer(blkcg->blkg_hint, NULL); | ||
261 | 323 | ||
262 | /* | 324 | /* |
263 | * Put the reference taken at the time of creation so that when all | 325 | * group is already marked empty. This can happen if cfqq got new |
264 | * queues are gone, group can be destroyed. | 326 | * request in parent group and moved to this group while being added |
327 | * to service tree. Just ignore the event and move on. | ||
265 | */ | 328 | */ |
266 | blkg_put(blkg); | 329 | if(blkio_blkg_empty(stats)) { |
330 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
331 | return; | ||
332 | } | ||
333 | |||
334 | stats->start_empty_time = sched_clock(); | ||
335 | blkio_mark_blkg_empty(stats); | ||
336 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
267 | } | 337 | } |
338 | EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time); | ||
268 | 339 | ||
269 | /** | 340 | void blkiocg_update_dequeue_stats(struct blkio_group *blkg, |
270 | * blkg_destroy_all - destroy all blkgs associated with a request_queue | 341 | unsigned long dequeue) |
271 | * @q: request_queue of interest | 342 | { |
272 | * | 343 | blkg->stats.dequeue += dequeue; |
273 | * Destroy all blkgs associated with @q. | 344 | } |
274 | */ | 345 | EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats); |
275 | static void blkg_destroy_all(struct request_queue *q) | 346 | #else |
347 | static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg, | ||
348 | struct blkio_group *curr_blkg) {} | ||
349 | static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {} | ||
350 | #endif | ||
351 | |||
352 | void blkiocg_update_io_add_stats(struct blkio_group *blkg, | ||
353 | struct blkio_group *curr_blkg, bool direction, | ||
354 | bool sync) | ||
276 | { | 355 | { |
277 | struct blkcg_gq *blkg, *n; | 356 | unsigned long flags; |
357 | |||
358 | spin_lock_irqsave(&blkg->stats_lock, flags); | ||
359 | blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction, | ||
360 | sync); | ||
361 | blkio_end_empty_time(&blkg->stats); | ||
362 | blkio_set_start_group_wait_time(blkg, curr_blkg); | ||
363 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
364 | } | ||
365 | EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats); | ||
278 | 366 | ||
279 | lockdep_assert_held(q->queue_lock); | 367 | void blkiocg_update_io_remove_stats(struct blkio_group *blkg, |
368 | bool direction, bool sync) | ||
369 | { | ||
370 | unsigned long flags; | ||
280 | 371 | ||
281 | list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { | 372 | spin_lock_irqsave(&blkg->stats_lock, flags); |
282 | struct blkcg *blkcg = blkg->blkcg; | 373 | blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], |
374 | direction, sync); | ||
375 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
376 | } | ||
377 | EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats); | ||
283 | 378 | ||
284 | spin_lock(&blkcg->lock); | 379 | void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time, |
285 | blkg_destroy(blkg); | 380 | unsigned long unaccounted_time) |
286 | spin_unlock(&blkcg->lock); | 381 | { |
287 | } | 382 | unsigned long flags; |
383 | |||
384 | spin_lock_irqsave(&blkg->stats_lock, flags); | ||
385 | blkg->stats.time += time; | ||
386 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
387 | blkg->stats.unaccounted_time += unaccounted_time; | ||
388 | #endif | ||
389 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
390 | } | ||
391 | EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); | ||
392 | |||
393 | /* | ||
394 | * should be called under rcu read lock or queue lock to make sure blkg pointer | ||
395 | * is valid. | ||
396 | */ | ||
397 | void blkiocg_update_dispatch_stats(struct blkio_group *blkg, | ||
398 | uint64_t bytes, bool direction, bool sync) | ||
399 | { | ||
400 | struct blkio_group_stats_cpu *stats_cpu; | ||
401 | unsigned long flags; | ||
288 | 402 | ||
289 | /* | 403 | /* |
290 | * root blkg is destroyed. Just clear the pointer since | 404 | * Disabling interrupts to provide mutual exclusion between two |
291 | * root_rl does not take reference on root blkg. | 405 | * writes on same cpu. It probably is not needed for 64bit. Not |
406 | * optimizing that case yet. | ||
292 | */ | 407 | */ |
293 | q->root_blkg = NULL; | 408 | local_irq_save(flags); |
294 | q->root_rl.blkg = NULL; | 409 | |
410 | stats_cpu = this_cpu_ptr(blkg->stats_cpu); | ||
411 | |||
412 | u64_stats_update_begin(&stats_cpu->syncp); | ||
413 | stats_cpu->sectors += bytes >> 9; | ||
414 | blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED], | ||
415 | 1, direction, sync); | ||
416 | blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES], | ||
417 | bytes, direction, sync); | ||
418 | u64_stats_update_end(&stats_cpu->syncp); | ||
419 | local_irq_restore(flags); | ||
295 | } | 420 | } |
421 | EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats); | ||
296 | 422 | ||
297 | static void blkg_rcu_free(struct rcu_head *rcu_head) | 423 | void blkiocg_update_completion_stats(struct blkio_group *blkg, |
424 | uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) | ||
298 | { | 425 | { |
299 | blkg_free(container_of(rcu_head, struct blkcg_gq, rcu_head)); | 426 | struct blkio_group_stats *stats; |
427 | unsigned long flags; | ||
428 | unsigned long long now = sched_clock(); | ||
429 | |||
430 | spin_lock_irqsave(&blkg->stats_lock, flags); | ||
431 | stats = &blkg->stats; | ||
432 | if (time_after64(now, io_start_time)) | ||
433 | blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME], | ||
434 | now - io_start_time, direction, sync); | ||
435 | if (time_after64(io_start_time, start_time)) | ||
436 | blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME], | ||
437 | io_start_time - start_time, direction, sync); | ||
438 | spin_unlock_irqrestore(&blkg->stats_lock, flags); | ||
300 | } | 439 | } |
440 | EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats); | ||
301 | 441 | ||
302 | void __blkg_release(struct blkcg_gq *blkg) | 442 | /* Merged stats are per cpu. */ |
443 | void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, | ||
444 | bool sync) | ||
303 | { | 445 | { |
304 | /* release the extra blkcg reference this blkg has been holding */ | 446 | struct blkio_group_stats_cpu *stats_cpu; |
305 | css_put(&blkg->blkcg->css); | 447 | unsigned long flags; |
306 | 448 | ||
307 | /* | 449 | /* |
308 | * A group is freed in rcu manner. But having an rcu lock does not | 450 | * Disabling interrupts to provide mutual exclusion between two |
309 | * mean that one can access all the fields of blkg and assume these | 451 | * writes on same cpu. It probably is not needed for 64bit. Not |
310 | * are valid. For example, don't try to follow throtl_data and | 452 | * optimizing that case yet. |
311 | * request queue links. | ||
312 | * | ||
313 | * Having a reference to blkg under an rcu allows acess to only | ||
314 | * values local to groups like group stats and group rate limits | ||
315 | */ | 453 | */ |
316 | call_rcu(&blkg->rcu_head, blkg_rcu_free); | 454 | local_irq_save(flags); |
455 | |||
456 | stats_cpu = this_cpu_ptr(blkg->stats_cpu); | ||
457 | |||
458 | u64_stats_update_begin(&stats_cpu->syncp); | ||
459 | blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1, | ||
460 | direction, sync); | ||
461 | u64_stats_update_end(&stats_cpu->syncp); | ||
462 | local_irq_restore(flags); | ||
317 | } | 463 | } |
318 | EXPORT_SYMBOL_GPL(__blkg_release); | 464 | EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); |
319 | 465 | ||
320 | /* | 466 | /* |
321 | * The next function used by blk_queue_for_each_rl(). It's a bit tricky | 467 | * This function allocates the per cpu stats for blkio_group. Should be called |
322 | * because the root blkg uses @q->root_rl instead of its own rl. | 468 | * from sleepable context as alloc_per_cpu() requires that. |
323 | */ | 469 | */ |
324 | struct request_list *__blk_queue_next_rl(struct request_list *rl, | 470 | int blkio_alloc_blkg_stats(struct blkio_group *blkg) |
325 | struct request_queue *q) | ||
326 | { | 471 | { |
327 | struct list_head *ent; | 472 | /* Allocate memory for per cpu stats */ |
328 | struct blkcg_gq *blkg; | 473 | blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu); |
474 | if (!blkg->stats_cpu) | ||
475 | return -ENOMEM; | ||
476 | return 0; | ||
477 | } | ||
478 | EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats); | ||
329 | 479 | ||
330 | /* | 480 | void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, |
331 | * Determine the current blkg list_head. The first entry is | 481 | struct blkio_group *blkg, void *key, dev_t dev, |
332 | * root_rl which is off @q->blkg_list and mapped to the head. | 482 | enum blkio_policy_id plid) |
333 | */ | 483 | { |
334 | if (rl == &q->root_rl) { | 484 | unsigned long flags; |
335 | ent = &q->blkg_list; | 485 | |
336 | /* There are no more block groups, hence no request lists */ | 486 | spin_lock_irqsave(&blkcg->lock, flags); |
337 | if (list_empty(ent)) | 487 | spin_lock_init(&blkg->stats_lock); |
338 | return NULL; | 488 | rcu_assign_pointer(blkg->key, key); |
339 | } else { | 489 | blkg->blkcg_id = css_id(&blkcg->css); |
340 | blkg = container_of(rl, struct blkcg_gq, rl); | 490 | hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); |
341 | ent = &blkg->q_node; | 491 | blkg->plid = plid; |
342 | } | 492 | spin_unlock_irqrestore(&blkcg->lock, flags); |
493 | /* Need to take css reference ? */ | ||
494 | cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); | ||
495 | blkg->dev = dev; | ||
496 | } | ||
497 | EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group); | ||
498 | |||
499 | static void __blkiocg_del_blkio_group(struct blkio_group *blkg) | ||
500 | { | ||
501 | hlist_del_init_rcu(&blkg->blkcg_node); | ||
502 | blkg->blkcg_id = 0; | ||
503 | } | ||
343 | 504 | ||
344 | /* walk to the next list_head, skip root blkcg */ | 505 | /* |
345 | ent = ent->next; | 506 | * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1 |
346 | if (ent == &q->root_blkg->q_node) | 507 | * indicating that blk_group was unhashed by the time we got to it. |
347 | ent = ent->next; | 508 | */ |
348 | if (ent == &q->blkg_list) | 509 | int blkiocg_del_blkio_group(struct blkio_group *blkg) |
349 | return NULL; | 510 | { |
511 | struct blkio_cgroup *blkcg; | ||
512 | unsigned long flags; | ||
513 | struct cgroup_subsys_state *css; | ||
514 | int ret = 1; | ||
515 | |||
516 | rcu_read_lock(); | ||
517 | css = css_lookup(&blkio_subsys, blkg->blkcg_id); | ||
518 | if (css) { | ||
519 | blkcg = container_of(css, struct blkio_cgroup, css); | ||
520 | spin_lock_irqsave(&blkcg->lock, flags); | ||
521 | if (!hlist_unhashed(&blkg->blkcg_node)) { | ||
522 | __blkiocg_del_blkio_group(blkg); | ||
523 | ret = 0; | ||
524 | } | ||
525 | spin_unlock_irqrestore(&blkcg->lock, flags); | ||
526 | } | ||
350 | 527 | ||
351 | blkg = container_of(ent, struct blkcg_gq, q_node); | 528 | rcu_read_unlock(); |
352 | return &blkg->rl; | 529 | return ret; |
353 | } | 530 | } |
531 | EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group); | ||
354 | 532 | ||
355 | static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, | 533 | /* called under rcu_read_lock(). */ |
356 | u64 val) | 534 | struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) |
357 | { | 535 | { |
358 | struct blkcg *blkcg = cgroup_to_blkcg(cgroup); | 536 | struct blkio_group *blkg; |
359 | struct blkcg_gq *blkg; | ||
360 | struct hlist_node *n; | 537 | struct hlist_node *n; |
361 | int i; | 538 | void *__key; |
362 | 539 | ||
363 | mutex_lock(&blkcg_pol_mutex); | 540 | hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { |
364 | spin_lock_irq(&blkcg->lock); | 541 | __key = blkg->key; |
542 | if (__key == key) | ||
543 | return blkg; | ||
544 | } | ||
545 | |||
546 | return NULL; | ||
547 | } | ||
548 | EXPORT_SYMBOL_GPL(blkiocg_lookup_group); | ||
365 | 549 | ||
550 | static void blkio_reset_stats_cpu(struct blkio_group *blkg) | ||
551 | { | ||
552 | struct blkio_group_stats_cpu *stats_cpu; | ||
553 | int i, j, k; | ||
366 | /* | 554 | /* |
367 | * Note that stat reset is racy - it doesn't synchronize against | 555 | * Note: On 64 bit arch this should not be an issue. This has the |
368 | * stat updates. This is a debug feature which shouldn't exist | 556 | * possibility of returning some inconsistent value on 32bit arch |
369 | * anyway. If you get hit by a race, retry. | 557 | * as 64bit update on 32bit is non atomic. Taking care of this |
558 | * corner case makes code very complicated, like sending IPIs to | ||
559 | * cpus, taking care of stats of offline cpus etc. | ||
560 | * | ||
561 | * reset stats is anyway more of a debug feature and this sounds a | ||
562 | * corner case. So I am not complicating the code yet until and | ||
563 | * unless this becomes a real issue. | ||
370 | */ | 564 | */ |
371 | hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { | 565 | for_each_possible_cpu(i) { |
372 | for (i = 0; i < BLKCG_MAX_POLS; i++) { | 566 | stats_cpu = per_cpu_ptr(blkg->stats_cpu, i); |
373 | struct blkcg_policy *pol = blkcg_policy[i]; | 567 | stats_cpu->sectors = 0; |
568 | for(j = 0; j < BLKIO_STAT_CPU_NR; j++) | ||
569 | for (k = 0; k < BLKIO_STAT_TOTAL; k++) | ||
570 | stats_cpu->stat_arr_cpu[j][k] = 0; | ||
571 | } | ||
572 | } | ||
374 | 573 | ||
375 | if (blkcg_policy_enabled(blkg->q, pol) && | 574 | static int |
376 | pol->pd_reset_stats_fn) | 575 | blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) |
377 | pol->pd_reset_stats_fn(blkg); | 576 | { |
577 | struct blkio_cgroup *blkcg; | ||
578 | struct blkio_group *blkg; | ||
579 | struct blkio_group_stats *stats; | ||
580 | struct hlist_node *n; | ||
581 | uint64_t queued[BLKIO_STAT_TOTAL]; | ||
582 | int i; | ||
583 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
584 | bool idling, waiting, empty; | ||
585 | unsigned long long now = sched_clock(); | ||
586 | #endif | ||
587 | |||
588 | blkcg = cgroup_to_blkio_cgroup(cgroup); | ||
589 | spin_lock_irq(&blkcg->lock); | ||
590 | hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { | ||
591 | spin_lock(&blkg->stats_lock); | ||
592 | stats = &blkg->stats; | ||
593 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
594 | idling = blkio_blkg_idling(stats); | ||
595 | waiting = blkio_blkg_waiting(stats); | ||
596 | empty = blkio_blkg_empty(stats); | ||
597 | #endif | ||
598 | for (i = 0; i < BLKIO_STAT_TOTAL; i++) | ||
599 | queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i]; | ||
600 | memset(stats, 0, sizeof(struct blkio_group_stats)); | ||
601 | for (i = 0; i < BLKIO_STAT_TOTAL; i++) | ||
602 | stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i]; | ||
603 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
604 | if (idling) { | ||
605 | blkio_mark_blkg_idling(stats); | ||
606 | stats->start_idle_time = now; | ||
607 | } | ||
608 | if (waiting) { | ||
609 | blkio_mark_blkg_waiting(stats); | ||
610 | stats->start_group_wait_time = now; | ||
378 | } | 611 | } |
612 | if (empty) { | ||
613 | blkio_mark_blkg_empty(stats); | ||
614 | stats->start_empty_time = now; | ||
615 | } | ||
616 | #endif | ||
617 | spin_unlock(&blkg->stats_lock); | ||
618 | |||
619 | /* Reset Per cpu stats which don't take blkg->stats_lock */ | ||
620 | blkio_reset_stats_cpu(blkg); | ||
379 | } | 621 | } |
380 | 622 | ||
381 | spin_unlock_irq(&blkcg->lock); | 623 | spin_unlock_irq(&blkcg->lock); |
382 | mutex_unlock(&blkcg_pol_mutex); | ||
383 | return 0; | 624 | return 0; |
384 | } | 625 | } |
385 | 626 | ||
386 | static const char *blkg_dev_name(struct blkcg_gq *blkg) | 627 | static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str, |
628 | int chars_left, bool diskname_only) | ||
387 | { | 629 | { |
388 | /* some drivers (floppy) instantiate a queue w/o disk registered */ | 630 | snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev)); |
389 | if (blkg->q->backing_dev_info.dev) | 631 | chars_left -= strlen(str); |
390 | return dev_name(blkg->q->backing_dev_info.dev); | 632 | if (chars_left <= 0) { |
391 | return NULL; | 633 | printk(KERN_WARNING |
634 | "Possibly incorrect cgroup stat display format"); | ||
635 | return; | ||
636 | } | ||
637 | if (diskname_only) | ||
638 | return; | ||
639 | switch (type) { | ||
640 | case BLKIO_STAT_READ: | ||
641 | strlcat(str, " Read", chars_left); | ||
642 | break; | ||
643 | case BLKIO_STAT_WRITE: | ||
644 | strlcat(str, " Write", chars_left); | ||
645 | break; | ||
646 | case BLKIO_STAT_SYNC: | ||
647 | strlcat(str, " Sync", chars_left); | ||
648 | break; | ||
649 | case BLKIO_STAT_ASYNC: | ||
650 | strlcat(str, " Async", chars_left); | ||
651 | break; | ||
652 | case BLKIO_STAT_TOTAL: | ||
653 | strlcat(str, " Total", chars_left); | ||
654 | break; | ||
655 | default: | ||
656 | strlcat(str, " Invalid", chars_left); | ||
657 | } | ||
392 | } | 658 | } |
393 | 659 | ||
394 | /** | 660 | static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val, |
395 | * blkcg_print_blkgs - helper for printing per-blkg data | 661 | struct cgroup_map_cb *cb, dev_t dev) |
396 | * @sf: seq_file to print to | ||
397 | * @blkcg: blkcg of interest | ||
398 | * @prfill: fill function to print out a blkg | ||
399 | * @pol: policy in question | ||
400 | * @data: data to be passed to @prfill | ||
401 | * @show_total: to print out sum of prfill return values or not | ||
402 | * | ||
403 | * This function invokes @prfill on each blkg of @blkcg if pd for the | ||
404 | * policy specified by @pol exists. @prfill is invoked with @sf, the | ||
405 | * policy data and @data. If @show_total is %true, the sum of the return | ||
406 | * values from @prfill is printed with "Total" label at the end. | ||
407 | * | ||
408 | * This is to be used to construct print functions for | ||
409 | * cftype->read_seq_string method. | ||
410 | */ | ||
411 | void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, | ||
412 | u64 (*prfill)(struct seq_file *, | ||
413 | struct blkg_policy_data *, int), | ||
414 | const struct blkcg_policy *pol, int data, | ||
415 | bool show_total) | ||
416 | { | 662 | { |
417 | struct blkcg_gq *blkg; | 663 | blkio_get_key_name(0, dev, str, chars_left, true); |
418 | struct hlist_node *n; | 664 | cb->fill(cb, str, val); |
419 | u64 total = 0; | 665 | return val; |
666 | } | ||
420 | 667 | ||
421 | spin_lock_irq(&blkcg->lock); | ||
422 | hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) | ||
423 | if (blkcg_policy_enabled(blkg->q, pol)) | ||
424 | total += prfill(sf, blkg->pd[pol->plid], data); | ||
425 | spin_unlock_irq(&blkcg->lock); | ||
426 | 668 | ||
427 | if (show_total) | 669 | static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg, |
428 | seq_printf(sf, "Total %llu\n", (unsigned long long)total); | 670 | enum stat_type_cpu type, enum stat_sub_type sub_type) |
671 | { | ||
672 | int cpu; | ||
673 | struct blkio_group_stats_cpu *stats_cpu; | ||
674 | u64 val = 0, tval; | ||
675 | |||
676 | for_each_possible_cpu(cpu) { | ||
677 | unsigned int start; | ||
678 | stats_cpu = per_cpu_ptr(blkg->stats_cpu, cpu); | ||
679 | |||
680 | do { | ||
681 | start = u64_stats_fetch_begin(&stats_cpu->syncp); | ||
682 | if (type == BLKIO_STAT_CPU_SECTORS) | ||
683 | tval = stats_cpu->sectors; | ||
684 | else | ||
685 | tval = stats_cpu->stat_arr_cpu[type][sub_type]; | ||
686 | } while(u64_stats_fetch_retry(&stats_cpu->syncp, start)); | ||
687 | |||
688 | val += tval; | ||
689 | } | ||
690 | |||
691 | return val; | ||
429 | } | 692 | } |
430 | EXPORT_SYMBOL_GPL(blkcg_print_blkgs); | ||
431 | 693 | ||
432 | /** | 694 | static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg, |
433 | * __blkg_prfill_u64 - prfill helper for a single u64 value | 695 | struct cgroup_map_cb *cb, dev_t dev, enum stat_type_cpu type) |
434 | * @sf: seq_file to print to | ||
435 | * @pd: policy private data of interest | ||
436 | * @v: value to print | ||
437 | * | ||
438 | * Print @v to @sf for the device assocaited with @pd. | ||
439 | */ | ||
440 | u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v) | ||
441 | { | 696 | { |
442 | const char *dname = blkg_dev_name(pd->blkg); | 697 | uint64_t disk_total, val; |
698 | char key_str[MAX_KEY_LEN]; | ||
699 | enum stat_sub_type sub_type; | ||
443 | 700 | ||
444 | if (!dname) | 701 | if (type == BLKIO_STAT_CPU_SECTORS) { |
445 | return 0; | 702 | val = blkio_read_stat_cpu(blkg, type, 0); |
703 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb, dev); | ||
704 | } | ||
446 | 705 | ||
447 | seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v); | 706 | for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL; |
448 | return v; | 707 | sub_type++) { |
708 | blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false); | ||
709 | val = blkio_read_stat_cpu(blkg, type, sub_type); | ||
710 | cb->fill(cb, key_str, val); | ||
711 | } | ||
712 | |||
713 | disk_total = blkio_read_stat_cpu(blkg, type, BLKIO_STAT_READ) + | ||
714 | blkio_read_stat_cpu(blkg, type, BLKIO_STAT_WRITE); | ||
715 | |||
716 | blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false); | ||
717 | cb->fill(cb, key_str, disk_total); | ||
718 | return disk_total; | ||
449 | } | 719 | } |
450 | EXPORT_SYMBOL_GPL(__blkg_prfill_u64); | ||
451 | 720 | ||
452 | /** | 721 | /* This should be called with blkg->stats_lock held */ |
453 | * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat | 722 | static uint64_t blkio_get_stat(struct blkio_group *blkg, |
454 | * @sf: seq_file to print to | 723 | struct cgroup_map_cb *cb, dev_t dev, enum stat_type type) |
455 | * @pd: policy private data of interest | 724 | { |
456 | * @rwstat: rwstat to print | 725 | uint64_t disk_total; |
457 | * | 726 | char key_str[MAX_KEY_LEN]; |
458 | * Print @rwstat to @sf for the device assocaited with @pd. | 727 | enum stat_sub_type sub_type; |
459 | */ | 728 | |
460 | u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, | 729 | if (type == BLKIO_STAT_TIME) |
461 | const struct blkg_rwstat *rwstat) | 730 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, |
462 | { | 731 | blkg->stats.time, cb, dev); |
463 | static const char *rwstr[] = { | 732 | #ifdef CONFIG_DEBUG_BLK_CGROUP |
464 | [BLKG_RWSTAT_READ] = "Read", | 733 | if (type == BLKIO_STAT_UNACCOUNTED_TIME) |
465 | [BLKG_RWSTAT_WRITE] = "Write", | 734 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, |
466 | [BLKG_RWSTAT_SYNC] = "Sync", | 735 | blkg->stats.unaccounted_time, cb, dev); |
467 | [BLKG_RWSTAT_ASYNC] = "Async", | 736 | if (type == BLKIO_STAT_AVG_QUEUE_SIZE) { |
468 | }; | 737 | uint64_t sum = blkg->stats.avg_queue_size_sum; |
469 | const char *dname = blkg_dev_name(pd->blkg); | 738 | uint64_t samples = blkg->stats.avg_queue_size_samples; |
470 | u64 v; | 739 | if (samples) |
471 | int i; | 740 | do_div(sum, samples); |
741 | else | ||
742 | sum = 0; | ||
743 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev); | ||
744 | } | ||
745 | if (type == BLKIO_STAT_GROUP_WAIT_TIME) | ||
746 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, | ||
747 | blkg->stats.group_wait_time, cb, dev); | ||
748 | if (type == BLKIO_STAT_IDLE_TIME) | ||
749 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, | ||
750 | blkg->stats.idle_time, cb, dev); | ||
751 | if (type == BLKIO_STAT_EMPTY_TIME) | ||
752 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, | ||
753 | blkg->stats.empty_time, cb, dev); | ||
754 | if (type == BLKIO_STAT_DEQUEUE) | ||
755 | return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, | ||
756 | blkg->stats.dequeue, cb, dev); | ||
757 | #endif | ||
758 | |||
759 | for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL; | ||
760 | sub_type++) { | ||
761 | blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false); | ||
762 | cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]); | ||
763 | } | ||
764 | disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] + | ||
765 | blkg->stats.stat_arr[type][BLKIO_STAT_WRITE]; | ||
766 | blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false); | ||
767 | cb->fill(cb, key_str, disk_total); | ||
768 | return disk_total; | ||
769 | } | ||
472 | 770 | ||
473 | if (!dname) | 771 | static int blkio_check_dev_num(dev_t dev) |
474 | return 0; | 772 | { |
773 | int part = 0; | ||
774 | struct gendisk *disk; | ||
475 | 775 | ||
476 | for (i = 0; i < BLKG_RWSTAT_NR; i++) | 776 | disk = get_gendisk(dev, &part); |
477 | seq_printf(sf, "%s %s %llu\n", dname, rwstr[i], | 777 | if (!disk || part) |
478 | (unsigned long long)rwstat->cnt[i]); | 778 | return -ENODEV; |
479 | 779 | ||
480 | v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE]; | 780 | return 0; |
481 | seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v); | ||
482 | return v; | ||
483 | } | 781 | } |
484 | 782 | ||
485 | /** | 783 | static int blkio_policy_parse_and_set(char *buf, |
486 | * blkg_prfill_stat - prfill callback for blkg_stat | 784 | struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid) |
487 | * @sf: seq_file to print to | ||
488 | * @pd: policy private data of interest | ||
489 | * @off: offset to the blkg_stat in @pd | ||
490 | * | ||
491 | * prfill callback for printing a blkg_stat. | ||
492 | */ | ||
493 | u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off) | ||
494 | { | 785 | { |
495 | return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off)); | 786 | char *s[4], *p, *major_s = NULL, *minor_s = NULL; |
787 | int ret; | ||
788 | unsigned long major, minor; | ||
789 | int i = 0; | ||
790 | dev_t dev; | ||
791 | u64 temp; | ||
792 | |||
793 | memset(s, 0, sizeof(s)); | ||
794 | |||
795 | while ((p = strsep(&buf, " ")) != NULL) { | ||
796 | if (!*p) | ||
797 | continue; | ||
798 | |||
799 | s[i++] = p; | ||
800 | |||
801 | /* Prevent from inputing too many things */ | ||
802 | if (i == 3) | ||
803 | break; | ||
804 | } | ||
805 | |||
806 | if (i != 2) | ||
807 | return -EINVAL; | ||
808 | |||
809 | p = strsep(&s[0], ":"); | ||
810 | if (p != NULL) | ||
811 | major_s = p; | ||
812 | else | ||
813 | return -EINVAL; | ||
814 | |||
815 | minor_s = s[0]; | ||
816 | if (!minor_s) | ||
817 | return -EINVAL; | ||
818 | |||
819 | ret = strict_strtoul(major_s, 10, &major); | ||
820 | if (ret) | ||
821 | return -EINVAL; | ||
822 | |||
823 | ret = strict_strtoul(minor_s, 10, &minor); | ||
824 | if (ret) | ||
825 | return -EINVAL; | ||
826 | |||
827 | dev = MKDEV(major, minor); | ||
828 | |||
829 | ret = strict_strtoull(s[1], 10, &temp); | ||
830 | if (ret) | ||
831 | return -EINVAL; | ||
832 | |||
833 | /* For rule removal, do not check for device presence. */ | ||
834 | if (temp) { | ||
835 | ret = blkio_check_dev_num(dev); | ||
836 | if (ret) | ||
837 | return ret; | ||
838 | } | ||
839 | |||
840 | newpn->dev = dev; | ||
841 | |||
842 | switch (plid) { | ||
843 | case BLKIO_POLICY_PROP: | ||
844 | if ((temp < BLKIO_WEIGHT_MIN && temp > 0) || | ||
845 | temp > BLKIO_WEIGHT_MAX) | ||
846 | return -EINVAL; | ||
847 | |||
848 | newpn->plid = plid; | ||
849 | newpn->fileid = fileid; | ||
850 | newpn->val.weight = temp; | ||
851 | break; | ||
852 | case BLKIO_POLICY_THROTL: | ||
853 | switch(fileid) { | ||
854 | case BLKIO_THROTL_read_bps_device: | ||
855 | case BLKIO_THROTL_write_bps_device: | ||
856 | newpn->plid = plid; | ||
857 | newpn->fileid = fileid; | ||
858 | newpn->val.bps = temp; | ||
859 | break; | ||
860 | case BLKIO_THROTL_read_iops_device: | ||
861 | case BLKIO_THROTL_write_iops_device: | ||
862 | if (temp > THROTL_IOPS_MAX) | ||
863 | return -EINVAL; | ||
864 | |||
865 | newpn->plid = plid; | ||
866 | newpn->fileid = fileid; | ||
867 | newpn->val.iops = (unsigned int)temp; | ||
868 | break; | ||
869 | } | ||
870 | break; | ||
871 | default: | ||
872 | BUG(); | ||
873 | } | ||
874 | |||
875 | return 0; | ||
496 | } | 876 | } |
497 | EXPORT_SYMBOL_GPL(blkg_prfill_stat); | ||
498 | 877 | ||
499 | /** | 878 | unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, |
500 | * blkg_prfill_rwstat - prfill callback for blkg_rwstat | 879 | dev_t dev) |
501 | * @sf: seq_file to print to | ||
502 | * @pd: policy private data of interest | ||
503 | * @off: offset to the blkg_rwstat in @pd | ||
504 | * | ||
505 | * prfill callback for printing a blkg_rwstat. | ||
506 | */ | ||
507 | u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, | ||
508 | int off) | ||
509 | { | 880 | { |
510 | struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off); | 881 | struct blkio_policy_node *pn; |
511 | 882 | ||
512 | return __blkg_prfill_rwstat(sf, pd, &rwstat); | 883 | pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP, |
884 | BLKIO_PROP_weight_device); | ||
885 | if (pn) | ||
886 | return pn->val.weight; | ||
887 | else | ||
888 | return blkcg->weight; | ||
513 | } | 889 | } |
514 | EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); | 890 | EXPORT_SYMBOL_GPL(blkcg_get_weight); |
515 | 891 | ||
516 | /** | 892 | uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev) |
517 | * blkg_conf_prep - parse and prepare for per-blkg config update | ||
518 | * @blkcg: target block cgroup | ||
519 | * @pol: target policy | ||
520 | * @input: input string | ||
521 | * @ctx: blkg_conf_ctx to be filled | ||
522 | * | ||
523 | * Parse per-blkg config update from @input and initialize @ctx with the | ||
524 | * result. @ctx->blkg points to the blkg to be updated and @ctx->v the new | ||
525 | * value. This function returns with RCU read lock and queue lock held and | ||
526 | * must be paired with blkg_conf_finish(). | ||
527 | */ | ||
528 | int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, | ||
529 | const char *input, struct blkg_conf_ctx *ctx) | ||
530 | __acquires(rcu) __acquires(disk->queue->queue_lock) | ||
531 | { | 893 | { |
532 | struct gendisk *disk; | 894 | struct blkio_policy_node *pn; |
533 | struct blkcg_gq *blkg; | ||
534 | unsigned int major, minor; | ||
535 | unsigned long long v; | ||
536 | int part, ret; | ||
537 | 895 | ||
538 | if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3) | 896 | pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, |
539 | return -EINVAL; | 897 | BLKIO_THROTL_read_bps_device); |
898 | if (pn) | ||
899 | return pn->val.bps; | ||
900 | else | ||
901 | return -1; | ||
902 | } | ||
540 | 903 | ||
541 | disk = get_gendisk(MKDEV(major, minor), &part); | 904 | uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev) |
542 | if (!disk || part) | 905 | { |
543 | return -EINVAL; | 906 | struct blkio_policy_node *pn; |
907 | pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, | ||
908 | BLKIO_THROTL_write_bps_device); | ||
909 | if (pn) | ||
910 | return pn->val.bps; | ||
911 | else | ||
912 | return -1; | ||
913 | } | ||
544 | 914 | ||
545 | rcu_read_lock(); | 915 | unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev) |
546 | spin_lock_irq(disk->queue->queue_lock); | 916 | { |
917 | struct blkio_policy_node *pn; | ||
547 | 918 | ||
548 | if (blkcg_policy_enabled(disk->queue, pol)) | 919 | pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, |
549 | blkg = blkg_lookup_create(blkcg, disk->queue); | 920 | BLKIO_THROTL_read_iops_device); |
921 | if (pn) | ||
922 | return pn->val.iops; | ||
550 | else | 923 | else |
551 | blkg = ERR_PTR(-EINVAL); | 924 | return -1; |
925 | } | ||
552 | 926 | ||
553 | if (IS_ERR(blkg)) { | 927 | unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev) |
554 | ret = PTR_ERR(blkg); | 928 | { |
555 | rcu_read_unlock(); | 929 | struct blkio_policy_node *pn; |
556 | spin_unlock_irq(disk->queue->queue_lock); | 930 | pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, |
557 | put_disk(disk); | 931 | BLKIO_THROTL_write_iops_device); |
558 | /* | 932 | if (pn) |
559 | * If queue was bypassing, we should retry. Do so after a | 933 | return pn->val.iops; |
560 | * short msleep(). It isn't strictly necessary but queue | 934 | else |
561 | * can be bypassing for some time and it's always nice to | 935 | return -1; |
562 | * avoid busy looping. | 936 | } |
563 | */ | 937 | |
564 | if (ret == -EBUSY) { | 938 | /* Checks whether user asked for deleting a policy rule */ |
565 | msleep(10); | 939 | static bool blkio_delete_rule_command(struct blkio_policy_node *pn) |
566 | ret = restart_syscall(); | 940 | { |
941 | switch(pn->plid) { | ||
942 | case BLKIO_POLICY_PROP: | ||
943 | if (pn->val.weight == 0) | ||
944 | return 1; | ||
945 | break; | ||
946 | case BLKIO_POLICY_THROTL: | ||
947 | switch(pn->fileid) { | ||
948 | case BLKIO_THROTL_read_bps_device: | ||
949 | case BLKIO_THROTL_write_bps_device: | ||
950 | if (pn->val.bps == 0) | ||
951 | return 1; | ||
952 | break; | ||
953 | case BLKIO_THROTL_read_iops_device: | ||
954 | case BLKIO_THROTL_write_iops_device: | ||
955 | if (pn->val.iops == 0) | ||
956 | return 1; | ||
567 | } | 957 | } |
568 | return ret; | 958 | break; |
959 | default: | ||
960 | BUG(); | ||
569 | } | 961 | } |
570 | 962 | ||
571 | ctx->disk = disk; | ||
572 | ctx->blkg = blkg; | ||
573 | ctx->v = v; | ||
574 | return 0; | 963 | return 0; |
575 | } | 964 | } |
576 | EXPORT_SYMBOL_GPL(blkg_conf_prep); | ||
577 | 965 | ||
578 | /** | 966 | static void blkio_update_policy_rule(struct blkio_policy_node *oldpn, |
579 | * blkg_conf_finish - finish up per-blkg config update | 967 | struct blkio_policy_node *newpn) |
580 | * @ctx: blkg_conf_ctx intiailized by blkg_conf_prep() | ||
581 | * | ||
582 | * Finish up after per-blkg config update. This function must be paired | ||
583 | * with blkg_conf_prep(). | ||
584 | */ | ||
585 | void blkg_conf_finish(struct blkg_conf_ctx *ctx) | ||
586 | __releases(ctx->disk->queue->queue_lock) __releases(rcu) | ||
587 | { | 968 | { |
588 | spin_unlock_irq(ctx->disk->queue->queue_lock); | 969 | switch(oldpn->plid) { |
589 | rcu_read_unlock(); | 970 | case BLKIO_POLICY_PROP: |
590 | put_disk(ctx->disk); | 971 | oldpn->val.weight = newpn->val.weight; |
972 | break; | ||
973 | case BLKIO_POLICY_THROTL: | ||
974 | switch(newpn->fileid) { | ||
975 | case BLKIO_THROTL_read_bps_device: | ||
976 | case BLKIO_THROTL_write_bps_device: | ||
977 | oldpn->val.bps = newpn->val.bps; | ||
978 | break; | ||
979 | case BLKIO_THROTL_read_iops_device: | ||
980 | case BLKIO_THROTL_write_iops_device: | ||
981 | oldpn->val.iops = newpn->val.iops; | ||
982 | } | ||
983 | break; | ||
984 | default: | ||
985 | BUG(); | ||
986 | } | ||
591 | } | 987 | } |
592 | EXPORT_SYMBOL_GPL(blkg_conf_finish); | ||
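For reference, the prep/finish pair above (left-hand column) is meant to bracket a per-device configuration write inside a policy. A minimal sketch of such a caller, assuming a hypothetical example_set_limit handler; only the blkg_conf_* calls and struct blkg_conf_ctx come from this patch:

static int example_set_limit(struct blkcg *blkcg, const struct blkcg_policy *pol,
                             const char *input)
{
        struct blkg_conf_ctx ctx;
        int ret;

        /* parses "MAJ:MIN <value>", takes RCU + queue_lock, creates the blkg */
        ret = blkg_conf_prep(blkcg, pol, input, &ctx);
        if (ret)
                return ret;

        /*
         * ctx.blkg is the target group and ctx.v the parsed value; a real
         * policy would update its per-blkg data here, while the locks taken
         * by blkg_conf_prep() are still held.
         */

        blkg_conf_finish(&ctx);         /* drops queue_lock, RCU and the disk ref */
        return 0;
}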
593 | 988 | ||
594 | struct cftype blkcg_files[] = { | 989 | /* |
595 | { | 990 | * Some rules/values in blkg have changed. Propagate those to respective |
596 | .name = "reset_stats", | 991 | * policies. |
597 | .write_u64 = blkcg_reset_stats, | 992 | */ |
598 | }, | 993 | static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg, |
599 | { } /* terminate */ | 994 | struct blkio_group *blkg, struct blkio_policy_node *pn) |
600 | }; | 995 | { |
996 | unsigned int weight, iops; | ||
997 | u64 bps; | ||
998 | |||
999 | switch(pn->plid) { | ||
1000 | case BLKIO_POLICY_PROP: | ||
1001 | weight = pn->val.weight ? pn->val.weight : | ||
1002 | blkcg->weight; | ||
1003 | blkio_update_group_weight(blkg, weight); | ||
1004 | break; | ||
1005 | case BLKIO_POLICY_THROTL: | ||
1006 | switch(pn->fileid) { | ||
1007 | case BLKIO_THROTL_read_bps_device: | ||
1008 | case BLKIO_THROTL_write_bps_device: | ||
1009 | bps = pn->val.bps ? pn->val.bps : (-1); | ||
1010 | blkio_update_group_bps(blkg, bps, pn->fileid); | ||
1011 | break; | ||
1012 | case BLKIO_THROTL_read_iops_device: | ||
1013 | case BLKIO_THROTL_write_iops_device: | ||
1014 | iops = pn->val.iops ? pn->val.iops : (-1); | ||
1015 | blkio_update_group_iops(blkg, iops, pn->fileid); | ||
1016 | break; | ||
1017 | } | ||
1018 | break; | ||
1019 | default: | ||
1020 | BUG(); | ||
1021 | } | ||
1022 | } | ||
601 | 1023 | ||
602 | /** | 1024 | /* |
603 | * blkcg_css_offline - cgroup css_offline callback | 1025 | * A policy node rule has been updated. Propagate this update to all the |
604 | * @cgroup: cgroup of interest | 1026 | * block groups which might be affected by this update. |
605 | * | ||
606 | * This function is called when @cgroup is about to go away and responsible | ||
607 | * This function is called when @cgroup is about to go away and is responsible | ||
608 | * removed while holding both q and blkcg locks. As blkcg lock is nested | ||
609 | * inside q lock, this function performs reverse double lock dancing. | ||
610 | * | ||
611 | * This is the blkcg counterpart of ioc_release_fn(). | ||
612 | */ | 1027 | */ |
613 | static void blkcg_css_offline(struct cgroup *cgroup) | 1028 | static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg, |
1029 | struct blkio_policy_node *pn) | ||
614 | { | 1030 | { |
615 | struct blkcg *blkcg = cgroup_to_blkcg(cgroup); | 1031 | struct blkio_group *blkg; |
1032 | struct hlist_node *n; | ||
616 | 1033 | ||
1034 | spin_lock(&blkio_list_lock); | ||
617 | spin_lock_irq(&blkcg->lock); | 1035 | spin_lock_irq(&blkcg->lock); |
618 | 1036 | ||
619 | while (!hlist_empty(&blkcg->blkg_list)) { | 1037 | hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { |
620 | struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first, | 1038 | if (pn->dev != blkg->dev || pn->plid != blkg->plid) |
621 | struct blkcg_gq, blkcg_node); | 1039 | continue; |
622 | struct request_queue *q = blkg->q; | 1040 | blkio_update_blkg_policy(blkcg, blkg, pn); |
623 | |||
624 | if (spin_trylock(q->queue_lock)) { | ||
625 | blkg_destroy(blkg); | ||
626 | spin_unlock(q->queue_lock); | ||
627 | } else { | ||
628 | spin_unlock_irq(&blkcg->lock); | ||
629 | cpu_relax(); | ||
630 | spin_lock_irq(&blkcg->lock); | ||
631 | } | ||
632 | } | 1041 | } |
633 | 1042 | ||
634 | spin_unlock_irq(&blkcg->lock); | 1043 | spin_unlock_irq(&blkcg->lock); |
1044 | spin_unlock(&blkio_list_lock); | ||
635 | } | 1045 | } |
636 | 1046 | ||
637 | static void blkcg_css_free(struct cgroup *cgroup) | 1047 | static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft, |
1048 | const char *buffer) | ||
638 | { | 1049 | { |
639 | struct blkcg *blkcg = cgroup_to_blkcg(cgroup); | 1050 | int ret = 0; |
1051 | char *buf; | ||
1052 | struct blkio_policy_node *newpn, *pn; | ||
1053 | struct blkio_cgroup *blkcg; | ||
1054 | int keep_newpn = 0; | ||
1055 | enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); | ||
1056 | int fileid = BLKIOFILE_ATTR(cft->private); | ||
1057 | |||
1058 | buf = kstrdup(buffer, GFP_KERNEL); | ||
1059 | if (!buf) | ||
1060 | return -ENOMEM; | ||
640 | 1061 | ||
641 | if (blkcg != &blkcg_root) | 1062 | newpn = kzalloc(sizeof(*newpn), GFP_KERNEL); |
642 | kfree(blkcg); | 1063 | if (!newpn) { |
643 | } | 1064 | ret = -ENOMEM; |
1065 | goto free_buf; | ||
1066 | } | ||
644 | 1067 | ||
645 | static struct cgroup_subsys_state *blkcg_css_alloc(struct cgroup *cgroup) | 1068 | ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid); |
646 | { | 1069 | if (ret) |
647 | static atomic64_t id_seq = ATOMIC64_INIT(0); | 1070 | goto free_newpn; |
648 | struct blkcg *blkcg; | ||
649 | struct cgroup *parent = cgroup->parent; | ||
650 | 1071 | ||
651 | if (!parent) { | 1072 | blkcg = cgroup_to_blkio_cgroup(cgrp); |
652 | blkcg = &blkcg_root; | 1073 | |
653 | goto done; | 1074 | spin_lock_irq(&blkcg->lock); |
1075 | |||
1076 | pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid); | ||
1077 | if (!pn) { | ||
1078 | if (!blkio_delete_rule_command(newpn)) { | ||
1079 | blkio_policy_insert_node(blkcg, newpn); | ||
1080 | keep_newpn = 1; | ||
1081 | } | ||
1082 | spin_unlock_irq(&blkcg->lock); | ||
1083 | goto update_io_group; | ||
654 | } | 1084 | } |
655 | 1085 | ||
656 | blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); | 1086 | if (blkio_delete_rule_command(newpn)) { |
657 | if (!blkcg) | 1087 | blkio_policy_delete_node(pn); |
658 | return ERR_PTR(-ENOMEM); | 1088 | spin_unlock_irq(&blkcg->lock); |
1089 | goto update_io_group; | ||
1090 | } | ||
1091 | spin_unlock_irq(&blkcg->lock); | ||
659 | 1092 | ||
660 | blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT; | 1093 | blkio_update_policy_rule(pn, newpn); |
661 | blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */ | ||
662 | done: | ||
663 | spin_lock_init(&blkcg->lock); | ||
664 | INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC); | ||
665 | INIT_HLIST_HEAD(&blkcg->blkg_list); | ||
666 | 1094 | ||
667 | return &blkcg->css; | 1095 | update_io_group: |
1096 | blkio_update_policy_node_blkg(blkcg, newpn); | ||
1097 | |||
1098 | free_newpn: | ||
1099 | if (!keep_newpn) | ||
1100 | kfree(newpn); | ||
1101 | free_buf: | ||
1102 | kfree(buf); | ||
1103 | return ret; | ||
668 | } | 1104 | } |
669 | 1105 | ||
670 | /** | 1106 | static void |
671 | * blkcg_init_queue - initialize blkcg part of request queue | 1107 | blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn) |
672 | * @q: request_queue to initialize | ||
673 | * | ||
674 | * Called from blk_alloc_queue_node(). Responsible for initializing blkcg | ||
675 | * part of new request_queue @q. | ||
676 | * | ||
677 | * RETURNS: | ||
678 | * 0 on success, -errno on failure. | ||
679 | */ | ||
680 | int blkcg_init_queue(struct request_queue *q) | ||
681 | { | 1108 | { |
682 | might_sleep(); | 1109 | switch(pn->plid) { |
683 | 1110 | case BLKIO_POLICY_PROP: | |
684 | return blk_throtl_init(q); | 1111 | if (pn->fileid == BLKIO_PROP_weight_device) |
1112 | seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), | ||
1113 | MINOR(pn->dev), pn->val.weight); | ||
1114 | break; | ||
1115 | case BLKIO_POLICY_THROTL: | ||
1116 | switch(pn->fileid) { | ||
1117 | case BLKIO_THROTL_read_bps_device: | ||
1118 | case BLKIO_THROTL_write_bps_device: | ||
1119 | seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev), | ||
1120 | MINOR(pn->dev), pn->val.bps); | ||
1121 | break; | ||
1122 | case BLKIO_THROTL_read_iops_device: | ||
1123 | case BLKIO_THROTL_write_iops_device: | ||
1124 | seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), | ||
1125 | MINOR(pn->dev), pn->val.iops); | ||
1126 | break; | ||
1127 | } | ||
1128 | break; | ||
1129 | default: | ||
1130 | BUG(); | ||
1131 | } | ||
685 | } | 1132 | } |
686 | 1133 | ||
687 | /** | 1134 | /* cgroup files which read their data from policy nodes end up here */ |
688 | * blkcg_drain_queue - drain blkcg part of request_queue | 1135 | static void blkio_read_policy_node_files(struct cftype *cft, |
689 | * @q: request_queue to drain | 1136 | struct blkio_cgroup *blkcg, struct seq_file *m) |
690 | * | ||
691 | * Called from blk_drain_queue(). Responsible for draining blkcg part. | ||
692 | */ | ||
693 | void blkcg_drain_queue(struct request_queue *q) | ||
694 | { | 1137 | { |
695 | lockdep_assert_held(q->queue_lock); | 1138 | struct blkio_policy_node *pn; |
696 | 1139 | ||
697 | blk_throtl_drain(q); | 1140 | if (!list_empty(&blkcg->policy_list)) { |
1141 | spin_lock_irq(&blkcg->lock); | ||
1142 | list_for_each_entry(pn, &blkcg->policy_list, node) { | ||
1143 | if (!pn_matches_cftype(cft, pn)) | ||
1144 | continue; | ||
1145 | blkio_print_policy_node(m, pn); | ||
1146 | } | ||
1147 | spin_unlock_irq(&blkcg->lock); | ||
1148 | } | ||
698 | } | 1149 | } |
699 | 1150 | ||
700 | /** | 1151 | static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft, |
701 | * blkcg_exit_queue - exit and release blkcg part of request_queue | 1152 | struct seq_file *m) |
702 | * @q: request_queue being released | ||
703 | * | ||
704 | * Called from blk_release_queue(). Responsible for exiting blkcg part. | ||
705 | */ | ||
706 | void blkcg_exit_queue(struct request_queue *q) | ||
707 | { | 1153 | { |
708 | spin_lock_irq(q->queue_lock); | 1154 | struct blkio_cgroup *blkcg; |
709 | blkg_destroy_all(q); | 1155 | enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); |
710 | spin_unlock_irq(q->queue_lock); | 1156 | int name = BLKIOFILE_ATTR(cft->private); |
1157 | |||
1158 | blkcg = cgroup_to_blkio_cgroup(cgrp); | ||
1159 | |||
1160 | switch(plid) { | ||
1161 | case BLKIO_POLICY_PROP: | ||
1162 | switch(name) { | ||
1163 | case BLKIO_PROP_weight_device: | ||
1164 | blkio_read_policy_node_files(cft, blkcg, m); | ||
1165 | return 0; | ||
1166 | default: | ||
1167 | BUG(); | ||
1168 | } | ||
1169 | break; | ||
1170 | case BLKIO_POLICY_THROTL: | ||
1171 | switch(name){ | ||
1172 | case BLKIO_THROTL_read_bps_device: | ||
1173 | case BLKIO_THROTL_write_bps_device: | ||
1174 | case BLKIO_THROTL_read_iops_device: | ||
1175 | case BLKIO_THROTL_write_iops_device: | ||
1176 | blkio_read_policy_node_files(cft, blkcg, m); | ||
1177 | return 0; | ||
1178 | default: | ||
1179 | BUG(); | ||
1180 | } | ||
1181 | break; | ||
1182 | default: | ||
1183 | BUG(); | ||
1184 | } | ||
711 | 1185 | ||
712 | blk_throtl_exit(q); | 1186 | return 0; |
713 | } | 1187 | } |
714 | 1188 | ||
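Taken together, the three hooks above (left-hand column) bracket a request_queue's lifetime; their doc comments name the real callers in blk-core.c. The condensed sketch below is illustrative only and collapses those call sites into one hypothetical function:

static int example_queue_blkcg_lifetime(struct request_queue *q)
{
        int ret;

        ret = blkcg_init_queue(q);              /* normally from blk_alloc_queue_node() */
        if (ret)
                return ret;

        /* ... the queue services I/O; when it is being drained: */
        spin_lock_irq(q->queue_lock);
        blkcg_drain_queue(q);                   /* flushes bios held by blk-throttle */
        spin_unlock_irq(q->queue_lock);

        /* ... and when the queue is finally released: */
        blkcg_exit_queue(q);                    /* destroys blkgs, exits blk-throttle */
        return 0;
}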
715 | /* | 1189 | static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg, |
716 | * We cannot support shared io contexts, as we have no means to support | 1190 | struct cftype *cft, struct cgroup_map_cb *cb, |
717 | * two tasks with the same ioc in two different groups without major rework | 1191 | enum stat_type type, bool show_total, bool pcpu) |
718 | * of the main cic data structures. For now we allow a task to change | ||
719 | * its cgroup only if it's the only owner of its ioc. | ||
720 | */ | ||
721 | static int blkcg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | ||
722 | { | 1192 | { |
723 | struct task_struct *task; | 1193 | struct blkio_group *blkg; |
724 | struct io_context *ioc; | 1194 | struct hlist_node *n; |
725 | int ret = 0; | 1195 | uint64_t cgroup_total = 0; |
726 | 1196 | ||
727 | /* task_lock() is needed to avoid races with exit_io_context() */ | 1197 | rcu_read_lock(); |
728 | cgroup_taskset_for_each(task, cgrp, tset) { | 1198 | hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { |
729 | task_lock(task); | 1199 | if (blkg->dev) { |
730 | ioc = task->io_context; | 1200 | if (!cftype_blkg_same_policy(cft, blkg)) |
731 | if (ioc && atomic_read(&ioc->nr_tasks) > 1) | 1201 | continue; |
732 | ret = -EINVAL; | 1202 | if (pcpu) |
733 | task_unlock(task); | 1203 | cgroup_total += blkio_get_stat_cpu(blkg, cb, |
734 | if (ret) | 1204 | blkg->dev, type); |
735 | break; | 1205 | else { |
1206 | spin_lock_irq(&blkg->stats_lock); | ||
1207 | cgroup_total += blkio_get_stat(blkg, cb, | ||
1208 | blkg->dev, type); | ||
1209 | spin_unlock_irq(&blkg->stats_lock); | ||
1210 | } | ||
1211 | } | ||
736 | } | 1212 | } |
737 | return ret; | 1213 | if (show_total) |
1214 | cb->fill(cb, "Total", cgroup_total); | ||
1215 | rcu_read_unlock(); | ||
1216 | return 0; | ||
738 | } | 1217 | } |
739 | 1218 | ||
740 | struct cgroup_subsys blkio_subsys = { | 1219 | /* All map-type cgroup files are serviced by this function */ |
741 | .name = "blkio", | 1220 | static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft, |
742 | .css_alloc = blkcg_css_alloc, | 1221 | struct cgroup_map_cb *cb) |
743 | .css_offline = blkcg_css_offline, | 1222 | { |
744 | .css_free = blkcg_css_free, | 1223 | struct blkio_cgroup *blkcg; |
745 | .can_attach = blkcg_can_attach, | 1224 | enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); |
746 | .subsys_id = blkio_subsys_id, | 1225 | int name = BLKIOFILE_ATTR(cft->private); |
747 | .base_cftypes = blkcg_files, | 1226 | |
748 | .module = THIS_MODULE, | 1227 | blkcg = cgroup_to_blkio_cgroup(cgrp); |
1228 | |||
1229 | switch(plid) { | ||
1230 | case BLKIO_POLICY_PROP: | ||
1231 | switch(name) { | ||
1232 | case BLKIO_PROP_time: | ||
1233 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1234 | BLKIO_STAT_TIME, 0, 0); | ||
1235 | case BLKIO_PROP_sectors: | ||
1236 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1237 | BLKIO_STAT_CPU_SECTORS, 0, 1); | ||
1238 | case BLKIO_PROP_io_service_bytes: | ||
1239 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1240 | BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1); | ||
1241 | case BLKIO_PROP_io_serviced: | ||
1242 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1243 | BLKIO_STAT_CPU_SERVICED, 1, 1); | ||
1244 | case BLKIO_PROP_io_service_time: | ||
1245 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1246 | BLKIO_STAT_SERVICE_TIME, 1, 0); | ||
1247 | case BLKIO_PROP_io_wait_time: | ||
1248 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1249 | BLKIO_STAT_WAIT_TIME, 1, 0); | ||
1250 | case BLKIO_PROP_io_merged: | ||
1251 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1252 | BLKIO_STAT_CPU_MERGED, 1, 1); | ||
1253 | case BLKIO_PROP_io_queued: | ||
1254 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1255 | BLKIO_STAT_QUEUED, 1, 0); | ||
1256 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
1257 | case BLKIO_PROP_unaccounted_time: | ||
1258 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1259 | BLKIO_STAT_UNACCOUNTED_TIME, 0, 0); | ||
1260 | case BLKIO_PROP_dequeue: | ||
1261 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1262 | BLKIO_STAT_DEQUEUE, 0, 0); | ||
1263 | case BLKIO_PROP_avg_queue_size: | ||
1264 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1265 | BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0); | ||
1266 | case BLKIO_PROP_group_wait_time: | ||
1267 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1268 | BLKIO_STAT_GROUP_WAIT_TIME, 0, 0); | ||
1269 | case BLKIO_PROP_idle_time: | ||
1270 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1271 | BLKIO_STAT_IDLE_TIME, 0, 0); | ||
1272 | case BLKIO_PROP_empty_time: | ||
1273 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1274 | BLKIO_STAT_EMPTY_TIME, 0, 0); | ||
1275 | #endif | ||
1276 | default: | ||
1277 | BUG(); | ||
1278 | } | ||
1279 | break; | ||
1280 | case BLKIO_POLICY_THROTL: | ||
1281 | switch(name){ | ||
1282 | case BLKIO_THROTL_io_service_bytes: | ||
1283 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1284 | BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1); | ||
1285 | case BLKIO_THROTL_io_serviced: | ||
1286 | return blkio_read_blkg_stats(blkcg, cft, cb, | ||
1287 | BLKIO_STAT_CPU_SERVICED, 1, 1); | ||
1288 | default: | ||
1289 | BUG(); | ||
1290 | } | ||
1291 | break; | ||
1292 | default: | ||
1293 | BUG(); | ||
1294 | } | ||
749 | 1295 | ||
750 | /* | 1296 | return 0; |
751 | * blkio subsystem is utterly broken in terms of hierarchy support. | 1297 | } |
752 | * It treats all cgroups equally regardless of where they're | ||
753 | * located in the hierarchy - all cgroups are treated as if they're | ||
754 | * right below the root. Fix it and remove the following. | ||
755 | */ | ||
756 | .broken_hierarchy = true, | ||
757 | }; | ||
758 | EXPORT_SYMBOL_GPL(blkio_subsys); | ||
759 | 1298 | ||
760 | /** | 1299 | static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val) |
761 | * blkcg_activate_policy - activate a blkcg policy on a request_queue | ||
762 | * @q: request_queue of interest | ||
763 | * @pol: blkcg policy to activate | ||
764 | * | ||
765 | * Activate @pol on @q. Requires %GFP_KERNEL context. @q goes through | ||
766 | * bypass mode to populate its blkgs with policy_data for @pol. | ||
767 | * | ||
768 | * Activation happens with @q bypassed, so nobody would be accessing blkgs | ||
769 | * from IO path. Update of each blkg is protected by both queue and blkcg | ||
770 | * locks so that holding either lock and testing blkcg_policy_enabled() is | ||
771 | * always enough for dereferencing policy data. | ||
772 | * | ||
773 | * The caller is responsible for synchronizing [de]activations and policy | ||
774 | * [un]registrations. Returns 0 on success, -errno on failure. | ||
775 | */ | ||
776 | int blkcg_activate_policy(struct request_queue *q, | ||
777 | const struct blkcg_policy *pol) | ||
778 | { | 1300 | { |
779 | LIST_HEAD(pds); | 1301 | struct blkio_group *blkg; |
780 | struct blkcg_gq *blkg; | 1302 | struct hlist_node *n; |
781 | struct blkg_policy_data *pd, *n; | 1303 | struct blkio_policy_node *pn; |
782 | int cnt = 0, ret; | ||
783 | bool preloaded; | ||
784 | |||
785 | if (blkcg_policy_enabled(q, pol)) | ||
786 | return 0; | ||
787 | 1304 | ||
788 | /* preallocations for root blkg */ | 1305 | if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) |
789 | blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); | 1306 | return -EINVAL; |
790 | if (!blkg) | ||
791 | return -ENOMEM; | ||
792 | 1307 | ||
793 | preloaded = !radix_tree_preload(GFP_KERNEL); | 1308 | spin_lock(&blkio_list_lock); |
1309 | spin_lock_irq(&blkcg->lock); | ||
1310 | blkcg->weight = (unsigned int)val; | ||
794 | 1311 | ||
795 | blk_queue_bypass_start(q); | 1312 | hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { |
1313 | pn = blkio_policy_search_node(blkcg, blkg->dev, | ||
1314 | BLKIO_POLICY_PROP, BLKIO_PROP_weight_device); | ||
1315 | if (pn) | ||
1316 | continue; | ||
796 | 1317 | ||
797 | /* make sure the root blkg exists and count the existing blkgs */ | 1318 | blkio_update_group_weight(blkg, blkcg->weight); |
798 | spin_lock_irq(q->queue_lock); | 1319 | } |
1320 | spin_unlock_irq(&blkcg->lock); | ||
1321 | spin_unlock(&blkio_list_lock); | ||
1322 | return 0; | ||
1323 | } | ||
799 | 1324 | ||
800 | rcu_read_lock(); | 1325 | static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) { |
801 | blkg = __blkg_lookup_create(&blkcg_root, q, blkg); | 1326 | struct blkio_cgroup *blkcg; |
802 | rcu_read_unlock(); | 1327 | enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); |
1328 | int name = BLKIOFILE_ATTR(cft->private); | ||
803 | 1329 | ||
804 | if (preloaded) | 1330 | blkcg = cgroup_to_blkio_cgroup(cgrp); |
805 | radix_tree_preload_end(); | ||
806 | 1331 | ||
807 | if (IS_ERR(blkg)) { | 1332 | switch(plid) { |
808 | ret = PTR_ERR(blkg); | 1333 | case BLKIO_POLICY_PROP: |
809 | goto out_unlock; | 1334 | switch(name) { |
1335 | case BLKIO_PROP_weight: | ||
1336 | return (u64)blkcg->weight; | ||
1337 | } | ||
1338 | break; | ||
1339 | default: | ||
1340 | BUG(); | ||
810 | } | 1341 | } |
811 | q->root_blkg = blkg; | 1342 | return 0; |
812 | q->root_rl.blkg = blkg; | 1343 | } |
813 | 1344 | ||
814 | list_for_each_entry(blkg, &q->blkg_list, q_node) | 1345 | static int |
815 | cnt++; | 1346 | blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) |
1347 | { | ||
1348 | struct blkio_cgroup *blkcg; | ||
1349 | enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); | ||
1350 | int name = BLKIOFILE_ATTR(cft->private); | ||
816 | 1351 | ||
817 | spin_unlock_irq(q->queue_lock); | 1352 | blkcg = cgroup_to_blkio_cgroup(cgrp); |
818 | 1353 | ||
819 | /* allocate policy_data for all existing blkgs */ | 1354 | switch(plid) { |
820 | while (cnt--) { | 1355 | case BLKIO_POLICY_PROP: |
821 | pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node); | 1356 | switch(name) { |
822 | if (!pd) { | 1357 | case BLKIO_PROP_weight: |
823 | ret = -ENOMEM; | 1358 | return blkio_weight_write(blkcg, val); |
824 | goto out_free; | ||
825 | } | 1359 | } |
826 | list_add_tail(&pd->alloc_node, &pds); | 1360 | break; |
1361 | default: | ||
1362 | BUG(); | ||
827 | } | 1363 | } |
828 | 1364 | ||
829 | /* | 1365 | return 0; |
830 | * Install the allocated pds. With @q bypassing, no new blkg | 1366 | } |
831 | * should have been created while the queue lock was dropped. | ||
832 | */ | ||
833 | spin_lock_irq(q->queue_lock); | ||
834 | 1367 | ||
835 | list_for_each_entry(blkg, &q->blkg_list, q_node) { | 1368 | struct cftype blkio_files[] = { |
836 | if (WARN_ON(list_empty(&pds))) { | 1369 | { |
837 | /* umm... this shouldn't happen, just abort */ | 1370 | .name = "weight_device", |
838 | ret = -ENOMEM; | 1371 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, |
839 | goto out_unlock; | 1372 | BLKIO_PROP_weight_device), |
840 | } | 1373 | .read_seq_string = blkiocg_file_read, |
841 | pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node); | 1374 | .write_string = blkiocg_file_write, |
842 | list_del_init(&pd->alloc_node); | 1375 | .max_write_len = 256, |
1376 | }, | ||
1377 | { | ||
1378 | .name = "weight", | ||
1379 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | ||
1380 | BLKIO_PROP_weight), | ||
1381 | .read_u64 = blkiocg_file_read_u64, | ||
1382 | .write_u64 = blkiocg_file_write_u64, | ||
1383 | }, | ||
1384 | { | ||
1385 | .name = "time", | ||
1386 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | ||
1387 | BLKIO_PROP_time), | ||
1388 | .read_map = blkiocg_file_read_map, | ||
1389 | }, | ||
1390 | { | ||
1391 | .name = "sectors", | ||
1392 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | ||
1393 | BLKIO_PROP_sectors), | ||
1394 | .read_map = blkiocg_file_read_map, | ||
1395 | }, | ||
1396 | { | ||
1397 | .name = "io_service_bytes", | ||
1398 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | ||
1399 | BLKIO_PROP_io_service_bytes), | ||
1400 | .read_map = blkiocg_file_read_map, | ||
1401 | }, | ||
1402 | { | ||
1403 | .name = "io_serviced", | ||
1404 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | ||
1405 | BLKIO_PROP_io_serviced), | ||
1406 | .read_map = blkiocg_file_read_map, | ||
1407 | }, | ||
1408 | { | ||
1409 | .name = "io_service_time", | ||
1410 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | ||
1411 | BLKIO_PROP_io_service_time), | ||
1412 | .read_map = blkiocg_file_read_map, | ||
1413 | }, | ||
1414 | { | ||
1415 | .name = "io_wait_time", | ||
1416 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | ||
1417 | BLKIO_PROP_io_wait_time), | ||
1418 | .read_map = blkiocg_file_read_map, | ||
1419 | }, | ||
1420 | { | ||
1421 | .name = "io_merged", | ||
1422 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | ||
1423 | BLKIO_PROP_io_merged), | ||
1424 | .read_map = blkiocg_file_read_map, | ||
1425 | }, | ||
1426 | { | ||
1427 | .name = "io_queued", | ||
1428 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | ||
1429 | BLKIO_PROP_io_queued), | ||
1430 | .read_map = blkiocg_file_read_map, | ||
1431 | }, | ||
1432 | { | ||
1433 | .name = "reset_stats", | ||
1434 | .write_u64 = blkiocg_reset_stats, | ||
1435 | }, | ||
1436 | #ifdef CONFIG_BLK_DEV_THROTTLING | ||
1437 | { | ||
1438 | .name = "throttle.read_bps_device", | ||
1439 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, | ||
1440 | BLKIO_THROTL_read_bps_device), | ||
1441 | .read_seq_string = blkiocg_file_read, | ||
1442 | .write_string = blkiocg_file_write, | ||
1443 | .max_write_len = 256, | ||
1444 | }, | ||
843 | 1445 | ||
844 | /* grab blkcg lock too while installing @pd on @blkg */ | 1446 | { |
845 | spin_lock(&blkg->blkcg->lock); | 1447 | .name = "throttle.write_bps_device", |
1448 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, | ||
1449 | BLKIO_THROTL_write_bps_device), | ||
1450 | .read_seq_string = blkiocg_file_read, | ||
1451 | .write_string = blkiocg_file_write, | ||
1452 | .max_write_len = 256, | ||
1453 | }, | ||
846 | 1454 | ||
847 | blkg->pd[pol->plid] = pd; | 1455 | { |
848 | pd->blkg = blkg; | 1456 | .name = "throttle.read_iops_device", |
849 | pol->pd_init_fn(blkg); | 1457 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, |
1458 | BLKIO_THROTL_read_iops_device), | ||
1459 | .read_seq_string = blkiocg_file_read, | ||
1460 | .write_string = blkiocg_file_write, | ||
1461 | .max_write_len = 256, | ||
1462 | }, | ||
850 | 1463 | ||
851 | spin_unlock(&blkg->blkcg->lock); | 1464 | { |
852 | } | 1465 | .name = "throttle.write_iops_device", |
1466 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, | ||
1467 | BLKIO_THROTL_write_iops_device), | ||
1468 | .read_seq_string = blkiocg_file_read, | ||
1469 | .write_string = blkiocg_file_write, | ||
1470 | .max_write_len = 256, | ||
1471 | }, | ||
1472 | { | ||
1473 | .name = "throttle.io_service_bytes", | ||
1474 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, | ||
1475 | BLKIO_THROTL_io_service_bytes), | ||
1476 | .read_map = blkiocg_file_read_map, | ||
1477 | }, | ||
1478 | { | ||
1479 | .name = "throttle.io_serviced", | ||
1480 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, | ||
1481 | BLKIO_THROTL_io_serviced), | ||
1482 | .read_map = blkiocg_file_read_map, | ||
1483 | }, | ||
1484 | #endif /* CONFIG_BLK_DEV_THROTTLING */ | ||
853 | 1485 | ||
854 | __set_bit(pol->plid, q->blkcg_pols); | 1486 | #ifdef CONFIG_DEBUG_BLK_CGROUP |
855 | ret = 0; | 1487 | { |
856 | out_unlock: | 1488 | .name = "avg_queue_size", |
857 | spin_unlock_irq(q->queue_lock); | 1489 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, |
858 | out_free: | 1490 | BLKIO_PROP_avg_queue_size), |
859 | blk_queue_bypass_end(q); | 1491 | .read_map = blkiocg_file_read_map, |
860 | list_for_each_entry_safe(pd, n, &pds, alloc_node) | 1492 | }, |
861 | kfree(pd); | 1493 | { |
862 | return ret; | 1494 | .name = "group_wait_time", |
1495 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | ||
1496 | BLKIO_PROP_group_wait_time), | ||
1497 | .read_map = blkiocg_file_read_map, | ||
1498 | }, | ||
1499 | { | ||
1500 | .name = "idle_time", | ||
1501 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | ||
1502 | BLKIO_PROP_idle_time), | ||
1503 | .read_map = blkiocg_file_read_map, | ||
1504 | }, | ||
1505 | { | ||
1506 | .name = "empty_time", | ||
1507 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | ||
1508 | BLKIO_PROP_empty_time), | ||
1509 | .read_map = blkiocg_file_read_map, | ||
1510 | }, | ||
1511 | { | ||
1512 | .name = "dequeue", | ||
1513 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | ||
1514 | BLKIO_PROP_dequeue), | ||
1515 | .read_map = blkiocg_file_read_map, | ||
1516 | }, | ||
1517 | { | ||
1518 | .name = "unaccounted_time", | ||
1519 | .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, | ||
1520 | BLKIO_PROP_unaccounted_time), | ||
1521 | .read_map = blkiocg_file_read_map, | ||
1522 | }, | ||
1523 | #endif | ||
1524 | }; | ||
1525 | |||
1526 | static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup) | ||
1527 | { | ||
1528 | return cgroup_add_files(cgroup, subsys, blkio_files, | ||
1529 | ARRAY_SIZE(blkio_files)); | ||
863 | } | 1530 | } |
864 | EXPORT_SYMBOL_GPL(blkcg_activate_policy); | ||
865 | 1531 | ||
866 | /** | 1532 | static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) |
867 | * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue | ||
868 | * @q: request_queue of interest | ||
869 | * @pol: blkcg policy to deactivate | ||
870 | * | ||
871 | * Deactivate @pol on @q. Follows the same synchronization rules as | ||
872 | * blkcg_activate_policy(). | ||
873 | */ | ||
874 | void blkcg_deactivate_policy(struct request_queue *q, | ||
875 | const struct blkcg_policy *pol) | ||
876 | { | 1533 | { |
877 | struct blkcg_gq *blkg; | 1534 | struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); |
1535 | unsigned long flags; | ||
1536 | struct blkio_group *blkg; | ||
1537 | void *key; | ||
1538 | struct blkio_policy_type *blkiop; | ||
1539 | struct blkio_policy_node *pn, *pntmp; | ||
878 | 1540 | ||
879 | if (!blkcg_policy_enabled(q, pol)) | 1541 | rcu_read_lock(); |
880 | return; | 1542 | do { |
1543 | spin_lock_irqsave(&blkcg->lock, flags); | ||
881 | 1544 | ||
882 | blk_queue_bypass_start(q); | 1545 | if (hlist_empty(&blkcg->blkg_list)) { |
883 | spin_lock_irq(q->queue_lock); | 1546 | spin_unlock_irqrestore(&blkcg->lock, flags); |
1547 | break; | ||
1548 | } | ||
884 | 1549 | ||
885 | __clear_bit(pol->plid, q->blkcg_pols); | 1550 | blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group, |
1551 | blkcg_node); | ||
1552 | key = rcu_dereference(blkg->key); | ||
1553 | __blkiocg_del_blkio_group(blkg); | ||
886 | 1554 | ||
887 | /* if no policy is left, no need for blkgs - shoot them down */ | 1555 | spin_unlock_irqrestore(&blkcg->lock, flags); |
888 | if (bitmap_empty(q->blkcg_pols, BLKCG_MAX_POLS)) | ||
889 | blkg_destroy_all(q); | ||
890 | 1556 | ||
891 | list_for_each_entry(blkg, &q->blkg_list, q_node) { | 1557 | /* |
892 | /* grab blkcg lock too while removing @pd from @blkg */ | 1558 | * This blkio_group is being unlinked as associated cgroup is |
893 | spin_lock(&blkg->blkcg->lock); | 1559 | * going away. Let all the IO controlling policies know about |
1560 | * this event. | ||
1561 | */ | ||
1562 | spin_lock(&blkio_list_lock); | ||
1563 | list_for_each_entry(blkiop, &blkio_list, list) { | ||
1564 | if (blkiop->plid != blkg->plid) | ||
1565 | continue; | ||
1566 | blkiop->ops.blkio_unlink_group_fn(key, blkg); | ||
1567 | } | ||
1568 | spin_unlock(&blkio_list_lock); | ||
1569 | } while (1); | ||
894 | 1570 | ||
895 | if (pol->pd_exit_fn) | 1571 | list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) { |
896 | pol->pd_exit_fn(blkg); | 1572 | blkio_policy_delete_node(pn); |
1573 | kfree(pn); | ||
1574 | } | ||
897 | 1575 | ||
898 | kfree(blkg->pd[pol->plid]); | 1576 | free_css_id(&blkio_subsys, &blkcg->css); |
899 | blkg->pd[pol->plid] = NULL; | 1577 | rcu_read_unlock(); |
1578 | if (blkcg != &blkio_root_cgroup) | ||
1579 | kfree(blkcg); | ||
1580 | } | ||
900 | 1581 | ||
901 | spin_unlock(&blkg->blkcg->lock); | 1582 | static struct cgroup_subsys_state * |
1583 | blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup) | ||
1584 | { | ||
1585 | struct blkio_cgroup *blkcg; | ||
1586 | struct cgroup *parent = cgroup->parent; | ||
1587 | |||
1588 | if (!parent) { | ||
1589 | blkcg = &blkio_root_cgroup; | ||
1590 | goto done; | ||
902 | } | 1591 | } |
903 | 1592 | ||
904 | spin_unlock_irq(q->queue_lock); | 1593 | blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); |
905 | blk_queue_bypass_end(q); | 1594 | if (!blkcg) |
1595 | return ERR_PTR(-ENOMEM); | ||
1596 | |||
1597 | blkcg->weight = BLKIO_WEIGHT_DEFAULT; | ||
1598 | done: | ||
1599 | spin_lock_init(&blkcg->lock); | ||
1600 | INIT_HLIST_HEAD(&blkcg->blkg_list); | ||
1601 | |||
1602 | INIT_LIST_HEAD(&blkcg->policy_list); | ||
1603 | return &blkcg->css; | ||
906 | } | 1604 | } |
907 | EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); | ||
908 | 1605 | ||
909 | /** | 1606 | /* |
910 | * blkcg_policy_register - register a blkcg policy | 1607 | * We cannot support shared io contexts, as we have no means to support |
911 | * @pol: blkcg policy to register | 1608 | * two tasks with the same ioc in two different groups without major rework |
912 | * | 1609 | * of the main cic data structures. For now we allow a task to change |
913 | * Register @pol with blkcg core. Might sleep and @pol may be modified on | 1610 | * its cgroup only if it's the only owner of its ioc. |
914 | * successful registration. Returns 0 on success and -errno on failure. | ||
915 | */ | 1611 | */ |
916 | int blkcg_policy_register(struct blkcg_policy *pol) | 1612 | static int blkiocg_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) |
917 | { | 1613 | { |
918 | int i, ret; | 1614 | struct io_context *ioc; |
919 | 1615 | int ret = 0; | |
920 | if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data))) | ||
921 | return -EINVAL; | ||
922 | 1616 | ||
923 | mutex_lock(&blkcg_pol_mutex); | 1617 | /* task_lock() is needed to avoid races with exit_io_context() */ |
1618 | task_lock(tsk); | ||
1619 | ioc = tsk->io_context; | ||
1620 | if (ioc && atomic_read(&ioc->nr_tasks) > 1) | ||
1621 | ret = -EINVAL; | ||
1622 | task_unlock(tsk); | ||
924 | 1623 | ||
925 | /* find an empty slot */ | ||
926 | ret = -ENOSPC; | ||
927 | for (i = 0; i < BLKCG_MAX_POLS; i++) | ||
928 | if (!blkcg_policy[i]) | ||
929 | break; | ||
930 | if (i >= BLKCG_MAX_POLS) | ||
931 | goto out_unlock; | ||
932 | |||
933 | /* register and update blkgs */ | ||
934 | pol->plid = i; | ||
935 | blkcg_policy[i] = pol; | ||
936 | |||
937 | /* everything is in place, add intf files for the new policy */ | ||
938 | if (pol->cftypes) | ||
939 | WARN_ON(cgroup_add_cftypes(&blkio_subsys, pol->cftypes)); | ||
940 | ret = 0; | ||
941 | out_unlock: | ||
942 | mutex_unlock(&blkcg_pol_mutex); | ||
943 | return ret; | 1624 | return ret; |
944 | } | 1625 | } |
945 | EXPORT_SYMBOL_GPL(blkcg_policy_register); | ||
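To make the registration flow above concrete, a minimal, hypothetical policy skeleton might look as follows; every example_* name is invented, and only the blkcg_* types and calls come from this file:

struct example_pd {
        struct blkg_policy_data pd;     /* must be the first member */
        u64 limit;                      /* policy-specific state */
};

static void example_pd_init(struct blkcg_gq *blkg)
{
        /* blkg->pd[example_policy.plid] points at a zeroed example_pd here */
}

static struct blkcg_policy example_policy = {
        .pd_size        = sizeof(struct example_pd),
        .pd_init_fn     = example_pd_init,
};

static int __init example_policy_init(void)
{
        /* assigns example_policy.plid and adds its cftypes, if any */
        return blkcg_policy_register(&example_policy);
}

Before touching per-blkg data on a given queue, such a policy would also call blkcg_activate_policy(q, &example_policy), which allocates an example_pd for every existing blkg and runs example_pd_init() on each.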
946 | 1626 | ||
947 | /** | 1627 | static void blkiocg_attach_task(struct cgroup *cgrp, struct task_struct *tsk) |
948 | * blkcg_policy_unregister - unregister a blkcg policy | 1628 | { |
949 | * @pol: blkcg policy to unregister | 1629 | struct io_context *ioc; |
950 | * | 1630 | |
951 | * Undo blkcg_policy_register(@pol). Might sleep. | 1631 | task_lock(tsk); |
952 | */ | 1632 | ioc = tsk->io_context; |
953 | void blkcg_policy_unregister(struct blkcg_policy *pol) | 1633 | if (ioc) |
1634 | ioc->cgroup_changed = 1; | ||
1635 | task_unlock(tsk); | ||
1636 | } | ||
1637 | |||
1638 | void blkio_policy_register(struct blkio_policy_type *blkiop) | ||
954 | { | 1639 | { |
955 | mutex_lock(&blkcg_pol_mutex); | 1640 | spin_lock(&blkio_list_lock); |
1641 | list_add_tail(&blkiop->list, &blkio_list); | ||
1642 | spin_unlock(&blkio_list_lock); | ||
1643 | } | ||
1644 | EXPORT_SYMBOL_GPL(blkio_policy_register); | ||
956 | 1645 | ||
957 | if (WARN_ON(blkcg_policy[pol->plid] != pol)) | 1646 | void blkio_policy_unregister(struct blkio_policy_type *blkiop) |
958 | goto out_unlock; | 1647 | { |
1648 | spin_lock(&blkio_list_lock); | ||
1649 | list_del_init(&blkiop->list); | ||
1650 | spin_unlock(&blkio_list_lock); | ||
1651 | } | ||
1652 | EXPORT_SYMBOL_GPL(blkio_policy_unregister); | ||
959 | 1653 | ||
960 | /* kill the intf files first */ | 1654 | static int __init init_cgroup_blkio(void) |
961 | if (pol->cftypes) | 1655 | { |
962 | cgroup_rm_cftypes(&blkio_subsys, pol->cftypes); | 1656 | return cgroup_load_subsys(&blkio_subsys); |
1657 | } | ||
963 | 1658 | ||
964 | /* unregister and update blkgs */ | 1659 | static void __exit exit_cgroup_blkio(void) |
965 | blkcg_policy[pol->plid] = NULL; | 1660 | { |
966 | out_unlock: | 1661 | cgroup_unload_subsys(&blkio_subsys); |
967 | mutex_unlock(&blkcg_pol_mutex); | ||
968 | } | 1662 | } |
969 | EXPORT_SYMBOL_GPL(blkcg_policy_unregister); | 1663 | |
1664 | module_init(init_cgroup_blkio); | ||
1665 | module_exit(exit_cgroup_blkio); | ||
1666 | MODULE_LICENSE("GPL"); | ||
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 24597309e23..a71d2904ffb 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h | |||
@@ -15,491 +15,350 @@ | |||
15 | 15 | ||
16 | #include <linux/cgroup.h> | 16 | #include <linux/cgroup.h> |
17 | #include <linux/u64_stats_sync.h> | 17 | #include <linux/u64_stats_sync.h> |
18 | #include <linux/seq_file.h> | 18 | |
19 | #include <linux/radix-tree.h> | 19 | enum blkio_policy_id { |
20 | #include <linux/blkdev.h> | 20 | BLKIO_POLICY_PROP = 0, /* Proportional Bandwidth division */ |
21 | BLKIO_POLICY_THROTL, /* Throttling */ | ||
22 | }; | ||
21 | 23 | ||
22 | /* Max limits for throttle policy */ | 24 | /* Max limits for throttle policy */ |
23 | #define THROTL_IOPS_MAX UINT_MAX | 25 | #define THROTL_IOPS_MAX UINT_MAX |
24 | 26 | ||
25 | /* CFQ specific, out here for blkcg->cfq_weight */ | 27 | #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) |
26 | #define CFQ_WEIGHT_MIN 10 | 28 | |
27 | #define CFQ_WEIGHT_MAX 1000 | 29 | #ifndef CONFIG_BLK_CGROUP |
28 | #define CFQ_WEIGHT_DEFAULT 500 | 30 | /* When blk-cgroup is a module, its subsys_id isn't a compile-time constant */ |
29 | 31 | extern struct cgroup_subsys blkio_subsys; | |
30 | #ifdef CONFIG_BLK_CGROUP | 32 | #define blkio_subsys_id blkio_subsys.subsys_id |
31 | 33 | #endif | |
32 | enum blkg_rwstat_type { | 34 | |
33 | BLKG_RWSTAT_READ, | 35 | enum stat_type { |
34 | BLKG_RWSTAT_WRITE, | 36 | /* Total time spent (in ns) between request dispatch to the driver and |
35 | BLKG_RWSTAT_SYNC, | 37 | * request completion for IOs done by this cgroup. This may not be |
36 | BLKG_RWSTAT_ASYNC, | 38 | * accurate when NCQ is turned on. */ |
37 | 39 | BLKIO_STAT_SERVICE_TIME = 0, | |
38 | BLKG_RWSTAT_NR, | 40 | /* Total time spent waiting in scheduler queue in ns */ |
39 | BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR, | 41 | BLKIO_STAT_WAIT_TIME, |
42 | /* Number of IOs queued up */ | ||
43 | BLKIO_STAT_QUEUED, | ||
44 | /* All the single valued stats go below this */ | ||
45 | BLKIO_STAT_TIME, | ||
46 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
47 | /* Time not charged to this cgroup */ | ||
48 | BLKIO_STAT_UNACCOUNTED_TIME, | ||
49 | BLKIO_STAT_AVG_QUEUE_SIZE, | ||
50 | BLKIO_STAT_IDLE_TIME, | ||
51 | BLKIO_STAT_EMPTY_TIME, | ||
52 | BLKIO_STAT_GROUP_WAIT_TIME, | ||
53 | BLKIO_STAT_DEQUEUE | ||
54 | #endif | ||
40 | }; | 55 | }; |
41 | 56 | ||
42 | struct blkcg_gq; | 57 | /* Per cpu stats */ |
43 | 58 | enum stat_type_cpu { | |
44 | struct blkcg { | 59 | BLKIO_STAT_CPU_SECTORS, |
45 | struct cgroup_subsys_state css; | 60 | /* Total bytes transferred */ |
46 | spinlock_t lock; | 61 | BLKIO_STAT_CPU_SERVICE_BYTES, |
47 | 62 | /* Total IOs serviced, post merge */ | |
48 | struct radix_tree_root blkg_tree; | 63 | BLKIO_STAT_CPU_SERVICED, |
49 | struct blkcg_gq *blkg_hint; | 64 | /* Number of IOs merged */ |
50 | struct hlist_head blkg_list; | 65 | BLKIO_STAT_CPU_MERGED, |
51 | 66 | BLKIO_STAT_CPU_NR | |
52 | /* for policies to test whether associated blkcg has changed */ | ||
53 | uint64_t id; | ||
54 | |||
55 | /* TODO: per-policy storage in blkcg */ | ||
56 | unsigned int cfq_weight; /* belongs to cfq */ | ||
57 | }; | 67 | }; |
58 | 68 | ||
59 | struct blkg_stat { | 69 | enum stat_sub_type { |
60 | struct u64_stats_sync syncp; | 70 | BLKIO_STAT_READ = 0, |
61 | uint64_t cnt; | 71 | BLKIO_STAT_WRITE, |
72 | BLKIO_STAT_SYNC, | ||
73 | BLKIO_STAT_ASYNC, | ||
74 | BLKIO_STAT_TOTAL | ||
62 | }; | 75 | }; |
63 | 76 | ||
64 | struct blkg_rwstat { | 77 | /* blkg state flags */ |
65 | struct u64_stats_sync syncp; | 78 | enum blkg_state_flags { |
66 | uint64_t cnt[BLKG_RWSTAT_NR]; | 79 | BLKG_waiting = 0, |
80 | BLKG_idling, | ||
81 | BLKG_empty, | ||
67 | }; | 82 | }; |
68 | 83 | ||
69 | /* | 84 | /* cgroup files owned by proportional weight policy */ |
70 | * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a | 85 | enum blkcg_file_name_prop { |
71 | * request_queue (q). This is used by blkcg policies which need to track | 86 | BLKIO_PROP_weight = 1, |
72 | * information per blkcg - q pair. | 87 | BLKIO_PROP_weight_device, |
73 | * | 88 | BLKIO_PROP_io_service_bytes, |
74 | * There can be multiple active blkcg policies and each has its private | 89 | BLKIO_PROP_io_serviced, |
75 | * data on each blkg, the size of which is determined by | 90 | BLKIO_PROP_time, |
76 | * blkcg_policy->pd_size. blkcg core allocates and frees such areas | 91 | BLKIO_PROP_sectors, |
77 | * together with blkg and invokes pd_init/exit_fn() methods. | 92 | BLKIO_PROP_unaccounted_time, |
78 | * | 93 | BLKIO_PROP_io_service_time, |
79 | * Such private data must embed struct blkg_policy_data (pd) at the | 94 | BLKIO_PROP_io_wait_time, |
80 | * beginning and pd_size can't be smaller than pd. | 95 | BLKIO_PROP_io_merged, |
81 | */ | 96 | BLKIO_PROP_io_queued, |
82 | struct blkg_policy_data { | 97 | BLKIO_PROP_avg_queue_size, |
83 | /* the blkg this per-policy data belongs to */ | 98 | BLKIO_PROP_group_wait_time, |
84 | struct blkcg_gq *blkg; | 99 | BLKIO_PROP_idle_time, |
85 | 100 | BLKIO_PROP_empty_time, | |
86 | /* used during policy activation */ | 101 | BLKIO_PROP_dequeue, |
87 | struct list_head alloc_node; | ||
88 | }; | 102 | }; |
89 | 103 | ||
90 | /* association between a blk cgroup and a request queue */ | 104 | /* cgroup files owned by throttle policy */ |
91 | struct blkcg_gq { | 105 | enum blkcg_file_name_throtl { |
92 | /* Pointer to the associated request_queue */ | 106 | BLKIO_THROTL_read_bps_device, |
93 | struct request_queue *q; | 107 | BLKIO_THROTL_write_bps_device, |
94 | struct list_head q_node; | 108 | BLKIO_THROTL_read_iops_device, |
95 | struct hlist_node blkcg_node; | 109 | BLKIO_THROTL_write_iops_device, |
96 | struct blkcg *blkcg; | 110 | BLKIO_THROTL_io_service_bytes, |
97 | /* request allocation list for this blkcg-q pair */ | 111 | BLKIO_THROTL_io_serviced, |
98 | struct request_list rl; | ||
99 | /* reference count */ | ||
100 | int refcnt; | ||
101 | |||
102 | struct blkg_policy_data *pd[BLKCG_MAX_POLS]; | ||
103 | |||
104 | struct rcu_head rcu_head; | ||
105 | }; | 112 | }; |
106 | 113 | ||
107 | typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); | 114 | struct blkio_cgroup { |
108 | typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg); | 115 | struct cgroup_subsys_state css; |
109 | typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); | 116 | unsigned int weight; |
110 | 117 | spinlock_t lock; | |
111 | struct blkcg_policy { | 118 | struct hlist_head blkg_list; |
112 | int plid; | 119 | struct list_head policy_list; /* list of blkio_policy_node */ |
113 | /* policy specific private data size */ | ||
114 | size_t pd_size; | ||
115 | /* cgroup files for the policy */ | ||
116 | struct cftype *cftypes; | ||
117 | |||
118 | /* operations */ | ||
119 | blkcg_pol_init_pd_fn *pd_init_fn; | ||
120 | blkcg_pol_exit_pd_fn *pd_exit_fn; | ||
121 | blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; | ||
122 | }; | 120 | }; |
123 | 121 | ||
124 | extern struct blkcg blkcg_root; | 122 | struct blkio_group_stats { |
125 | 123 | /* total disk time and nr sectors dispatched by this group */ | |
126 | struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q); | 124 | uint64_t time; |
127 | struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, | 125 | uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL]; |
128 | struct request_queue *q); | 126 | #ifdef CONFIG_DEBUG_BLK_CGROUP |
129 | int blkcg_init_queue(struct request_queue *q); | 127 | /* Time not charged to this cgroup */ |
130 | void blkcg_drain_queue(struct request_queue *q); | 128 | uint64_t unaccounted_time; |
131 | void blkcg_exit_queue(struct request_queue *q); | 129 | |
132 | 130 | /* Sum of number of IOs queued across all samples */ | |
133 | /* Blkio controller policy registration */ | 131 | uint64_t avg_queue_size_sum; |
134 | int blkcg_policy_register(struct blkcg_policy *pol); | 132 | /* Count of samples taken for average */ |
135 | void blkcg_policy_unregister(struct blkcg_policy *pol); | 133 | uint64_t avg_queue_size_samples; |
136 | int blkcg_activate_policy(struct request_queue *q, | 134 | /* How many times this group has been removed from service tree */ |
137 | const struct blkcg_policy *pol); | 135 | unsigned long dequeue; |
138 | void blkcg_deactivate_policy(struct request_queue *q, | 136 | |
139 | const struct blkcg_policy *pol); | 137 | /* Total time spent waiting for it to be assigned a timeslice. */ |
140 | 138 | uint64_t group_wait_time; | |
141 | void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, | 139 | uint64_t start_group_wait_time; |
142 | u64 (*prfill)(struct seq_file *, | 140 | |
143 | struct blkg_policy_data *, int), | 141 | /* Time spent idling for this blkio_group */ |
144 | const struct blkcg_policy *pol, int data, | 142 | uint64_t idle_time; |
145 | bool show_total); | 143 | uint64_t start_idle_time; |
146 | u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); | ||
147 | u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, | ||
148 | const struct blkg_rwstat *rwstat); | ||
149 | u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off); | ||
150 | u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, | ||
151 | int off); | ||
152 | |||
153 | struct blkg_conf_ctx { | ||
154 | struct gendisk *disk; | ||
155 | struct blkcg_gq *blkg; | ||
156 | u64 v; | ||
157 | }; | ||
158 | |||
159 | int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, | ||
160 | const char *input, struct blkg_conf_ctx *ctx); | ||
161 | void blkg_conf_finish(struct blkg_conf_ctx *ctx); | ||
162 | |||
163 | |||
164 | static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) | ||
165 | { | ||
166 | return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), | ||
167 | struct blkcg, css); | ||
168 | } | ||
169 | |||
170 | static inline struct blkcg *task_blkcg(struct task_struct *tsk) | ||
171 | { | ||
172 | return container_of(task_subsys_state(tsk, blkio_subsys_id), | ||
173 | struct blkcg, css); | ||
174 | } | ||
175 | |||
176 | static inline struct blkcg *bio_blkcg(struct bio *bio) | ||
177 | { | ||
178 | if (bio && bio->bi_css) | ||
179 | return container_of(bio->bi_css, struct blkcg, css); | ||
180 | return task_blkcg(current); | ||
181 | } | ||
182 | |||
183 | /** | ||
184 | * blkg_to_pdata - get policy private data | ||
185 | * @blkg: blkg of interest | ||
186 | * @pol: policy of interest | ||
187 | * | ||
188 | * Return pointer to private data associated with the @blkg-@pol pair. | ||
189 | */ | ||
190 | static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, | ||
191 | struct blkcg_policy *pol) | ||
192 | { | ||
193 | return blkg ? blkg->pd[pol->plid] : NULL; | ||
194 | } | ||
195 | |||
196 | /** | ||
197 | * pdata_to_blkg - get blkg associated with policy private data | ||
198 | * @pd: policy private data of interest | ||
199 | * | ||
200 | * @pd is policy private data. Determine the blkg it's associated with. | ||
201 | */ | ||
202 | static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) | ||
203 | { | ||
204 | return pd ? pd->blkg : NULL; | ||
205 | } | ||
206 | |||
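A policy usually wraps these two generic accessors in typed helpers of its own. A sketch, reusing the hypothetical example_pd/example_policy names from the blk-cgroup.c sketch earlier:

static inline struct example_pd *blkg_to_example_pd(struct blkcg_gq *blkg)
{
        struct blkg_policy_data *pdata = blkg_to_pd(blkg, &example_policy);

        /* example_pd embeds struct blkg_policy_data as its first member */
        return pdata ? container_of(pdata, struct example_pd, pd) : NULL;
}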
207 | /** | ||
208 | * blkg_path - format cgroup path of blkg | ||
209 | * @blkg: blkg of interest | ||
210 | * @buf: target buffer | ||
211 | * @buflen: target buffer length | ||
212 | * | ||
213 | * Format the path of the cgroup of @blkg into @buf. | ||
214 | */ | ||
215 | static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) | ||
216 | { | ||
217 | int ret; | ||
218 | |||
219 | rcu_read_lock(); | ||
220 | ret = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); | ||
221 | rcu_read_unlock(); | ||
222 | if (ret) | ||
223 | strncpy(buf, "<unavailable>", buflen); | ||
224 | return ret; | ||
225 | } | ||
226 | |||
227 | /** | ||
228 | * blkg_get - get a blkg reference | ||
229 | * @blkg: blkg to get | ||
230 | * | ||
231 | * The caller should be holding queue_lock and an existing reference. | ||
232 | */ | ||
233 | static inline void blkg_get(struct blkcg_gq *blkg) | ||
234 | { | ||
235 | lockdep_assert_held(blkg->q->queue_lock); | ||
236 | WARN_ON_ONCE(!blkg->refcnt); | ||
237 | blkg->refcnt++; | ||
238 | } | ||
239 | |||
240 | void __blkg_release(struct blkcg_gq *blkg); | ||
241 | |||
242 | /** | ||
243 | * blkg_put - put a blkg reference | ||
244 | * @blkg: blkg to put | ||
245 | * | ||
246 | * The caller should be holding queue_lock. | ||
247 | */ | ||
248 | static inline void blkg_put(struct blkcg_gq *blkg) | ||
249 | { | ||
250 | lockdep_assert_held(blkg->q->queue_lock); | ||
251 | WARN_ON_ONCE(blkg->refcnt <= 0); | ||
252 | if (!--blkg->refcnt) | ||
253 | __blkg_release(blkg); | ||
254 | } | ||
255 | |||
256 | /** | ||
257 | * blk_get_rl - get request_list to use | ||
258 | * @q: request_queue of interest | ||
259 | * @bio: bio which will be attached to the allocated request (may be %NULL) | ||
260 | * | ||
261 | * The caller wants to allocate a request from @q to use for @bio. Find | ||
262 | * the request_list to use and obtain a reference on it. Should be called | ||
263 | * under queue_lock. This function is guaranteed to return non-%NULL | ||
264 | * request_list. | ||
265 | */ | ||
266 | static inline struct request_list *blk_get_rl(struct request_queue *q, | ||
267 | struct bio *bio) | ||
268 | { | ||
269 | struct blkcg *blkcg; | ||
270 | struct blkcg_gq *blkg; | ||
271 | |||
272 | rcu_read_lock(); | ||
273 | |||
274 | blkcg = bio_blkcg(bio); | ||
275 | |||
276 | /* bypass blkg lookup and use @q->root_rl directly for root */ | ||
277 | if (blkcg == &blkcg_root) | ||
278 | goto root_rl; | ||
279 | |||
280 | /* | 144 | /* |
281 | * Try to use blkg->rl. blkg lookup may fail under memory pressure | 145 | * Total time when we have requests queued and do not contain the |
282 | * or if either the blkcg or queue is going away. Fall back to | 146 | * current active queue. |
283 | * root_rl in such cases. | ||
284 | */ | 147 | */ |
285 | blkg = blkg_lookup_create(blkcg, q); | 148 | uint64_t empty_time; |
286 | if (unlikely(IS_ERR(blkg))) | 149 | uint64_t start_empty_time; |
287 | goto root_rl; | 150 | uint16_t flags; |
288 | 151 | #endif | |
289 | blkg_get(blkg); | 152 | }; |
290 | rcu_read_unlock(); | ||
291 | return &blkg->rl; | ||
292 | root_rl: | ||
293 | rcu_read_unlock(); | ||
294 | return &q->root_rl; | ||
295 | } | ||
296 | |||
297 | /** | ||
298 | * blk_put_rl - put request_list | ||
299 | * @rl: request_list to put | ||
300 | * | ||
301 | * Put the reference acquired by blk_get_rl(). Should be called under | ||
302 | * queue_lock. | ||
303 | */ | ||
304 | static inline void blk_put_rl(struct request_list *rl) | ||
305 | { | ||
306 | /* root_rl may not have blkg set */ | ||
307 | if (rl->blkg && rl->blkg->blkcg != &blkcg_root) | ||
308 | blkg_put(rl->blkg); | ||
309 | } | ||
310 | |||
311 | /** | ||
312 | * blk_rq_set_rl - associate a request with a request_list | ||
313 | * @rq: request of interest | ||
314 | * @rl: target request_list | ||
315 | * | ||
316 | * Associate @rq with @rl so that accounting and freeing can know the | ||
317 | * request_list @rq came from. | ||
318 | */ | ||
319 | static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) | ||
320 | { | ||
321 | rq->rl = rl; | ||
322 | } | ||
323 | |||
324 | /** | ||
325 | * blk_rq_rl - return the request_list a request came from | ||
326 | * @rq: request of interest | ||
327 | * | ||
328 | * Return the request_list @rq is allocated from. | ||
329 | */ | ||
330 | static inline struct request_list *blk_rq_rl(struct request *rq) | ||
331 | { | ||
332 | return rq->rl; | ||
333 | } | ||
334 | |||
335 | struct request_list *__blk_queue_next_rl(struct request_list *rl, | ||
336 | struct request_queue *q); | ||
337 | /** | ||
338 | * blk_queue_for_each_rl - iterate through all request_lists of a request_queue | ||
339 | * | ||
340 | * Should be used under queue_lock. | ||
341 | */ | ||
342 | #define blk_queue_for_each_rl(rl, q) \ | ||
343 | for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q))) | ||
344 | |||
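The request_list helpers above are meant to be used as a matched set on the allocation and free paths; the real callers live in blk-core.c. A hypothetical allocation-side sketch (example_alloc_from is invented):

static struct request *example_alloc_from(struct request_list *rl);    /* hypothetical */

static struct request *example_get_request(struct request_queue *q, struct bio *bio)
{
        struct request_list *rl;
        struct request *rq;

        lockdep_assert_held(q->queue_lock);

        rl = blk_get_rl(q, bio);        /* never NULL; pins the backing blkg */
        rq = example_alloc_from(rl);
        if (!rq) {
                blk_put_rl(rl);         /* allocation failed, drop the reference */
                return NULL;
        }
        blk_rq_set_rl(rq, rl);          /* remember which list @rq came from */
        return rq;
}

On the free path, still under queue_lock, the matching release is blk_put_rl(blk_rq_rl(rq)).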
345 | /** | ||
346 | * blkg_stat_add - add a value to a blkg_stat | ||
347 | * @stat: target blkg_stat | ||
348 | * @val: value to add | ||
349 | * | ||
350 | * Add @val to @stat. The caller is responsible for synchronizing calls to | ||
351 | * this function. | ||
352 | */ | ||
353 | static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val) | ||
354 | { | ||
355 | u64_stats_update_begin(&stat->syncp); | ||
356 | stat->cnt += val; | ||
357 | u64_stats_update_end(&stat->syncp); | ||
358 | } | ||
359 | |||
360 | /** | ||
361 | * blkg_stat_read - read the current value of a blkg_stat | ||
362 | * @stat: blkg_stat to read | ||
363 | * | ||
364 | * Read the current value of @stat. This function can be called without | ||
365 | * synchronization and takes care of u64 atomicity. | ||
366 | */ | ||
367 | static inline uint64_t blkg_stat_read(struct blkg_stat *stat) | ||
368 | { | ||
369 | unsigned int start; | ||
370 | uint64_t v; | ||
371 | |||
372 | do { | ||
373 | start = u64_stats_fetch_begin(&stat->syncp); | ||
374 | v = stat->cnt; | ||
375 | } while (u64_stats_fetch_retry(&stat->syncp, start)); | ||
376 | |||
377 | return v; | ||
378 | } | ||
379 | |||
380 | /** | ||
381 | * blkg_stat_reset - reset a blkg_stat | ||
382 | * @stat: blkg_stat to reset | ||
383 | */ | ||
384 | static inline void blkg_stat_reset(struct blkg_stat *stat) | ||
385 | { | ||
386 | stat->cnt = 0; | ||
387 | } | ||
388 | 153 | ||
389 | /** | 154 | /* Per cpu blkio group stats */ |
390 | * blkg_rwstat_add - add a value to a blkg_rwstat | 155 | struct blkio_group_stats_cpu { |
391 | * @rwstat: target blkg_rwstat | 156 | uint64_t sectors; |
392 | * @rw: mask of REQ_{WRITE|SYNC} | 157 | uint64_t stat_arr_cpu[BLKIO_STAT_CPU_NR][BLKIO_STAT_TOTAL]; |
393 | * @val: value to add | 158 | struct u64_stats_sync syncp; |
394 | * | 159 | }; |
395 | * Add @val to @rwstat. The counters are chosen according to @rw. The | ||
396 | * caller is responsible for synchronizing calls to this function. | ||
397 | */ | ||
398 | static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, | ||
399 | int rw, uint64_t val) | ||
400 | { | ||
401 | u64_stats_update_begin(&rwstat->syncp); | ||
402 | |||
403 | if (rw & REQ_WRITE) | ||
404 | rwstat->cnt[BLKG_RWSTAT_WRITE] += val; | ||
405 | else | ||
406 | rwstat->cnt[BLKG_RWSTAT_READ] += val; | ||
407 | if (rw & REQ_SYNC) | ||
408 | rwstat->cnt[BLKG_RWSTAT_SYNC] += val; | ||
409 | else | ||
410 | rwstat->cnt[BLKG_RWSTAT_ASYNC] += val; | ||
411 | |||
412 | u64_stats_update_end(&rwstat->syncp); | ||
413 | } | ||
414 | 160 | ||
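Built on the two primitives above, a policy's per-blkg counters end up looking roughly like this; example_stats and example_account_completion are hypothetical:

struct example_stats {
        struct blkg_stat        service_time;   /* single u64 counter */
        struct blkg_rwstat      serviced;       /* split by read/write and sync/async */
};

static void example_account_completion(struct example_stats *stats, int rw,
                                        u64 service_ns)
{
        /* the caller provides its own serialization, as required above */
        blkg_stat_add(&stats->service_time, service_ns);
        blkg_rwstat_add(&stats->serviced, rw, 1);
}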
415 | /** | 161 | struct blkio_group { |
416 | * blkg_rwstat_read - read the current values of a blkg_rwstat | 162 | /* An rcu protected unique identifier for the group */ |
417 | * @rwstat: blkg_rwstat to read | 163 | void *key; |
418 | * | 164 | struct hlist_node blkcg_node; |
419 | * Read the current snapshot of @rwstat and return it as the return value. | 165 | unsigned short blkcg_id; |
420 | * This function can be called without synchronization and takes care of | 166 | /* Store cgroup path */ |
421 | * u64 atomicity. | 167 | char path[128]; |
422 | */ | 168 | /* The device MKDEV(major, minor), this group has been created for */ |
423 | static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat) | 169 | dev_t dev; |
424 | { | 170 | /* policy which owns this blk group */ |
425 | unsigned int start; | 171 | enum blkio_policy_id plid; |
426 | struct blkg_rwstat tmp; | 172 | |
173 | /* Need to serialize the stats in the case of reset/update */ | ||
174 | spinlock_t stats_lock; | ||
175 | struct blkio_group_stats stats; | ||
176 | /* Per cpu stats pointer */ | ||
177 | struct blkio_group_stats_cpu __percpu *stats_cpu; | ||
178 | }; | ||
427 | 179 | ||
428 | do { | 180 | struct blkio_policy_node { |
429 | start = u64_stats_fetch_begin(&rwstat->syncp); | 181 | struct list_head node; |
430 | tmp = *rwstat; | 182 | dev_t dev; |
431 | } while (u64_stats_fetch_retry(&rwstat->syncp, start)); | 183 | /* This node belongs to max bw policy or proportional weight policy */ |
184 | enum blkio_policy_id plid; | ||
185 | /* cgroup file to which this rule belongs to */ | ||
186 | int fileid; | ||
187 | |||
188 | union { | ||
189 | unsigned int weight; | ||
190 | /* | ||
191 | * Rate read/write in terms of bytes per second | ||
192 | * Whether this rate represents read or write is determined | ||
193 | * by file type "fileid". | ||
194 | */ | ||
195 | u64 bps; | ||
196 | unsigned int iops; | ||
197 | } val; | ||
198 | }; | ||
432 | 199 | ||
433 | return tmp; | 200 | extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, |
434 | } | 201 | dev_t dev); |
202 | extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, | ||
203 | dev_t dev); | ||
204 | extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, | ||
205 | dev_t dev); | ||
206 | extern unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, | ||
207 | dev_t dev); | ||
208 | extern unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, | ||
209 | dev_t dev); | ||
210 | |||
211 | typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); | ||
212 | |||
213 | typedef void (blkio_update_group_weight_fn) (void *key, | ||
214 | struct blkio_group *blkg, unsigned int weight); | ||
215 | typedef void (blkio_update_group_read_bps_fn) (void * key, | ||
216 | struct blkio_group *blkg, u64 read_bps); | ||
217 | typedef void (blkio_update_group_write_bps_fn) (void *key, | ||
218 | struct blkio_group *blkg, u64 write_bps); | ||
219 | typedef void (blkio_update_group_read_iops_fn) (void *key, | ||
220 | struct blkio_group *blkg, unsigned int read_iops); | ||
221 | typedef void (blkio_update_group_write_iops_fn) (void *key, | ||
222 | struct blkio_group *blkg, unsigned int write_iops); | ||
223 | |||
224 | struct blkio_policy_ops { | ||
225 | blkio_unlink_group_fn *blkio_unlink_group_fn; | ||
226 | blkio_update_group_weight_fn *blkio_update_group_weight_fn; | ||
227 | blkio_update_group_read_bps_fn *blkio_update_group_read_bps_fn; | ||
228 | blkio_update_group_write_bps_fn *blkio_update_group_write_bps_fn; | ||
229 | blkio_update_group_read_iops_fn *blkio_update_group_read_iops_fn; | ||
230 | blkio_update_group_write_iops_fn *blkio_update_group_write_iops_fn; | ||
231 | }; | ||
435 | 232 | ||
436 | /** | 233 | struct blkio_policy_type { |
437 | * blkg_rwstat_sum - read the total count of a blkg_rwstat | 234 | struct list_head list; |
438 | * @rwstat: blkg_rwstat to read | 235 | struct blkio_policy_ops ops; |
439 | * | 236 | enum blkio_policy_id plid; |
440 | * Return the total count of @rwstat regardless of the IO direction. This | 237 | }; |
441 | * function can be called without synchronization and takes care of u64 | ||
442 | * atomicity. | ||
443 | */ | ||
444 | static inline uint64_t blkg_rwstat_sum(struct blkg_rwstat *rwstat) | ||
445 | { | ||
446 | struct blkg_rwstat tmp = blkg_rwstat_read(rwstat); | ||
447 | 238 | ||
448 | return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; | 239 | /* Blkio controller policy registration */ |
449 | } | 240 | extern void blkio_policy_register(struct blkio_policy_type *); |
241 | extern void blkio_policy_unregister(struct blkio_policy_type *); | ||
450 | 242 | ||
451 | /** | 243 | static inline char *blkg_path(struct blkio_group *blkg) |
452 | * blkg_rwstat_reset - reset a blkg_rwstat | ||
453 | * @rwstat: blkg_rwstat to reset | ||
454 | */ | ||
455 | static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) | ||
456 | { | 244 | { |
457 | memset(rwstat->cnt, 0, sizeof(rwstat->cnt)); | 245 | return blkg->path; |
458 | } | 246 | } |
459 | 247 | ||
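As a rough illustration of how a blkcg policy could drive the blkg_rwstat helpers above, here is a minimal sketch; the struct and function names are invented, and it assumes the usual kernel headers plus this header ("blk-cgroup.h") are available.

#include "blk-cgroup.h"		/* blkg_rwstat helpers (assumed include path) */

/* Hypothetical per-group counters kept by a policy. */
struct my_pd_stats {
	struct blkg_rwstat serviced_bytes;	/* split into READ/WRITE/SYNC/ASYNC */
};

/* Account one serviced bio; the caller serializes updates as documented. */
static void my_pd_account(struct my_pd_stats *stats, int rw, uint64_t bytes)
{
	blkg_rwstat_add(&stats->serviced_bytes, rw, bytes);
}

/* Report READ + WRITE bytes; safe to call without the update-side lock. */
static uint64_t my_pd_total_bytes(struct my_pd_stats *stats)
{
	return blkg_rwstat_sum(&stats->serviced_bytes);
}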
460 | #else /* CONFIG_BLK_CGROUP */ | 248 | #else |
461 | |||
462 | struct cgroup; | ||
463 | struct blkcg; | ||
464 | 249 | ||
465 | struct blkg_policy_data { | 250 | struct blkio_group { |
466 | }; | 251 | }; |
467 | 252 | ||
468 | struct blkcg_gq { | 253 | struct blkio_policy_type { |
469 | }; | 254 | }; |
470 | 255 | ||
471 | struct blkcg_policy { | 256 | static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { } |
472 | }; | 257 | static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { } |
473 | 258 | ||
474 | static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } | 259 | static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } |
475 | static inline int blkcg_init_queue(struct request_queue *q) { return 0; } | 260 | |
476 | static inline void blkcg_drain_queue(struct request_queue *q) { } | 261 | #endif |
477 | static inline void blkcg_exit_queue(struct request_queue *q) { } | 262 | |
478 | static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } | 263 | #define BLKIO_WEIGHT_MIN 10 |
479 | static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } | 264 | #define BLKIO_WEIGHT_MAX 1000 |
480 | static inline int blkcg_activate_policy(struct request_queue *q, | 265 | #define BLKIO_WEIGHT_DEFAULT 500 |
481 | const struct blkcg_policy *pol) { return 0; } | 266 | |
482 | static inline void blkcg_deactivate_policy(struct request_queue *q, | 267 | #ifdef CONFIG_DEBUG_BLK_CGROUP |
483 | const struct blkcg_policy *pol) { } | 268 | void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg); |
484 | 269 | void blkiocg_update_dequeue_stats(struct blkio_group *blkg, | |
485 | static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; } | 270 | unsigned long dequeue); |
486 | static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } | 271 | void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg); |
487 | 272 | void blkiocg_update_idle_time_stats(struct blkio_group *blkg); | |
488 | static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, | 273 | void blkiocg_set_start_empty_time(struct blkio_group *blkg); |
489 | struct blkcg_policy *pol) { return NULL; } | 274 | |
490 | static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } | 275 | #define BLKG_FLAG_FNS(name) \ |
491 | static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } | 276 | static inline void blkio_mark_blkg_##name( \ |
492 | static inline void blkg_get(struct blkcg_gq *blkg) { } | 277 | struct blkio_group_stats *stats) \ |
493 | static inline void blkg_put(struct blkcg_gq *blkg) { } | 278 | { \ |
494 | 279 | stats->flags |= (1 << BLKG_##name); \ | |
495 | static inline struct request_list *blk_get_rl(struct request_queue *q, | 280 | } \ |
496 | struct bio *bio) { return &q->root_rl; } | 281 | static inline void blkio_clear_blkg_##name( \ |
497 | static inline void blk_put_rl(struct request_list *rl) { } | 282 | struct blkio_group_stats *stats) \ |
498 | static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { } | 283 | { \ |
499 | static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; } | 284 | stats->flags &= ~(1 << BLKG_##name); \ |
500 | 285 | } \ | |
501 | #define blk_queue_for_each_rl(rl, q) \ | 286 | static inline int blkio_blkg_##name(struct blkio_group_stats *stats) \ |
502 | for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) | 287 | { \ |
503 | 288 | return (stats->flags & (1 << BLKG_##name)) != 0; \ | |
504 | #endif /* CONFIG_BLK_CGROUP */ | 289 | } \ |
505 | #endif /* _BLK_CGROUP_H */ | 290 | |
291 | BLKG_FLAG_FNS(waiting) | ||
292 | BLKG_FLAG_FNS(idling) | ||
293 | BLKG_FLAG_FNS(empty) | ||
294 | #undef BLKG_FLAG_FNS | ||
295 | #else | ||
296 | static inline void blkiocg_update_avg_queue_size_stats( | ||
297 | struct blkio_group *blkg) {} | ||
298 | static inline void blkiocg_update_dequeue_stats(struct blkio_group *blkg, | ||
299 | unsigned long dequeue) {} | ||
300 | static inline void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) | ||
301 | {} | ||
302 | static inline void blkiocg_update_idle_time_stats(struct blkio_group *blkg) {} | ||
303 | static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {} | ||
304 | #endif | ||
305 | |||
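For orientation, BLKG_FLAG_FNS(waiting) above expands to blkio_mark_blkg_waiting(), blkio_clear_blkg_waiting() and blkio_blkg_waiting(); a stats-update path might use them roughly like this (a sketch only, assuming CONFIG_DEBUG_BLK_CGROUP is set and that the caller already holds blkg->stats_lock).

static void example_mark_group_waiting(struct blkio_group *blkg)
{
	struct blkio_group_stats *stats = &blkg->stats;

	if (!blkio_blkg_waiting(stats))		/* flag not yet set? */
		blkio_mark_blkg_waiting(stats);	/* set the waiting bit */
}

static void example_clear_group_waiting(struct blkio_group *blkg)
{
	blkio_clear_blkg_waiting(&blkg->stats);	/* clear the waiting bit */
}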
306 | #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) | ||
307 | extern struct blkio_cgroup blkio_root_cgroup; | ||
308 | extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); | ||
309 | extern struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk); | ||
310 | extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, | ||
311 | struct blkio_group *blkg, void *key, dev_t dev, | ||
312 | enum blkio_policy_id plid); | ||
313 | extern int blkio_alloc_blkg_stats(struct blkio_group *blkg); | ||
314 | extern int blkiocg_del_blkio_group(struct blkio_group *blkg); | ||
315 | extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, | ||
316 | void *key); | ||
317 | void blkiocg_update_timeslice_used(struct blkio_group *blkg, | ||
318 | unsigned long time, | ||
319 | unsigned long unaccounted_time); | ||
320 | void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes, | ||
321 | bool direction, bool sync); | ||
322 | void blkiocg_update_completion_stats(struct blkio_group *blkg, | ||
323 | uint64_t start_time, uint64_t io_start_time, bool direction, bool sync); | ||
324 | void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, | ||
325 | bool sync); | ||
326 | void blkiocg_update_io_add_stats(struct blkio_group *blkg, | ||
327 | struct blkio_group *curr_blkg, bool direction, bool sync); | ||
328 | void blkiocg_update_io_remove_stats(struct blkio_group *blkg, | ||
329 | bool direction, bool sync); | ||
330 | #else | ||
331 | struct cgroup; | ||
332 | static inline struct blkio_cgroup * | ||
333 | cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } | ||
334 | static inline struct blkio_cgroup * | ||
335 | task_blkio_cgroup(struct task_struct *tsk) { return NULL; } | ||
336 | |||
337 | static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, | ||
338 | struct blkio_group *blkg, void *key, dev_t dev, | ||
339 | enum blkio_policy_id plid) {} | ||
340 | |||
341 | static inline int blkio_alloc_blkg_stats(struct blkio_group *blkg) { return 0; } | ||
342 | |||
343 | static inline int | ||
344 | blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } | ||
345 | |||
346 | static inline struct blkio_group * | ||
347 | blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } | ||
348 | static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg, | ||
349 | unsigned long time, | ||
350 | unsigned long unaccounted_time) | ||
351 | {} | ||
352 | static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg, | ||
353 | uint64_t bytes, bool direction, bool sync) {} | ||
354 | static inline void blkiocg_update_completion_stats(struct blkio_group *blkg, | ||
355 | uint64_t start_time, uint64_t io_start_time, bool direction, | ||
356 | bool sync) {} | ||
357 | static inline void blkiocg_update_io_merged_stats(struct blkio_group *blkg, | ||
358 | bool direction, bool sync) {} | ||
359 | static inline void blkiocg_update_io_add_stats(struct blkio_group *blkg, | ||
360 | struct blkio_group *curr_blkg, bool direction, bool sync) {} | ||
361 | static inline void blkiocg_update_io_remove_stats(struct blkio_group *blkg, | ||
362 | bool direction, bool sync) {} | ||
363 | #endif | ||
364 | #endif /* _BLK_CGROUP_H */ | ||
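To make the registration interface above concrete, a policy in the CFQ group scheduling or blk-throttle mold would plug in roughly as follows; the callback bodies are placeholders, and BLKIO_POLICY_PROP is assumed from enum blkio_policy_id, whose values are not visible in this hunk.

static void my_unlink_group(void *key, struct blkio_group *blkg)
{
	/* drop the scheduler's reference to this group */
}

static void my_update_weight(void *key, struct blkio_group *blkg,
			     unsigned int weight)
{
	/* propagate the new cgroup weight into the scheduler's group data */
}

static struct blkio_policy_type my_blkio_policy = {
	.ops = {
		.blkio_unlink_group_fn		= my_unlink_group,
		.blkio_update_group_weight_fn	= my_update_weight,
	},
	.plid = BLKIO_POLICY_PROP,		/* assumed policy id */
};

/* paired in module init/exit:
 *	blkio_policy_register(&my_blkio_policy);
 *	blkio_policy_unregister(&my_blkio_policy);
 */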
diff --git a/block/blk-core.c b/block/blk-core.c index c973249d68c..8fc4ae28a19 100644 --- a/block/blk-core.c +++ b/block/blk-core.c | |||
@@ -28,21 +28,17 @@ | |||
28 | #include <linux/task_io_accounting_ops.h> | 28 | #include <linux/task_io_accounting_ops.h> |
29 | #include <linux/fault-inject.h> | 29 | #include <linux/fault-inject.h> |
30 | #include <linux/list_sort.h> | 30 | #include <linux/list_sort.h> |
31 | #include <linux/delay.h> | ||
32 | #include <linux/ratelimit.h> | ||
33 | 31 | ||
34 | #define CREATE_TRACE_POINTS | 32 | #define CREATE_TRACE_POINTS |
35 | #include <trace/events/block.h> | 33 | #include <trace/events/block.h> |
36 | 34 | ||
37 | #include "blk.h" | 35 | #include "blk.h" |
38 | #include "blk-cgroup.h" | ||
39 | 36 | ||
40 | EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); | 37 | EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); |
41 | EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); | 38 | EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); |
42 | EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); | 39 | EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); |
43 | EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug); | ||
44 | 40 | ||
45 | DEFINE_IDA(blk_queue_ida); | 41 | static int __make_request(struct request_queue *q, struct bio *bio); |
46 | 42 | ||
47 | /* | 43 | /* |
48 | * For the allocated request tables | 44 | * For the allocated request tables |
@@ -220,13 +216,12 @@ static void blk_delay_work(struct work_struct *work) | |||
220 | * Description: | 216 | * Description: |
221 | * Sometimes queueing needs to be postponed for a little while, to allow | 217 | * Sometimes queueing needs to be postponed for a little while, to allow |
222 | * resources to come back. This function will make sure that queueing is | 218 | * resources to come back. This function will make sure that queueing is |
223 | * restarted around the specified time. Queue lock must be held. | 219 | * restarted around the specified time. |
224 | */ | 220 | */ |
225 | void blk_delay_queue(struct request_queue *q, unsigned long msecs) | 221 | void blk_delay_queue(struct request_queue *q, unsigned long msecs) |
226 | { | 222 | { |
227 | if (likely(!blk_queue_dead(q))) | 223 | queue_delayed_work(kblockd_workqueue, &q->delay_work, |
228 | queue_delayed_work(kblockd_workqueue, &q->delay_work, | 224 | msecs_to_jiffies(msecs)); |
229 | msecs_to_jiffies(msecs)); | ||
230 | } | 225 | } |
231 | EXPORT_SYMBOL(blk_delay_queue); | 226 | EXPORT_SYMBOL(blk_delay_queue); |
232 | 227 | ||
@@ -264,7 +259,7 @@ EXPORT_SYMBOL(blk_start_queue); | |||
264 | **/ | 259 | **/ |
265 | void blk_stop_queue(struct request_queue *q) | 260 | void blk_stop_queue(struct request_queue *q) |
266 | { | 261 | { |
267 | cancel_delayed_work(&q->delay_work); | 262 | __cancel_delayed_work(&q->delay_work); |
268 | queue_flag_set(QUEUE_FLAG_STOPPED, q); | 263 | queue_flag_set(QUEUE_FLAG_STOPPED, q); |
269 | } | 264 | } |
270 | EXPORT_SYMBOL(blk_stop_queue); | 265 | EXPORT_SYMBOL(blk_stop_queue); |
@@ -284,7 +279,7 @@ EXPORT_SYMBOL(blk_stop_queue); | |||
284 | * | 279 | * |
285 | * This function does not cancel any asynchronous activity arising | 280 | * This function does not cancel any asynchronous activity arising |
286 | * out of elevator or throttling code. That would require elevator_exit() | 281 | * out of elevator or throttling code. That would require elevator_exit() |
287 | * and blkcg_exit_queue() to be called with queue lock initialized. | 282 | * and blk_throtl_exit() to be called with queue lock initialized. |
288 | * | 283 | * |
289 | */ | 284 | */ |
290 | void blk_sync_queue(struct request_queue *q) | 285 | void blk_sync_queue(struct request_queue *q) |
@@ -295,34 +290,6 @@ void blk_sync_queue(struct request_queue *q) | |||
295 | EXPORT_SYMBOL(blk_sync_queue); | 290 | EXPORT_SYMBOL(blk_sync_queue); |
296 | 291 | ||
297 | /** | 292 | /** |
298 | * __blk_run_queue_uncond - run a queue whether or not it has been stopped | ||
299 | * @q: The queue to run | ||
300 | * | ||
301 | * Description: | ||
302 | * Invoke request handling on a queue if there are any pending requests. | ||
303 | * May be used to restart request handling after a request has completed. | ||
304 | * This variant runs the queue whether or not the queue has been | ||
305 | * stopped. Must be called with the queue lock held and interrupts | ||
306 | * disabled. See also @blk_run_queue. | ||
307 | */ | ||
308 | inline void __blk_run_queue_uncond(struct request_queue *q) | ||
309 | { | ||
310 | if (unlikely(blk_queue_dead(q))) | ||
311 | return; | ||
312 | |||
313 | /* | ||
314 | * Some request_fn implementations, e.g. scsi_request_fn(), unlock | ||
315 | * the queue lock internally. As a result multiple threads may be | ||
316 | * running such a request function concurrently. Keep track of the | ||
317 | * number of active request_fn invocations such that blk_drain_queue() | ||
318 | * can wait until all these request_fn calls have finished. | ||
319 | */ | ||
320 | q->request_fn_active++; | ||
321 | q->request_fn(q); | ||
322 | q->request_fn_active--; | ||
323 | } | ||
324 | |||
325 | /** | ||
326 | * __blk_run_queue - run a single device queue | 293 | * __blk_run_queue - run a single device queue |
327 | * @q: The queue to run | 294 | * @q: The queue to run |
328 | * | 295 | * |
@@ -335,7 +302,7 @@ void __blk_run_queue(struct request_queue *q) | |||
335 | if (unlikely(blk_queue_stopped(q))) | 302 | if (unlikely(blk_queue_stopped(q))) |
336 | return; | 303 | return; |
337 | 304 | ||
338 | __blk_run_queue_uncond(q); | 305 | q->request_fn(q); |
339 | } | 306 | } |
340 | EXPORT_SYMBOL(__blk_run_queue); | 307 | EXPORT_SYMBOL(__blk_run_queue); |
341 | 308 | ||
@@ -345,12 +312,14 @@ EXPORT_SYMBOL(__blk_run_queue); | |||
345 | * | 312 | * |
346 | * Description: | 313 | * Description: |
347 | * Tells kblockd to perform the equivalent of @blk_run_queue on behalf | 314 | * Tells kblockd to perform the equivalent of @blk_run_queue on behalf |
348 | * of us. The caller must hold the queue lock. | 315 | * of us. |
349 | */ | 316 | */ |
350 | void blk_run_queue_async(struct request_queue *q) | 317 | void blk_run_queue_async(struct request_queue *q) |
351 | { | 318 | { |
352 | if (likely(!blk_queue_stopped(q) && !blk_queue_dead(q))) | 319 | if (likely(!blk_queue_stopped(q))) { |
353 | mod_delayed_work(kblockd_workqueue, &q->delay_work, 0); | 320 | __cancel_delayed_work(&q->delay_work); |
321 | queue_delayed_work(kblockd_workqueue, &q->delay_work, 0); | ||
322 | } | ||
354 | } | 323 | } |
355 | EXPORT_SYMBOL(blk_run_queue_async); | 324 | EXPORT_SYMBOL(blk_run_queue_async); |
356 | 325 | ||
@@ -378,219 +347,59 @@ void blk_put_queue(struct request_queue *q) | |||
378 | } | 347 | } |
379 | EXPORT_SYMBOL(blk_put_queue); | 348 | EXPORT_SYMBOL(blk_put_queue); |
380 | 349 | ||
381 | /** | 350 | /* |
382 | * __blk_drain_queue - drain requests from request_queue | 351 | * Note: If a driver supplied the queue lock, it is disconnected |
383 | * @q: queue to drain | 352 | * by this function. The actual state of the lock doesn't matter |
384 | * @drain_all: whether to drain all requests or only the ones w/ ELVPRIV | 353 | * here as the request_queue isn't accessible after this point |
385 | * | 354 | * (QUEUE_FLAG_DEAD is set) and no other requests will be queued. |
386 | * Drain requests from @q. If @drain_all is set, all requests are drained. | ||
387 | * If not, only ELVPRIV requests are drained. The caller is responsible | ||
388 | * for ensuring that no new requests which need to be drained are queued. | ||
389 | */ | ||
390 | static void __blk_drain_queue(struct request_queue *q, bool drain_all) | ||
391 | __releases(q->queue_lock) | ||
392 | __acquires(q->queue_lock) | ||
393 | { | ||
394 | int i; | ||
395 | |||
396 | lockdep_assert_held(q->queue_lock); | ||
397 | |||
398 | while (true) { | ||
399 | bool drain = false; | ||
400 | |||
401 | /* | ||
402 | * The caller might be trying to drain @q before its | ||
403 | * elevator is initialized. | ||
404 | */ | ||
405 | if (q->elevator) | ||
406 | elv_drain_elevator(q); | ||
407 | |||
408 | blkcg_drain_queue(q); | ||
409 | |||
410 | /* | ||
411 | * This function might be called on a queue which failed | ||
412 | * driver init after queue creation or is not yet fully | ||
413 | * active. Some drivers (e.g. fd and loop) get unhappy | ||
414 | * in such cases. Kick queue iff dispatch queue has | ||
415 | * something on it and @q has request_fn set. | ||
416 | */ | ||
417 | if (!list_empty(&q->queue_head) && q->request_fn) | ||
418 | __blk_run_queue(q); | ||
419 | |||
420 | drain |= q->nr_rqs_elvpriv; | ||
421 | drain |= q->request_fn_active; | ||
422 | |||
423 | /* | ||
424 | * Unfortunately, requests are queued at and tracked from | ||
425 | * multiple places and there's no single counter which can | ||
426 | * be drained. Check all the queues and counters. | ||
427 | */ | ||
428 | if (drain_all) { | ||
429 | drain |= !list_empty(&q->queue_head); | ||
430 | for (i = 0; i < 2; i++) { | ||
431 | drain |= q->nr_rqs[i]; | ||
432 | drain |= q->in_flight[i]; | ||
433 | drain |= !list_empty(&q->flush_queue[i]); | ||
434 | } | ||
435 | } | ||
436 | |||
437 | if (!drain) | ||
438 | break; | ||
439 | |||
440 | spin_unlock_irq(q->queue_lock); | ||
441 | |||
442 | msleep(10); | ||
443 | |||
444 | spin_lock_irq(q->queue_lock); | ||
445 | } | ||
446 | |||
447 | /* | ||
448 | * With queue marked dead, any woken up waiter will fail the | ||
449 | * allocation path, so the wakeup chaining is lost and we're | ||
450 | * left with hung waiters. We need to wake up those waiters. | ||
451 | */ | ||
452 | if (q->request_fn) { | ||
453 | struct request_list *rl; | ||
454 | |||
455 | blk_queue_for_each_rl(rl, q) | ||
456 | for (i = 0; i < ARRAY_SIZE(rl->wait); i++) | ||
457 | wake_up_all(&rl->wait[i]); | ||
458 | } | ||
459 | } | ||
460 | |||
461 | /** | ||
462 | * blk_queue_bypass_start - enter queue bypass mode | ||
463 | * @q: queue of interest | ||
464 | * | ||
465 | * In bypass mode, only the dispatch FIFO queue of @q is used. This | ||
466 | * function makes @q enter bypass mode and drains all requests which were | ||
467 | * throttled or issued before. On return, it's guaranteed that no request | ||
468 | * is being throttled or has ELVPRIV set, and blk_queue_bypass() returns %true | ||
469 | * inside queue or RCU read lock. | ||
470 | */ | ||
471 | void blk_queue_bypass_start(struct request_queue *q) | ||
472 | { | ||
473 | bool drain; | ||
474 | |||
475 | spin_lock_irq(q->queue_lock); | ||
476 | drain = !q->bypass_depth++; | ||
477 | queue_flag_set(QUEUE_FLAG_BYPASS, q); | ||
478 | spin_unlock_irq(q->queue_lock); | ||
479 | |||
480 | if (drain) { | ||
481 | spin_lock_irq(q->queue_lock); | ||
482 | __blk_drain_queue(q, false); | ||
483 | spin_unlock_irq(q->queue_lock); | ||
484 | |||
485 | /* ensure blk_queue_bypass() is %true inside RCU read lock */ | ||
486 | synchronize_rcu(); | ||
487 | } | ||
488 | } | ||
489 | EXPORT_SYMBOL_GPL(blk_queue_bypass_start); | ||
490 | |||
491 | /** | ||
492 | * blk_queue_bypass_end - leave queue bypass mode | ||
493 | * @q: queue of interest | ||
494 | * | ||
495 | * Leave bypass mode and restore the normal queueing behavior. | ||
496 | */ | ||
497 | void blk_queue_bypass_end(struct request_queue *q) | ||
498 | { | ||
499 | spin_lock_irq(q->queue_lock); | ||
500 | if (!--q->bypass_depth) | ||
501 | queue_flag_clear(QUEUE_FLAG_BYPASS, q); | ||
502 | WARN_ON_ONCE(q->bypass_depth < 0); | ||
503 | spin_unlock_irq(q->queue_lock); | ||
504 | } | ||
505 | EXPORT_SYMBOL_GPL(blk_queue_bypass_end); | ||
506 | |||
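The intended usage pattern for the bypass pair above is to bracket a change that must not race with in-flight ELVPRIV or throttled requests, for example activating per-queue policy data; a sketch with a placeholder update step:

static void my_switch_queue_policy(struct request_queue *q)
{
	blk_queue_bypass_start(q);	/* drain ELVPRIV/throttled requests */

	/* ... install or tear down per-queue policy data here ... */

	blk_queue_bypass_end(q);	/* resume normal queueing */
}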
507 | /** | ||
508 | * blk_cleanup_queue - shutdown a request queue | ||
509 | * @q: request queue to shutdown | ||
510 | * | ||
511 | * Mark @q DYING, drain all pending requests, mark @q DEAD, destroy and | ||
512 | * put it. All future requests will be failed immediately with -ENODEV. | ||
513 | */ | 355 | */ |
514 | void blk_cleanup_queue(struct request_queue *q) | 356 | void blk_cleanup_queue(struct request_queue *q) |
515 | { | 357 | { |
516 | spinlock_t *lock = q->queue_lock; | ||
517 | |||
518 | /* mark @q DYING, no new request or merges will be allowed afterwards */ | ||
519 | mutex_lock(&q->sysfs_lock); | ||
520 | queue_flag_set_unlocked(QUEUE_FLAG_DYING, q); | ||
521 | spin_lock_irq(lock); | ||
522 | |||
523 | /* | ||
524 | * A dying queue is permanently in bypass mode till released. Note | ||
525 | * that, unlike blk_queue_bypass_start(), we aren't performing | ||
526 | * synchronize_rcu() after entering bypass mode to avoid the delay | ||
527 | * as some drivers create and destroy a lot of queues while | ||
528 | * probing. This is still safe because blk_release_queue() will be | ||
529 | * called only after the queue refcnt drops to zero and nothing, | ||
530 | * RCU or not, would be traversing the queue by then. | ||
531 | */ | ||
532 | q->bypass_depth++; | ||
533 | queue_flag_set(QUEUE_FLAG_BYPASS, q); | ||
534 | |||
535 | queue_flag_set(QUEUE_FLAG_NOMERGES, q); | ||
536 | queue_flag_set(QUEUE_FLAG_NOXMERGES, q); | ||
537 | queue_flag_set(QUEUE_FLAG_DYING, q); | ||
538 | spin_unlock_irq(lock); | ||
539 | mutex_unlock(&q->sysfs_lock); | ||
540 | |||
541 | /* | 358 | /* |
542 | * Drain all requests queued before DYING marking. Set DEAD flag to | 359 | * We know we have process context here, so we can be a little |
543 | * prevent q->request_fn() from being invoked after draining finishes. | 360 | * cautious and ensure that pending block actions on this device |
361 | * are done before moving on. Going into this function, we should | ||
362 | * not have processes doing IO to this device. | ||
544 | */ | 363 | */ |
545 | spin_lock_irq(lock); | 364 | blk_sync_queue(q); |
546 | __blk_drain_queue(q, true); | ||
547 | queue_flag_set(QUEUE_FLAG_DEAD, q); | ||
548 | spin_unlock_irq(lock); | ||
549 | 365 | ||
550 | /* @q won't process any more request, flush async actions */ | ||
551 | del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); | 366 | del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); |
552 | blk_sync_queue(q); | 367 | mutex_lock(&q->sysfs_lock); |
368 | queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q); | ||
369 | mutex_unlock(&q->sysfs_lock); | ||
553 | 370 | ||
554 | spin_lock_irq(lock); | ||
555 | if (q->queue_lock != &q->__queue_lock) | 371 | if (q->queue_lock != &q->__queue_lock) |
556 | q->queue_lock = &q->__queue_lock; | 372 | q->queue_lock = &q->__queue_lock; |
557 | spin_unlock_irq(lock); | ||
558 | 373 | ||
559 | /* @q is and will stay empty, shutdown and put */ | ||
560 | blk_put_queue(q); | 374 | blk_put_queue(q); |
561 | } | 375 | } |
562 | EXPORT_SYMBOL(blk_cleanup_queue); | 376 | EXPORT_SYMBOL(blk_cleanup_queue); |
563 | 377 | ||
564 | int blk_init_rl(struct request_list *rl, struct request_queue *q, | 378 | static int blk_init_free_list(struct request_queue *q) |
565 | gfp_t gfp_mask) | ||
566 | { | 379 | { |
380 | struct request_list *rl = &q->rq; | ||
381 | |||
567 | if (unlikely(rl->rq_pool)) | 382 | if (unlikely(rl->rq_pool)) |
568 | return 0; | 383 | return 0; |
569 | 384 | ||
570 | rl->q = q; | ||
571 | rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0; | 385 | rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0; |
572 | rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0; | 386 | rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0; |
387 | rl->elvpriv = 0; | ||
573 | init_waitqueue_head(&rl->wait[BLK_RW_SYNC]); | 388 | init_waitqueue_head(&rl->wait[BLK_RW_SYNC]); |
574 | init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]); | 389 | init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]); |
575 | 390 | ||
576 | rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, | 391 | rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, |
577 | mempool_free_slab, request_cachep, | 392 | mempool_free_slab, request_cachep, q->node); |
578 | gfp_mask, q->node); | 393 | |
579 | if (!rl->rq_pool) | 394 | if (!rl->rq_pool) |
580 | return -ENOMEM; | 395 | return -ENOMEM; |
581 | 396 | ||
582 | return 0; | 397 | return 0; |
583 | } | 398 | } |
584 | 399 | ||
585 | void blk_exit_rl(struct request_list *rl) | ||
586 | { | ||
587 | if (rl->rq_pool) | ||
588 | mempool_destroy(rl->rq_pool); | ||
589 | } | ||
590 | |||
591 | struct request_queue *blk_alloc_queue(gfp_t gfp_mask) | 400 | struct request_queue *blk_alloc_queue(gfp_t gfp_mask) |
592 | { | 401 | { |
593 | return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE); | 402 | return blk_alloc_queue_node(gfp_mask, -1); |
594 | } | 403 | } |
595 | EXPORT_SYMBOL(blk_alloc_queue); | 404 | EXPORT_SYMBOL(blk_alloc_queue); |
596 | 405 | ||
@@ -604,10 +413,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) | |||
604 | if (!q) | 413 | if (!q) |
605 | return NULL; | 414 | return NULL; |
606 | 415 | ||
607 | q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); | ||
608 | if (q->id < 0) | ||
609 | goto fail_q; | ||
610 | |||
611 | q->backing_dev_info.ra_pages = | 416 | q->backing_dev_info.ra_pages = |
612 | (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; | 417 | (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; |
613 | q->backing_dev_info.state = 0; | 418 | q->backing_dev_info.state = 0; |
@@ -616,18 +421,20 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) | |||
616 | q->node = node_id; | 421 | q->node = node_id; |
617 | 422 | ||
618 | err = bdi_init(&q->backing_dev_info); | 423 | err = bdi_init(&q->backing_dev_info); |
619 | if (err) | 424 | if (err) { |
620 | goto fail_id; | 425 | kmem_cache_free(blk_requestq_cachep, q); |
426 | return NULL; | ||
427 | } | ||
428 | |||
429 | if (blk_throtl_init(q)) { | ||
430 | kmem_cache_free(blk_requestq_cachep, q); | ||
431 | return NULL; | ||
432 | } | ||
621 | 433 | ||
622 | setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, | 434 | setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, |
623 | laptop_mode_timer_fn, (unsigned long) q); | 435 | laptop_mode_timer_fn, (unsigned long) q); |
624 | setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); | 436 | setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); |
625 | INIT_LIST_HEAD(&q->queue_head); | ||
626 | INIT_LIST_HEAD(&q->timeout_list); | 437 | INIT_LIST_HEAD(&q->timeout_list); |
627 | INIT_LIST_HEAD(&q->icq_list); | ||
628 | #ifdef CONFIG_BLK_CGROUP | ||
629 | INIT_LIST_HEAD(&q->blkg_list); | ||
630 | #endif | ||
631 | INIT_LIST_HEAD(&q->flush_queue[0]); | 438 | INIT_LIST_HEAD(&q->flush_queue[0]); |
632 | INIT_LIST_HEAD(&q->flush_queue[1]); | 439 | INIT_LIST_HEAD(&q->flush_queue[1]); |
633 | INIT_LIST_HEAD(&q->flush_data_in_flight); | 440 | INIT_LIST_HEAD(&q->flush_data_in_flight); |
@@ -644,25 +451,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) | |||
644 | */ | 451 | */ |
645 | q->queue_lock = &q->__queue_lock; | 452 | q->queue_lock = &q->__queue_lock; |
646 | 453 | ||
647 | /* | ||
648 | * A queue starts its life with bypass turned on to avoid | ||
649 | * unnecessary bypass on/off overhead and nasty surprises during | ||
650 | * init. The initial bypass will be finished when the queue is | ||
651 | * registered by blk_register_queue(). | ||
652 | */ | ||
653 | q->bypass_depth = 1; | ||
654 | __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags); | ||
655 | |||
656 | if (blkcg_init_queue(q)) | ||
657 | goto fail_id; | ||
658 | |||
659 | return q; | 454 | return q; |
660 | |||
661 | fail_id: | ||
662 | ida_simple_remove(&blk_queue_ida, q->id); | ||
663 | fail_q: | ||
664 | kmem_cache_free(blk_requestq_cachep, q); | ||
665 | return NULL; | ||
666 | } | 455 | } |
667 | EXPORT_SYMBOL(blk_alloc_queue_node); | 456 | EXPORT_SYMBOL(blk_alloc_queue_node); |
668 | 457 | ||
@@ -701,7 +490,7 @@ EXPORT_SYMBOL(blk_alloc_queue_node); | |||
701 | 490 | ||
702 | struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) | 491 | struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) |
703 | { | 492 | { |
704 | return blk_init_queue_node(rfn, lock, NUMA_NO_NODE); | 493 | return blk_init_queue_node(rfn, lock, -1); |
705 | } | 494 | } |
706 | EXPORT_SYMBOL(blk_init_queue); | 495 | EXPORT_SYMBOL(blk_init_queue); |
707 | 496 | ||
@@ -729,13 +518,13 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, | |||
729 | if (!q) | 518 | if (!q) |
730 | return NULL; | 519 | return NULL; |
731 | 520 | ||
732 | if (blk_init_rl(&q->root_rl, q, GFP_KERNEL)) | 521 | if (blk_init_free_list(q)) |
733 | return NULL; | 522 | return NULL; |
734 | 523 | ||
735 | q->request_fn = rfn; | 524 | q->request_fn = rfn; |
736 | q->prep_rq_fn = NULL; | 525 | q->prep_rq_fn = NULL; |
737 | q->unprep_rq_fn = NULL; | 526 | q->unprep_rq_fn = NULL; |
738 | q->queue_flags |= QUEUE_FLAG_DEFAULT; | 527 | q->queue_flags = QUEUE_FLAG_DEFAULT; |
739 | 528 | ||
740 | /* Override internal queue lock with supplied lock pointer */ | 529 | /* Override internal queue lock with supplied lock pointer */ |
741 | if (lock) | 530 | if (lock) |
@@ -744,37 +533,61 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, | |||
744 | /* | 533 | /* |
745 | * This also sets hw/phys segments, boundary and size | 534 | * This also sets hw/phys segments, boundary and size |
746 | */ | 535 | */ |
747 | blk_queue_make_request(q, blk_queue_bio); | 536 | blk_queue_make_request(q, __make_request); |
748 | 537 | ||
749 | q->sg_reserved_size = INT_MAX; | 538 | q->sg_reserved_size = INT_MAX; |
750 | 539 | ||
751 | /* init elevator */ | 540 | /* |
752 | if (elevator_init(q, NULL)) | 541 | * all done |
753 | return NULL; | 542 | */ |
754 | return q; | 543 | if (!elevator_init(q, NULL)) { |
544 | blk_queue_congestion_threshold(q); | ||
545 | return q; | ||
546 | } | ||
547 | |||
548 | return NULL; | ||
755 | } | 549 | } |
756 | EXPORT_SYMBOL(blk_init_allocated_queue); | 550 | EXPORT_SYMBOL(blk_init_allocated_queue); |
757 | 551 | ||
758 | bool blk_get_queue(struct request_queue *q) | 552 | int blk_get_queue(struct request_queue *q) |
759 | { | 553 | { |
760 | if (likely(!blk_queue_dying(q))) { | 554 | if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { |
761 | __blk_get_queue(q); | 555 | kobject_get(&q->kobj); |
762 | return true; | 556 | return 0; |
763 | } | 557 | } |
764 | 558 | ||
765 | return false; | 559 | return 1; |
766 | } | 560 | } |
767 | EXPORT_SYMBOL(blk_get_queue); | 561 | EXPORT_SYMBOL(blk_get_queue); |
768 | 562 | ||
769 | static inline void blk_free_request(struct request_list *rl, struct request *rq) | 563 | static inline void blk_free_request(struct request_queue *q, struct request *rq) |
770 | { | 564 | { |
771 | if (rq->cmd_flags & REQ_ELVPRIV) { | 565 | if (rq->cmd_flags & REQ_ELVPRIV) |
772 | elv_put_request(rl->q, rq); | 566 | elv_put_request(q, rq); |
773 | if (rq->elv.icq) | 567 | mempool_free(rq, q->rq.rq_pool); |
774 | put_io_context(rq->elv.icq->ioc); | 568 | } |
569 | |||
570 | static struct request * | ||
571 | blk_alloc_request(struct request_queue *q, int flags, int priv, gfp_t gfp_mask) | ||
572 | { | ||
573 | struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); | ||
574 | |||
575 | if (!rq) | ||
576 | return NULL; | ||
577 | |||
578 | blk_rq_init(q, rq); | ||
579 | |||
580 | rq->cmd_flags = flags | REQ_ALLOCED; | ||
581 | |||
582 | if (priv) { | ||
583 | if (unlikely(elv_set_request(q, rq, gfp_mask))) { | ||
584 | mempool_free(rq, q->rq.rq_pool); | ||
585 | return NULL; | ||
586 | } | ||
587 | rq->cmd_flags |= REQ_ELVPRIV; | ||
775 | } | 588 | } |
776 | 589 | ||
777 | mempool_free(rq, rl->rq_pool); | 590 | return rq; |
778 | } | 591 | } |
779 | 592 | ||
780 | /* | 593 | /* |
@@ -811,23 +624,18 @@ static void ioc_set_batching(struct request_queue *q, struct io_context *ioc) | |||
811 | ioc->last_waited = jiffies; | 624 | ioc->last_waited = jiffies; |
812 | } | 625 | } |
813 | 626 | ||
814 | static void __freed_request(struct request_list *rl, int sync) | 627 | static void __freed_request(struct request_queue *q, int sync) |
815 | { | 628 | { |
816 | struct request_queue *q = rl->q; | 629 | struct request_list *rl = &q->rq; |
817 | 630 | ||
818 | /* | 631 | if (rl->count[sync] < queue_congestion_off_threshold(q)) |
819 | * bdi isn't aware of blkcg yet. As all async IOs end up root | ||
820 | * blkcg anyway, just use root blkcg state. | ||
821 | */ | ||
822 | if (rl == &q->root_rl && | ||
823 | rl->count[sync] < queue_congestion_off_threshold(q)) | ||
824 | blk_clear_queue_congested(q, sync); | 632 | blk_clear_queue_congested(q, sync); |
825 | 633 | ||
826 | if (rl->count[sync] + 1 <= q->nr_requests) { | 634 | if (rl->count[sync] + 1 <= q->nr_requests) { |
827 | if (waitqueue_active(&rl->wait[sync])) | 635 | if (waitqueue_active(&rl->wait[sync])) |
828 | wake_up(&rl->wait[sync]); | 636 | wake_up(&rl->wait[sync]); |
829 | 637 | ||
830 | blk_clear_rl_full(rl, sync); | 638 | blk_clear_queue_full(q, sync); |
831 | } | 639 | } |
832 | } | 640 | } |
833 | 641 | ||
@@ -835,20 +643,18 @@ static void __freed_request(struct request_list *rl, int sync) | |||
835 | * A request has just been released. Account for it, update the full and | 643 | * A request has just been released. Account for it, update the full and |
836 | * congestion status, wake up any waiters. Called under q->queue_lock. | 644 | * congestion status, wake up any waiters. Called under q->queue_lock. |
837 | */ | 645 | */ |
838 | static void freed_request(struct request_list *rl, unsigned int flags) | 646 | static void freed_request(struct request_queue *q, int sync, int priv) |
839 | { | 647 | { |
840 | struct request_queue *q = rl->q; | 648 | struct request_list *rl = &q->rq; |
841 | int sync = rw_is_sync(flags); | ||
842 | 649 | ||
843 | q->nr_rqs[sync]--; | ||
844 | rl->count[sync]--; | 650 | rl->count[sync]--; |
845 | if (flags & REQ_ELVPRIV) | 651 | if (priv) |
846 | q->nr_rqs_elvpriv--; | 652 | rl->elvpriv--; |
847 | 653 | ||
848 | __freed_request(rl, sync); | 654 | __freed_request(q, sync); |
849 | 655 | ||
850 | if (unlikely(rl->starved[sync ^ 1])) | 656 | if (unlikely(rl->starved[sync ^ 1])) |
851 | __freed_request(rl, sync ^ 1); | 657 | __freed_request(q, sync ^ 1); |
852 | } | 658 | } |
853 | 659 | ||
854 | /* | 660 | /* |
@@ -870,49 +676,19 @@ static bool blk_rq_should_init_elevator(struct bio *bio) | |||
870 | return true; | 676 | return true; |
871 | } | 677 | } |
872 | 678 | ||
873 | /** | 679 | /* |
874 | * rq_ioc - determine io_context for request allocation | 680 | * Get a free request, queue_lock must be held. |
875 | * @bio: request being allocated is for this bio (can be %NULL) | 681 | * Returns NULL on failure, with queue_lock held. |
876 | * | 682 | * Returns !NULL on success, with queue_lock *not held*. |
877 | * Determine io_context to use for request allocation for @bio. May return | ||
878 | * %NULL if %current->io_context doesn't exist. | ||
879 | */ | ||
880 | static struct io_context *rq_ioc(struct bio *bio) | ||
881 | { | ||
882 | #ifdef CONFIG_BLK_CGROUP | ||
883 | if (bio && bio->bi_ioc) | ||
884 | return bio->bi_ioc; | ||
885 | #endif | ||
886 | return current->io_context; | ||
887 | } | ||
888 | |||
889 | /** | ||
890 | * __get_request - get a free request | ||
891 | * @rl: request list to allocate from | ||
892 | * @rw_flags: RW and SYNC flags | ||
893 | * @bio: bio to allocate request for (can be %NULL) | ||
894 | * @gfp_mask: allocation mask | ||
895 | * | ||
896 | * Get a free request from @q. This function may fail under memory | ||
897 | * pressure or if @q is dead. | ||
898 | * | ||
899 | * Must be called with @q->queue_lock held and, | ||
900 | * Returns %NULL on failure, with @q->queue_lock held. | ||
901 | * Returns !%NULL on success, with @q->queue_lock *not held*. | ||
902 | */ | 683 | */ |
903 | static struct request *__get_request(struct request_list *rl, int rw_flags, | 684 | static struct request *get_request(struct request_queue *q, int rw_flags, |
904 | struct bio *bio, gfp_t gfp_mask) | 685 | struct bio *bio, gfp_t gfp_mask) |
905 | { | 686 | { |
906 | struct request_queue *q = rl->q; | 687 | struct request *rq = NULL; |
907 | struct request *rq; | 688 | struct request_list *rl = &q->rq; |
908 | struct elevator_type *et = q->elevator->type; | 689 | struct io_context *ioc = NULL; |
909 | struct io_context *ioc = rq_ioc(bio); | ||
910 | struct io_cq *icq = NULL; | ||
911 | const bool is_sync = rw_is_sync(rw_flags) != 0; | 690 | const bool is_sync = rw_is_sync(rw_flags) != 0; |
912 | int may_queue; | 691 | int may_queue, priv = 0; |
913 | |||
914 | if (unlikely(blk_queue_dying(q))) | ||
915 | return NULL; | ||
916 | 692 | ||
917 | may_queue = elv_may_queue(q, rw_flags); | 693 | may_queue = elv_may_queue(q, rw_flags); |
918 | if (may_queue == ELV_MQUEUE_NO) | 694 | if (may_queue == ELV_MQUEUE_NO) |
@@ -920,15 +696,16 @@ static struct request *__get_request(struct request_list *rl, int rw_flags, | |||
920 | 696 | ||
921 | if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) { | 697 | if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) { |
922 | if (rl->count[is_sync]+1 >= q->nr_requests) { | 698 | if (rl->count[is_sync]+1 >= q->nr_requests) { |
699 | ioc = current_io_context(GFP_ATOMIC, q->node); | ||
923 | /* | 700 | /* |
924 | * The queue will fill after this allocation, so set | 701 | * The queue will fill after this allocation, so set |
925 | * it as full, and mark this process as "batching". | 702 | * it as full, and mark this process as "batching". |
926 | * This process will be allowed to complete a batch of | 703 | * This process will be allowed to complete a batch of |
927 | * requests, others will be blocked. | 704 | * requests, others will be blocked. |
928 | */ | 705 | */ |
929 | if (!blk_rl_full(rl, is_sync)) { | 706 | if (!blk_queue_full(q, is_sync)) { |
930 | ioc_set_batching(q, ioc); | 707 | ioc_set_batching(q, ioc); |
931 | blk_set_rl_full(rl, is_sync); | 708 | blk_set_queue_full(q, is_sync); |
932 | } else { | 709 | } else { |
933 | if (may_queue != ELV_MQUEUE_MUST | 710 | if (may_queue != ELV_MQUEUE_MUST |
934 | && !ioc_batching(q, ioc)) { | 711 | && !ioc_batching(q, ioc)) { |
@@ -937,16 +714,11 @@ static struct request *__get_request(struct request_list *rl, int rw_flags, | |||
937 | * process is not a "batcher", and not | 714 | * process is not a "batcher", and not |
938 | * exempted by the IO scheduler | 715 | * exempted by the IO scheduler |
939 | */ | 716 | */ |
940 | return NULL; | 717 | goto out; |
941 | } | 718 | } |
942 | } | 719 | } |
943 | } | 720 | } |
944 | /* | 721 | blk_set_queue_congested(q, is_sync); |
945 | * bdi isn't aware of blkcg yet. As all async IOs end up | ||
946 | * root blkcg anyway, just use root blkcg state. | ||
947 | */ | ||
948 | if (rl == &q->root_rl) | ||
949 | blk_set_queue_congested(q, is_sync); | ||
950 | } | 722 | } |
951 | 723 | ||
952 | /* | 724 | /* |
@@ -955,60 +727,47 @@ static struct request *__get_request(struct request_list *rl, int rw_flags, | |||
955 | * allocated with any setting of ->nr_requests | 727 | * allocated with any setting of ->nr_requests |
956 | */ | 728 | */ |
957 | if (rl->count[is_sync] >= (3 * q->nr_requests / 2)) | 729 | if (rl->count[is_sync] >= (3 * q->nr_requests / 2)) |
958 | return NULL; | 730 | goto out; |
959 | 731 | ||
960 | q->nr_rqs[is_sync]++; | ||
961 | rl->count[is_sync]++; | 732 | rl->count[is_sync]++; |
962 | rl->starved[is_sync] = 0; | 733 | rl->starved[is_sync] = 0; |
963 | 734 | ||
964 | /* | 735 | if (blk_rq_should_init_elevator(bio)) { |
965 | * Decide whether the new request will be managed by elevator. If | 736 | priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); |
966 | * so, mark @rw_flags and increment elvpriv. Non-zero elvpriv will | 737 | if (priv) |
967 | * prevent the current elevator from being destroyed until the new | 738 | rl->elvpriv++; |
968 | * request is freed. This guarantees icq's won't be destroyed and | ||
969 | * makes creating new ones safe. | ||
970 | * | ||
971 | * Also, lookup icq while holding queue_lock. If it doesn't exist, | ||
972 | * it will be created after releasing queue_lock. | ||
973 | */ | ||
974 | if (blk_rq_should_init_elevator(bio) && !blk_queue_bypass(q)) { | ||
975 | rw_flags |= REQ_ELVPRIV; | ||
976 | q->nr_rqs_elvpriv++; | ||
977 | if (et->icq_cache && ioc) | ||
978 | icq = ioc_lookup_icq(ioc, q); | ||
979 | } | 739 | } |
980 | 740 | ||
981 | if (blk_queue_io_stat(q)) | 741 | if (blk_queue_io_stat(q)) |
982 | rw_flags |= REQ_IO_STAT; | 742 | rw_flags |= REQ_IO_STAT; |
983 | spin_unlock_irq(q->queue_lock); | 743 | spin_unlock_irq(q->queue_lock); |
984 | 744 | ||
985 | /* allocate and init request */ | 745 | rq = blk_alloc_request(q, rw_flags, priv, gfp_mask); |
986 | rq = mempool_alloc(rl->rq_pool, gfp_mask); | 746 | if (unlikely(!rq)) { |
987 | if (!rq) | 747 | /* |
988 | goto fail_alloc; | 748 | * Allocation failed presumably due to memory. Undo anything |
989 | 749 | * we might have messed up. | |
990 | blk_rq_init(q, rq); | 750 | * |
991 | blk_rq_set_rl(rq, rl); | 751 | * Allocating task should really be put onto the front of the |
992 | rq->cmd_flags = rw_flags | REQ_ALLOCED; | 752 | * wait queue, but this is pretty rare. |
993 | 753 | */ | |
994 | /* init elvpriv */ | 754 | spin_lock_irq(q->queue_lock); |
995 | if (rw_flags & REQ_ELVPRIV) { | 755 | freed_request(q, is_sync, priv); |
996 | if (unlikely(et->icq_cache && !icq)) { | ||
997 | if (ioc) | ||
998 | icq = ioc_create_icq(ioc, q, gfp_mask); | ||
999 | if (!icq) | ||
1000 | goto fail_elvpriv; | ||
1001 | } | ||
1002 | 756 | ||
1003 | rq->elv.icq = icq; | 757 | /* |
1004 | if (unlikely(elv_set_request(q, rq, bio, gfp_mask))) | 758 | * in the very unlikely event that allocation failed and no |
1005 | goto fail_elvpriv; | 759 | * requests for this direction were pending, mark us starved |
760 | * so that freeing of a request in the other direction will | ||
761 | * notice us. another possible fix would be to split the | ||
762 | * rq mempool into READ and WRITE | ||
763 | */ | ||
764 | rq_starved: | ||
765 | if (unlikely(rl->count[is_sync] == 0)) | ||
766 | rl->starved[is_sync] = 1; | ||
1006 | 767 | ||
1007 | /* @rq->elv.icq holds io_context until @rq is freed */ | 768 | goto out; |
1008 | if (icq) | ||
1009 | get_io_context(icq->ioc); | ||
1010 | } | 769 | } |
1011 | out: | 770 | |
1012 | /* | 771 | /* |
1013 | * ioc may be NULL here, and ioc_batching will be false. That's | 772 | * ioc may be NULL here, and ioc_batching will be false. That's |
1014 | * OK, if the queue is under the request limit then requests need | 773 | * OK, if the queue is under the request limit then requests need |
@@ -1019,118 +778,71 @@ out: | |||
1019 | ioc->nr_batch_requests--; | 778 | ioc->nr_batch_requests--; |
1020 | 779 | ||
1021 | trace_block_getrq(q, bio, rw_flags & 1); | 780 | trace_block_getrq(q, bio, rw_flags & 1); |
781 | out: | ||
1022 | return rq; | 782 | return rq; |
1023 | |||
1024 | fail_elvpriv: | ||
1025 | /* | ||
1026 | * elvpriv init failed. ioc, icq and elvpriv aren't mempool backed | ||
1027 | * and may fail indefinitely under memory pressure and thus | ||
1028 | * shouldn't stall IO. Treat this request as !elvpriv. This will | ||
1029 | * disturb iosched and blkcg but weird is bettern than dead. | ||
1030 | */ | ||
1031 | printk_ratelimited(KERN_WARNING "%s: request aux data allocation failed, iosched may be disturbed\n", | ||
1032 | dev_name(q->backing_dev_info.dev)); | ||
1033 | |||
1034 | rq->cmd_flags &= ~REQ_ELVPRIV; | ||
1035 | rq->elv.icq = NULL; | ||
1036 | |||
1037 | spin_lock_irq(q->queue_lock); | ||
1038 | q->nr_rqs_elvpriv--; | ||
1039 | spin_unlock_irq(q->queue_lock); | ||
1040 | goto out; | ||
1041 | |||
1042 | fail_alloc: | ||
1043 | /* | ||
1044 | * Allocation failed presumably due to memory. Undo anything we | ||
1045 | * might have messed up. | ||
1046 | * | ||
1047 | * Allocating task should really be put onto the front of the wait | ||
1048 | * queue, but this is pretty rare. | ||
1049 | */ | ||
1050 | spin_lock_irq(q->queue_lock); | ||
1051 | freed_request(rl, rw_flags); | ||
1052 | |||
1053 | /* | ||
1054 | * in the very unlikely event that allocation failed and no | ||
1055 | * requests for this direction were pending, mark us starved so that | ||
1056 | * freeing of a request in the other direction will notice | ||
1057 | * us. another possible fix would be to split the rq mempool into | ||
1058 | * READ and WRITE | ||
1059 | */ | ||
1060 | rq_starved: | ||
1061 | if (unlikely(rl->count[is_sync] == 0)) | ||
1062 | rl->starved[is_sync] = 1; | ||
1063 | return NULL; | ||
1064 | } | 783 | } |
1065 | 784 | ||
1066 | /** | 785 | /* |
1067 | * get_request - get a free request | 786 | * No available requests for this queue, wait for some requests to become |
1068 | * @q: request_queue to allocate request from | 787 | * available. |
1069 | * @rw_flags: RW and SYNC flags | 788 | * |
1070 | * @bio: bio to allocate request for (can be %NULL) | 789 | * Called with q->queue_lock held, and returns with it unlocked. |
1071 | * @gfp_mask: allocation mask | ||
1072 | * | ||
1073 | * Get a free request from @q. If %__GFP_WAIT is set in @gfp_mask, this | ||
1074 | * function keeps retrying under memory pressure and fails iff @q is dead. | ||
1075 | * | ||
1076 | * Must be called with @q->queue_lock held and, | ||
1077 | * Returns %NULL on failure, with @q->queue_lock held. | ||
1078 | * Returns !%NULL on success, with @q->queue_lock *not held*. | ||
1079 | */ | 790 | */ |
1080 | static struct request *get_request(struct request_queue *q, int rw_flags, | 791 | static struct request *get_request_wait(struct request_queue *q, int rw_flags, |
1081 | struct bio *bio, gfp_t gfp_mask) | 792 | struct bio *bio) |
1082 | { | 793 | { |
1083 | const bool is_sync = rw_is_sync(rw_flags) != 0; | 794 | const bool is_sync = rw_is_sync(rw_flags) != 0; |
1084 | DEFINE_WAIT(wait); | ||
1085 | struct request_list *rl; | ||
1086 | struct request *rq; | 795 | struct request *rq; |
1087 | 796 | ||
1088 | rl = blk_get_rl(q, bio); /* transferred to @rq on success */ | 797 | rq = get_request(q, rw_flags, bio, GFP_NOIO); |
1089 | retry: | 798 | while (!rq) { |
1090 | rq = __get_request(rl, rw_flags, bio, gfp_mask); | 799 | DEFINE_WAIT(wait); |
1091 | if (rq) | 800 | struct io_context *ioc; |
1092 | return rq; | 801 | struct request_list *rl = &q->rq; |
1093 | 802 | ||
1094 | if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dying(q))) { | 803 | prepare_to_wait_exclusive(&rl->wait[is_sync], &wait, |
1095 | blk_put_rl(rl); | 804 | TASK_UNINTERRUPTIBLE); |
1096 | return NULL; | ||
1097 | } | ||
1098 | 805 | ||
1099 | /* wait on @rl and retry */ | 806 | trace_block_sleeprq(q, bio, rw_flags & 1); |
1100 | prepare_to_wait_exclusive(&rl->wait[is_sync], &wait, | ||
1101 | TASK_UNINTERRUPTIBLE); | ||
1102 | 807 | ||
1103 | trace_block_sleeprq(q, bio, rw_flags & 1); | 808 | spin_unlock_irq(q->queue_lock); |
809 | io_schedule(); | ||
1104 | 810 | ||
1105 | spin_unlock_irq(q->queue_lock); | 811 | /* |
1106 | io_schedule(); | 812 | * After sleeping, we become a "batching" process and |
813 | * will be able to allocate at least one request, and | ||
814 | * up to a big batch of them for a small period of time. | ||
815 | * See ioc_batching, ioc_set_batching | ||
816 | */ | ||
817 | ioc = current_io_context(GFP_NOIO, q->node); | ||
818 | ioc_set_batching(q, ioc); | ||
1107 | 819 | ||
1108 | /* | 820 | spin_lock_irq(q->queue_lock); |
1109 | * After sleeping, we become a "batching" process and will be able | 821 | finish_wait(&rl->wait[is_sync], &wait); |
1110 | * to allocate at least one request, and up to a big batch of them | ||
1111 | * for a small period of time. See ioc_batching, ioc_set_batching | ||
1112 | */ | ||
1113 | ioc_set_batching(q, current->io_context); | ||
1114 | 822 | ||
1115 | spin_lock_irq(q->queue_lock); | 823 | rq = get_request(q, rw_flags, bio, GFP_NOIO); |
1116 | finish_wait(&rl->wait[is_sync], &wait); | 824 | }; |
1117 | 825 | ||
1118 | goto retry; | 826 | return rq; |
1119 | } | 827 | } |
1120 | 828 | ||
1121 | struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) | 829 | struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) |
1122 | { | 830 | { |
1123 | struct request *rq; | 831 | struct request *rq; |
1124 | 832 | ||
1125 | BUG_ON(rw != READ && rw != WRITE); | 833 | if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) |
834 | return NULL; | ||
1126 | 835 | ||
1127 | /* create ioc upfront */ | 836 | BUG_ON(rw != READ && rw != WRITE); |
1128 | create_io_context(gfp_mask, q->node); | ||
1129 | 837 | ||
1130 | spin_lock_irq(q->queue_lock); | 838 | spin_lock_irq(q->queue_lock); |
1131 | rq = get_request(q, rw, NULL, gfp_mask); | 839 | if (gfp_mask & __GFP_WAIT) { |
1132 | if (!rq) | 840 | rq = get_request_wait(q, rw, NULL); |
1133 | spin_unlock_irq(q->queue_lock); | 841 | } else { |
842 | rq = get_request(q, rw, NULL, gfp_mask); | ||
843 | if (!rq) | ||
844 | spin_unlock_irq(q->queue_lock); | ||
845 | } | ||
1134 | /* q->queue_lock is unlocked at this point */ | 846 | /* q->queue_lock is unlocked at this point */ |
1135 | 847 | ||
1136 | return rq; | 848 | return rq; |
@@ -1224,6 +936,54 @@ static void add_acct_request(struct request_queue *q, struct request *rq, | |||
1224 | __elv_add_request(q, rq, where); | 936 | __elv_add_request(q, rq, where); |
1225 | } | 937 | } |
1226 | 938 | ||
939 | /** | ||
940 | * blk_insert_request - insert a special request into a request queue | ||
941 | * @q: request queue where request should be inserted | ||
942 | * @rq: request to be inserted | ||
943 | * @at_head: insert request at head or tail of queue | ||
944 | * @data: private data | ||
945 | * | ||
946 | * Description: | ||
947 | * Many block devices need to execute commands asynchronously, so they don't | ||
948 | * block the whole kernel from preemption during request execution. This is | ||
949 | * accomplished normally by inserting artificial requests tagged as | ||
950 | * REQ_TYPE_SPECIAL into the corresponding request queue, and letting them | ||
951 | * be scheduled for actual execution by the request queue. | ||
952 | * | ||
953 | * We have the option of inserting at the head or the tail of the queue. | ||
954 | * Typically we use the tail for new ioctls and so forth. We use the head | ||
955 | * of the queue for things like a QUEUE_FULL message from a device, or a | ||
956 | * host that is unable to accept a particular command. | ||
957 | */ | ||
958 | void blk_insert_request(struct request_queue *q, struct request *rq, | ||
959 | int at_head, void *data) | ||
960 | { | ||
961 | int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; | ||
962 | unsigned long flags; | ||
963 | |||
964 | /* | ||
965 | * tell I/O scheduler that this isn't a regular read/write (ie it | ||
966 | * must not attempt merges on this) and that it acts as a soft | ||
967 | * barrier | ||
968 | */ | ||
969 | rq->cmd_type = REQ_TYPE_SPECIAL; | ||
970 | |||
971 | rq->special = data; | ||
972 | |||
973 | spin_lock_irqsave(q->queue_lock, flags); | ||
974 | |||
975 | /* | ||
976 | * If command is tagged, release the tag | ||
977 | */ | ||
978 | if (blk_rq_tagged(rq)) | ||
979 | blk_queue_end_tag(q, rq); | ||
980 | |||
981 | add_acct_request(q, rq, where); | ||
982 | __blk_run_queue(q); | ||
983 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
984 | } | ||
985 | EXPORT_SYMBOL(blk_insert_request); | ||
986 | |||
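Putting the kernel-doc above into code, a driver would pair blk_get_request() (shown earlier in this file) with blk_insert_request() roughly as follows; struct my_cmd and the error-handling policy are invented for illustration.

static int my_issue_special(struct request_queue *q, struct my_cmd *cmd)
{
	struct request *rq;

	rq = blk_get_request(q, READ, GFP_KERNEL);
	if (!rq)
		return -ENOMEM;

	/* blk_insert_request() marks the request REQ_TYPE_SPECIAL, stores
	 * @cmd in rq->special and kicks the queue; at_head=1 puts it in
	 * front of already queued work. */
	blk_insert_request(q, rq, 1, cmd);
	return 0;
}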
1227 | static void part_round_stats_single(int cpu, struct hd_struct *part, | 987 | static void part_round_stats_single(int cpu, struct hd_struct *part, |
1228 | unsigned long now) | 988 | unsigned long now) |
1229 | { | 989 | { |
@@ -1284,15 +1044,14 @@ void __blk_put_request(struct request_queue *q, struct request *req) | |||
1284 | * it didn't come out of our reserved rq pools | 1044 | * it didn't come out of our reserved rq pools |
1285 | */ | 1045 | */ |
1286 | if (req->cmd_flags & REQ_ALLOCED) { | 1046 | if (req->cmd_flags & REQ_ALLOCED) { |
1287 | unsigned int flags = req->cmd_flags; | 1047 | int is_sync = rq_is_sync(req) != 0; |
1288 | struct request_list *rl = blk_rq_rl(req); | 1048 | int priv = req->cmd_flags & REQ_ELVPRIV; |
1289 | 1049 | ||
1290 | BUG_ON(!list_empty(&req->queuelist)); | 1050 | BUG_ON(!list_empty(&req->queuelist)); |
1291 | BUG_ON(!hlist_unhashed(&req->hash)); | 1051 | BUG_ON(!hlist_unhashed(&req->hash)); |
1292 | 1052 | ||
1293 | blk_free_request(rl, req); | 1053 | blk_free_request(q, req); |
1294 | freed_request(rl, flags); | 1054 | freed_request(q, is_sync, priv); |
1295 | blk_put_rl(rl); | ||
1296 | } | 1055 | } |
1297 | } | 1056 | } |
1298 | EXPORT_SYMBOL_GPL(__blk_put_request); | 1057 | EXPORT_SYMBOL_GPL(__blk_put_request); |
@@ -1359,6 +1118,7 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req, | |||
1359 | req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); | 1118 | req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); |
1360 | 1119 | ||
1361 | drive_stat_acct(req, 0); | 1120 | drive_stat_acct(req, 0); |
1121 | elv_bio_merged(q, req, bio); | ||
1362 | return true; | 1122 | return true; |
1363 | } | 1123 | } |
1364 | 1124 | ||
@@ -1389,34 +1149,22 @@ static bool bio_attempt_front_merge(struct request_queue *q, | |||
1389 | req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); | 1149 | req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); |
1390 | 1150 | ||
1391 | drive_stat_acct(req, 0); | 1151 | drive_stat_acct(req, 0); |
1152 | elv_bio_merged(q, req, bio); | ||
1392 | return true; | 1153 | return true; |
1393 | } | 1154 | } |
1394 | 1155 | ||
1395 | /** | 1156 | /* |
1396 | * attempt_plug_merge - try to merge with %current's plugged list | 1157 | * Attempts to merge with the plugged list in the current process. Returns |
1397 | * @q: request_queue new bio is being queued at | 1158 | * true if merge was successful, otherwise false. |
1398 | * @bio: new bio being queued | ||
1399 | * @request_count: out parameter for number of traversed plugged requests | ||
1400 | * | ||
1401 | * Determine whether @bio being queued on @q can be merged with a request | ||
1402 | * on %current's plugged list. Returns %true if merge was successful, | ||
1403 | * otherwise %false. | ||
1404 | * | ||
1405 | * Plugging coalesces IOs from the same issuer for the same purpose without | ||
1406 | * going through @q->queue_lock. As such it's more of an issuing mechanism | ||
1407 | * than scheduling, and the request, while it may have elvpriv data, is not | ||
1408 | * added on the elevator at this point. In addition, we don't have | ||
1409 | * reliable access to the elevator outside queue lock. Only check basic | ||
1410 | * merging parameters without querying the elevator. | ||
1411 | */ | 1159 | */ |
1412 | static bool attempt_plug_merge(struct request_queue *q, struct bio *bio, | 1160 | static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q, |
1413 | unsigned int *request_count) | 1161 | struct bio *bio, unsigned int *request_count) |
1414 | { | 1162 | { |
1415 | struct blk_plug *plug; | 1163 | struct blk_plug *plug; |
1416 | struct request *rq; | 1164 | struct request *rq; |
1417 | bool ret = false; | 1165 | bool ret = false; |
1418 | 1166 | ||
1419 | plug = current->plug; | 1167 | plug = tsk->plug; |
1420 | if (!plug) | 1168 | if (!plug) |
1421 | goto out; | 1169 | goto out; |
1422 | *request_count = 0; | 1170 | *request_count = 0; |
@@ -1424,13 +1172,12 @@ static bool attempt_plug_merge(struct request_queue *q, struct bio *bio, | |||
1424 | list_for_each_entry_reverse(rq, &plug->list, queuelist) { | 1172 | list_for_each_entry_reverse(rq, &plug->list, queuelist) { |
1425 | int el_ret; | 1173 | int el_ret; |
1426 | 1174 | ||
1427 | if (rq->q == q) | 1175 | (*request_count)++; |
1428 | (*request_count)++; | ||
1429 | 1176 | ||
1430 | if (rq->q != q || !blk_rq_merge_ok(rq, bio)) | 1177 | if (rq->q != q) |
1431 | continue; | 1178 | continue; |
1432 | 1179 | ||
1433 | el_ret = blk_try_merge(rq, bio); | 1180 | el_ret = elv_try_merge(rq, bio); |
1434 | if (el_ret == ELEVATOR_BACK_MERGE) { | 1181 | if (el_ret == ELEVATOR_BACK_MERGE) { |
1435 | ret = bio_attempt_back_merge(q, rq, bio); | 1182 | ret = bio_attempt_back_merge(q, rq, bio); |
1436 | if (ret) | 1183 | if (ret) |
@@ -1447,6 +1194,7 @@ out: | |||
1447 | 1194 | ||
1448 | void init_request_from_bio(struct request *req, struct bio *bio) | 1195 | void init_request_from_bio(struct request *req, struct bio *bio) |
1449 | { | 1196 | { |
1197 | req->cpu = bio->bi_comp_cpu; | ||
1450 | req->cmd_type = REQ_TYPE_FS; | 1198 | req->cmd_type = REQ_TYPE_FS; |
1451 | 1199 | ||
1452 | req->cmd_flags |= bio->bi_rw & REQ_COMMON_MASK; | 1200 | req->cmd_flags |= bio->bi_rw & REQ_COMMON_MASK; |
@@ -1459,7 +1207,7 @@ void init_request_from_bio(struct request *req, struct bio *bio) | |||
1459 | blk_rq_bio_prep(req->q, req, bio); | 1207 | blk_rq_bio_prep(req->q, req, bio); |
1460 | } | 1208 | } |
1461 | 1209 | ||
1462 | void blk_queue_bio(struct request_queue *q, struct bio *bio) | 1210 | static int __make_request(struct request_queue *q, struct bio *bio) |
1463 | { | 1211 | { |
1464 | const bool sync = !!(bio->bi_rw & REQ_SYNC); | 1212 | const bool sync = !!(bio->bi_rw & REQ_SYNC); |
1465 | struct blk_plug *plug; | 1213 | struct blk_plug *plug; |
@@ -1484,22 +1232,20 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio) | |||
1484 | * Check if we can merge with the plugged list before grabbing | 1232 | * Check if we can merge with the plugged list before grabbing |
1485 | * any locks. | 1233 | * any locks. |
1486 | */ | 1234 | */ |
1487 | if (attempt_plug_merge(q, bio, &request_count)) | 1235 | if (attempt_plug_merge(current, q, bio, &request_count)) |
1488 | return; | 1236 | goto out; |
1489 | 1237 | ||
1490 | spin_lock_irq(q->queue_lock); | 1238 | spin_lock_irq(q->queue_lock); |
1491 | 1239 | ||
1492 | el_ret = elv_merge(q, &req, bio); | 1240 | el_ret = elv_merge(q, &req, bio); |
1493 | if (el_ret == ELEVATOR_BACK_MERGE) { | 1241 | if (el_ret == ELEVATOR_BACK_MERGE) { |
1494 | if (bio_attempt_back_merge(q, req, bio)) { | 1242 | if (bio_attempt_back_merge(q, req, bio)) { |
1495 | elv_bio_merged(q, req, bio); | ||
1496 | if (!attempt_back_merge(q, req)) | 1243 | if (!attempt_back_merge(q, req)) |
1497 | elv_merged_request(q, req, el_ret); | 1244 | elv_merged_request(q, req, el_ret); |
1498 | goto out_unlock; | 1245 | goto out_unlock; |
1499 | } | 1246 | } |
1500 | } else if (el_ret == ELEVATOR_FRONT_MERGE) { | 1247 | } else if (el_ret == ELEVATOR_FRONT_MERGE) { |
1501 | if (bio_attempt_front_merge(q, req, bio)) { | 1248 | if (bio_attempt_front_merge(q, req, bio)) { |
1502 | elv_bio_merged(q, req, bio); | ||
1503 | if (!attempt_front_merge(q, req)) | 1249 | if (!attempt_front_merge(q, req)) |
1504 | elv_merged_request(q, req, el_ret); | 1250 | elv_merged_request(q, req, el_ret); |
1505 | goto out_unlock; | 1251 | goto out_unlock; |
@@ -1520,11 +1266,7 @@ get_rq: | |||
1520 | * Grab a free request. This might sleep but cannot fail. | 1266 | * Grab a free request. This might sleep but cannot fail. |
1521 | * Returns with the queue unlocked. | 1267 | * Returns with the queue unlocked. |
1522 | */ | 1268 | */ |
1523 | req = get_request(q, rw_flags, bio, GFP_NOIO); | 1269 | req = get_request_wait(q, rw_flags, bio); |
1524 | if (unlikely(!req)) { | ||
1525 | bio_endio(bio, -ENODEV); /* @q is dead */ | ||
1526 | goto out_unlock; | ||
1527 | } | ||
1528 | 1270 | ||
1529 | /* | 1271 | /* |
1530 | * After dropping the lock and possibly sleeping here, our request | 1272 | * After dropping the lock and possibly sleeping here, our request |
@@ -1534,7 +1276,8 @@ get_rq: | |||
1534 | */ | 1276 | */ |
1535 | init_request_from_bio(req, bio); | 1277 | init_request_from_bio(req, bio); |
1536 | 1278 | ||
1537 | if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) | 1279 | if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || |
1280 | bio_flagged(bio, BIO_CPU_AFFINE)) | ||
1538 | req->cpu = raw_smp_processor_id(); | 1281 | req->cpu = raw_smp_processor_id(); |
1539 | 1282 | ||
1540 | plug = current->plug; | 1283 | plug = current->plug; |
@@ -1547,19 +1290,15 @@ get_rq: | |||
1547 | */ | 1290 | */ |
1548 | if (list_empty(&plug->list)) | 1291 | if (list_empty(&plug->list)) |
1549 | trace_block_plug(q); | 1292 | trace_block_plug(q); |
1550 | else { | 1293 | else if (!plug->should_sort) { |
1551 | if (!plug->should_sort) { | 1294 | struct request *__rq; |
1552 | struct request *__rq; | ||
1553 | 1295 | ||
1554 | __rq = list_entry_rq(plug->list.prev); | 1296 | __rq = list_entry_rq(plug->list.prev); |
1555 | if (__rq->q != q) | 1297 | if (__rq->q != q) |
1556 | plug->should_sort = 1; | 1298 | plug->should_sort = 1; |
1557 | } | ||
1558 | if (request_count >= BLK_MAX_REQUEST_COUNT) { | ||
1559 | blk_flush_plug_list(plug, false); | ||
1560 | trace_block_plug(q); | ||
1561 | } | ||
1562 | } | 1299 | } |
1300 | if (request_count >= BLK_MAX_REQUEST_COUNT) | ||
1301 | blk_flush_plug_list(plug, false); | ||
1563 | list_add_tail(&req->queuelist, &plug->list); | 1302 | list_add_tail(&req->queuelist, &plug->list); |
1564 | drive_stat_acct(req, 1); | 1303 | drive_stat_acct(req, 1); |
1565 | } else { | 1304 | } else { |
@@ -1569,8 +1308,9 @@ get_rq: | |||
1569 | out_unlock: | 1308 | out_unlock: |
1570 | spin_unlock_irq(q->queue_lock); | 1309 | spin_unlock_irq(q->queue_lock); |
1571 | } | 1310 | } |
1311 | out: | ||
1312 | return 0; | ||
1572 | } | 1313 | } |
1573 | EXPORT_SYMBOL_GPL(blk_queue_bio); /* for device mapper only */ | ||
1574 | 1314 | ||
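blk_queue_bio()/__make_request() above is the default make_request_fn for request-based queues; bio-based drivers install their own handler instead. A hedged sketch of that registration follows, using the older int-returning prototype that the right-hand column of this hunk corresponds to; the example_* names are invented.

    #include <linux/blkdev.h>
    #include <linux/bio.h>

    /* Hypothetical bio-based driver: handle bios directly, bypassing the
     * request queue and the elevator entirely. */
    static int example_make_request(struct request_queue *q, struct bio *bio)
    {
    	/* ... translate and complete, or remap and resubmit ... */
    	bio_endio(bio, 0);
    	return 0;	/* 0 == bio consumed, nothing left to resolve */
    }

    static struct request_queue *example_setup(void)
    {
    	/* No request_fn: allocate a bare queue and hook the bio handler. */
    	struct request_queue *q = blk_alloc_queue(GFP_KERNEL);

    	if (q)
    		blk_queue_make_request(q, example_make_request);
    	return q;
    }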
1575 | /* | 1315 | /* |
1576 | * If bio->bi_dev is a partition, remap the location | 1316 | * If bio->bi_dev is a partition, remap the location |
@@ -1669,147 +1409,165 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors) | |||
1669 | return 0; | 1409 | return 0; |
1670 | } | 1410 | } |
1671 | 1411 | ||
1672 | static noinline_for_stack bool | 1412 | /** |
1673 | generic_make_request_checks(struct bio *bio) | 1413 | * generic_make_request - hand a buffer to its device driver for I/O |
1414 | * @bio: The bio describing the location in memory and on the device. | ||
1415 | * | ||
1416 | * generic_make_request() is used to make I/O requests of block | ||
1417 | * devices. It is passed a &struct bio, which describes the I/O that needs | ||
1418 | * to be done. | ||
1419 | * | ||
1420 | * generic_make_request() does not return any status. The | ||
1421 | * success/failure status of the request, along with notification of | ||
1422 | * completion, is delivered asynchronously through the bio->bi_end_io | ||
1423 | * function described (one day) else where. | ||
1424 | * | ||
1425 | * The caller of generic_make_request must make sure that bi_io_vec | ||
1426 | * are set to describe the memory buffer, and that bi_dev and bi_sector are | ||
1427 | * set to describe the device address, and the | ||
1428 | * bi_end_io and optionally bi_private are set to describe how | ||
1429 | * completion notification should be signaled. | ||
1430 | * | ||
1431 | * generic_make_request and the drivers it calls may use bi_next if this | ||
1432 | * bio happens to be merged with someone else, and may change bi_dev and | ||
1433 | * bi_sector for remaps as it sees fit. So the values of these fields | ||
1434 | * should NOT be depended on after the call to generic_make_request. | ||
1435 | */ | ||
1436 | static inline void __generic_make_request(struct bio *bio) | ||
1674 | { | 1437 | { |
1675 | struct request_queue *q; | 1438 | struct request_queue *q; |
1676 | int nr_sectors = bio_sectors(bio); | 1439 | sector_t old_sector; |
1440 | int ret, nr_sectors = bio_sectors(bio); | ||
1441 | dev_t old_dev; | ||
1677 | int err = -EIO; | 1442 | int err = -EIO; |
1678 | char b[BDEVNAME_SIZE]; | ||
1679 | struct hd_struct *part; | ||
1680 | 1443 | ||
1681 | might_sleep(); | 1444 | might_sleep(); |
1682 | 1445 | ||
1683 | if (bio_check_eod(bio, nr_sectors)) | 1446 | if (bio_check_eod(bio, nr_sectors)) |
1684 | goto end_io; | 1447 | goto end_io; |
1685 | 1448 | ||
1686 | q = bdev_get_queue(bio->bi_bdev); | 1449 | /* |
1687 | if (unlikely(!q)) { | 1450 | * Resolve the mapping until finished. (drivers are |
1688 | printk(KERN_ERR | 1451 | * still free to implement/resolve their own stacking |
1689 | "generic_make_request: Trying to access " | 1452 | * by explicitly returning 0) |
1690 | "nonexistent block-device %s (%Lu)\n", | 1453 | * |
1691 | bdevname(bio->bi_bdev, b), | 1454 | * NOTE: we don't repeat the blk_size check for each new device. |
1692 | (long long) bio->bi_sector); | 1455 | * Stacking drivers are expected to know what they are doing. |
1693 | goto end_io; | 1456 | */ |
1694 | } | 1457 | old_sector = -1; |
1458 | old_dev = 0; | ||
1459 | do { | ||
1460 | char b[BDEVNAME_SIZE]; | ||
1461 | struct hd_struct *part; | ||
1695 | 1462 | ||
1696 | if (likely(bio_is_rw(bio) && | 1463 | q = bdev_get_queue(bio->bi_bdev); |
1697 | nr_sectors > queue_max_hw_sectors(q))) { | 1464 | if (unlikely(!q)) { |
1698 | printk(KERN_ERR "bio too big device %s (%u > %u)\n", | 1465 | printk(KERN_ERR |
1699 | bdevname(bio->bi_bdev, b), | 1466 | "generic_make_request: Trying to access " |
1700 | bio_sectors(bio), | 1467 | "nonexistent block-device %s (%Lu)\n", |
1701 | queue_max_hw_sectors(q)); | 1468 | bdevname(bio->bi_bdev, b), |
1702 | goto end_io; | 1469 | (long long) bio->bi_sector); |
1703 | } | 1470 | goto end_io; |
1471 | } | ||
1704 | 1472 | ||
1705 | part = bio->bi_bdev->bd_part; | 1473 | if (unlikely(!(bio->bi_rw & REQ_DISCARD) && |
1706 | if (should_fail_request(part, bio->bi_size) || | 1474 | nr_sectors > queue_max_hw_sectors(q))) { |
1707 | should_fail_request(&part_to_disk(part)->part0, | 1475 | printk(KERN_ERR "bio too big device %s (%u > %u)\n", |
1708 | bio->bi_size)) | 1476 | bdevname(bio->bi_bdev, b), |
1709 | goto end_io; | 1477 | bio_sectors(bio), |
1478 | queue_max_hw_sectors(q)); | ||
1479 | goto end_io; | ||
1480 | } | ||
1710 | 1481 | ||
1711 | /* | 1482 | if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) |
1712 | * If this device has partitions, remap block n | 1483 | goto end_io; |
1713 | * of partition p to block n+start(p) of the disk. | ||
1714 | */ | ||
1715 | blk_partition_remap(bio); | ||
1716 | 1484 | ||
1717 | if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) | 1485 | part = bio->bi_bdev->bd_part; |
1718 | goto end_io; | 1486 | if (should_fail_request(part, bio->bi_size) || |
1487 | should_fail_request(&part_to_disk(part)->part0, | ||
1488 | bio->bi_size)) | ||
1489 | goto end_io; | ||
1719 | 1490 | ||
1720 | if (bio_check_eod(bio, nr_sectors)) | 1491 | /* |
1721 | goto end_io; | 1492 | * If this device has partitions, remap block n |
1493 | * of partition p to block n+start(p) of the disk. | ||
1494 | */ | ||
1495 | blk_partition_remap(bio); | ||
1722 | 1496 | ||
1723 | /* | 1497 | if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) |
1724 | * Filter flush bio's early so that make_request based | 1498 | goto end_io; |
1725 | * drivers without flush support don't have to worry | 1499 | |
1726 | * about them. | 1500 | if (old_sector != -1) |
1727 | */ | 1501 | trace_block_bio_remap(q, bio, old_dev, old_sector); |
1728 | if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) { | 1502 | |
1729 | bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA); | 1503 | old_sector = bio->bi_sector; |
1730 | if (!nr_sectors) { | 1504 | old_dev = bio->bi_bdev->bd_dev; |
1731 | err = 0; | 1505 | |
1506 | if (bio_check_eod(bio, nr_sectors)) | ||
1732 | goto end_io; | 1507 | goto end_io; |
1508 | |||
1509 | /* | ||
1510 | * Filter flush bio's early so that make_request based | ||
1511 | * drivers without flush support don't have to worry | ||
1512 | * about them. | ||
1513 | */ | ||
1514 | if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) { | ||
1515 | bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA); | ||
1516 | if (!nr_sectors) { | ||
1517 | err = 0; | ||
1518 | goto end_io; | ||
1519 | } | ||
1733 | } | 1520 | } |
1734 | } | ||
1735 | 1521 | ||
1736 | if ((bio->bi_rw & REQ_DISCARD) && | 1522 | if ((bio->bi_rw & REQ_DISCARD) && |
1737 | (!blk_queue_discard(q) || | 1523 | (!blk_queue_discard(q) || |
1738 | ((bio->bi_rw & REQ_SECURE) && !blk_queue_secdiscard(q)))) { | 1524 | ((bio->bi_rw & REQ_SECURE) && |
1739 | err = -EOPNOTSUPP; | 1525 | !blk_queue_secdiscard(q)))) { |
1740 | goto end_io; | 1526 | err = -EOPNOTSUPP; |
1741 | } | 1527 | goto end_io; |
1528 | } | ||
1742 | 1529 | ||
1743 | if (bio->bi_rw & REQ_WRITE_SAME && !bdev_write_same(bio->bi_bdev)) { | 1530 | if (blk_throtl_bio(q, &bio)) |
1744 | err = -EOPNOTSUPP; | 1531 | goto end_io; |
1745 | goto end_io; | ||
1746 | } | ||
1747 | 1532 | ||
1748 | /* | 1533 | /* |
1749 | * Various block parts want %current->io_context and lazy ioc | 1534 | * If bio = NULL, bio has been throttled and will be submitted |
1750 | * allocation ends up trading a lot of pain for a small amount of | 1535 | * later. |
1751 | * memory. Just allocate it upfront. This may fail and block | 1536 | */ |
1752 | * layer knows how to live with it. | 1537 | if (!bio) |
1753 | */ | 1538 | break; |
1754 | create_io_context(GFP_ATOMIC, q->node); | ||
1755 | 1539 | ||
1756 | if (blk_throtl_bio(q, bio)) | 1540 | trace_block_bio_queue(q, bio); |
1757 | return false; /* throttled, will be resubmitted later */ | ||
1758 | 1541 | ||
1759 | trace_block_bio_queue(q, bio); | 1542 | ret = q->make_request_fn(q, bio); |
1760 | return true; | 1543 | } while (ret); |
1544 | |||
1545 | return; | ||
1761 | 1546 | ||
1762 | end_io: | 1547 | end_io: |
1763 | bio_endio(bio, err); | 1548 | bio_endio(bio, err); |
1764 | return false; | ||
1765 | } | 1549 | } |
1766 | 1550 | ||
1767 | /** | 1551 | /* |
1768 | * generic_make_request - hand a buffer to its device driver for I/O | 1552 | * We only want one ->make_request_fn to be active at a time, |
1769 | * @bio: The bio describing the location in memory and on the device. | 1553 | * else stack usage with stacked devices could be a problem. |
1770 | * | 1554 | * So use current->bio_list to keep a list of requests |
1771 | * generic_make_request() is used to make I/O requests of block | 1555 | * submitted by a make_request_fn function. |
1772 | * devices. It is passed a &struct bio, which describes the I/O that needs | 1556 | * current->bio_list is also used as a flag to say if |
1773 | * to be done. | 1557 | * generic_make_request is currently active in this task or not. |
1774 | * | 1558 | * If it is NULL, then no make_request is active. If it is non-NULL, |
1775 | * generic_make_request() does not return any status. The | 1559 | * then a make_request is active, and new requests should be added |
1776 | * success/failure status of the request, along with notification of | 1560 | * at the tail |
1777 | * completion, is delivered asynchronously through the bio->bi_end_io | ||
1778 | * function described (one day) elsewhere. | ||
1779 | * | ||
1780 | * The caller of generic_make_request must make sure that bi_io_vec | ||
1781 | * are set to describe the memory buffer, and that bi_dev and bi_sector are | ||
1782 | * set to describe the device address, and the | ||
1783 | * bi_end_io and optionally bi_private are set to describe how | ||
1784 | * completion notification should be signaled. | ||
1785 | * | ||
1786 | * generic_make_request and the drivers it calls may use bi_next if this | ||
1787 | * bio happens to be merged with someone else, and may resubmit the bio to | ||
1788 | * a lower device by calling into generic_make_request recursively, which | ||
1789 | * means the bio should NOT be touched after the call to ->make_request_fn. | ||
1790 | */ | 1561 | */ |
1791 | void generic_make_request(struct bio *bio) | 1562 | void generic_make_request(struct bio *bio) |
1792 | { | 1563 | { |
1793 | struct bio_list bio_list_on_stack; | 1564 | struct bio_list bio_list_on_stack; |
1794 | 1565 | ||
1795 | if (!generic_make_request_checks(bio)) | ||
1796 | return; | ||
1797 | |||
1798 | /* | ||
1799 | * We only want one ->make_request_fn to be active at a time, else | ||
1800 | * stack usage with stacked devices could be a problem. So use | ||
1801 | * current->bio_list to keep a list of requests submitted by a | ||
1802 | * make_request_fn function. current->bio_list is also used as a | ||
1803 | * flag to say if generic_make_request is currently active in this | ||
1804 | * task or not. If it is NULL, then no make_request is active. If | ||
1805 | * it is non-NULL, then a make_request is active, and new requests | ||
1806 | * should be added at the tail | ||
1807 | */ | ||
1808 | if (current->bio_list) { | 1566 | if (current->bio_list) { |
1567 | /* make_request is active */ | ||
1809 | bio_list_add(current->bio_list, bio); | 1568 | bio_list_add(current->bio_list, bio); |
1810 | return; | 1569 | return; |
1811 | } | 1570 | } |
1812 | |||
1813 | /* following loop may be a bit non-obvious, and so deserves some | 1571 | /* following loop may be a bit non-obvious, and so deserves some |
1814 | * explanation. | 1572 | * explanation. |
1815 | * Before entering the loop, bio->bi_next is NULL (as all callers | 1573 | * Before entering the loop, bio->bi_next is NULL (as all callers |
@@ -1817,21 +1575,22 @@ void generic_make_request(struct bio *bio) | |||
1817 | * We pretend that we have just taken it off a longer list, so | 1575 | * We pretend that we have just taken it off a longer list, so |
1818 | * we assign bio_list to a pointer to the bio_list_on_stack, | 1576 | * we assign bio_list to a pointer to the bio_list_on_stack, |
1819 | * thus initialising the bio_list of new bios to be | 1577 | * thus initialising the bio_list of new bios to be |
1820 | * added. ->make_request() may indeed add some more bios | 1578 | * added. __generic_make_request may indeed add some more bios |
1821 | * through a recursive call to generic_make_request. If it | 1579 | * through a recursive call to generic_make_request. If it |
1822 | * did, we find a non-NULL value in bio_list and re-enter the loop | 1580 | * did, we find a non-NULL value in bio_list and re-enter the loop |
1823 | * from the top. In this case we really did just take the bio | 1581 | * from the top. In this case we really did just take the bio |
1824 | * of the top of the list (no pretending) and so remove it from | 1582 | * of the top of the list (no pretending) and so remove it from |
1825 | * bio_list, and call into ->make_request() again. | 1583 | * bio_list, and call into __generic_make_request again. |
1584 | * | ||
1585 | * The loop was structured like this to make only one call to | ||
1586 | * __generic_make_request (which is important as it is large and | ||
1587 | * inlined) and to keep the structure simple. | ||
1826 | */ | 1588 | */ |
1827 | BUG_ON(bio->bi_next); | 1589 | BUG_ON(bio->bi_next); |
1828 | bio_list_init(&bio_list_on_stack); | 1590 | bio_list_init(&bio_list_on_stack); |
1829 | current->bio_list = &bio_list_on_stack; | 1591 | current->bio_list = &bio_list_on_stack; |
1830 | do { | 1592 | do { |
1831 | struct request_queue *q = bdev_get_queue(bio->bi_bdev); | 1593 | __generic_make_request(bio); |
1832 | |||
1833 | q->make_request_fn(q, bio); | ||
1834 | |||
1835 | bio = bio_list_pop(current->bio_list); | 1594 | bio = bio_list_pop(current->bio_list); |
1836 | } while (bio); | 1595 | } while (bio); |
1837 | current->bio_list = NULL; /* deactivate */ | 1596 | current->bio_list = NULL; /* deactivate */ |
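The current->bio_list handling above exists so that a stacking driver's make_request_fn can resubmit to a lower device without growing the kernel stack: the recursive generic_make_request() call only queues the bio, and the loop above replays it. A sketch of such a remap-and-resubmit handler; struct example_dev and its fields are made up for illustration.

    #include <linux/blkdev.h>
    #include <linux/bio.h>

    struct example_dev {			/* made-up stacking-driver state */
    	struct block_device *backing_bdev;
    	sector_t start_sector;
    };

    /* Hypothetical linear remapping target: shift the bio onto a backing
     * device and feed it back into generic_make_request(). The recursive
     * call lands on current->bio_list; the outer loop dispatches it later,
     * keeping stack depth bounded. */
    static int example_remap_make_request(struct request_queue *q, struct bio *bio)
    {
    	struct example_dev *dev = q->queuedata;

    	bio->bi_bdev = dev->backing_bdev;
    	bio->bi_sector += dev->start_sector;

    	generic_make_request(bio);
    	return 0;
    }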
@@ -1850,20 +1609,15 @@ EXPORT_SYMBOL(generic_make_request); | |||
1850 | */ | 1609 | */ |
1851 | void submit_bio(int rw, struct bio *bio) | 1610 | void submit_bio(int rw, struct bio *bio) |
1852 | { | 1611 | { |
1612 | int count = bio_sectors(bio); | ||
1613 | |||
1853 | bio->bi_rw |= rw; | 1614 | bio->bi_rw |= rw; |
1854 | 1615 | ||
1855 | /* | 1616 | /* |
1856 | * If it's a regular read/write or a barrier with data attached, | 1617 | * If it's a regular read/write or a barrier with data attached, |
1857 | * go through the normal accounting stuff before submission. | 1618 | * go through the normal accounting stuff before submission. |
1858 | */ | 1619 | */ |
1859 | if (bio_has_data(bio)) { | 1620 | if (bio_has_data(bio) && !(rw & REQ_DISCARD)) { |
1860 | unsigned int count; | ||
1861 | |||
1862 | if (unlikely(rw & REQ_WRITE_SAME)) | ||
1863 | count = bdev_logical_block_size(bio->bi_bdev) >> 9; | ||
1864 | else | ||
1865 | count = bio_sectors(bio); | ||
1866 | |||
1867 | if (rw & WRITE) { | 1621 | if (rw & WRITE) { |
1868 | count_vm_events(PGPGOUT, count); | 1622 | count_vm_events(PGPGOUT, count); |
1869 | } else { | 1623 | } else { |
@@ -1909,10 +1663,11 @@ EXPORT_SYMBOL(submit_bio); | |||
1909 | */ | 1663 | */ |
1910 | int blk_rq_check_limits(struct request_queue *q, struct request *rq) | 1664 | int blk_rq_check_limits(struct request_queue *q, struct request *rq) |
1911 | { | 1665 | { |
1912 | if (!rq_mergeable(rq)) | 1666 | if (rq->cmd_flags & REQ_DISCARD) |
1913 | return 0; | 1667 | return 0; |
1914 | 1668 | ||
1915 | if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, rq->cmd_flags)) { | 1669 | if (blk_rq_sectors(rq) > queue_max_sectors(q) || |
1670 | blk_rq_bytes(rq) > queue_max_hw_sectors(q) << 9) { | ||
1916 | printk(KERN_ERR "%s: over max size limit.\n", __func__); | 1671 | printk(KERN_ERR "%s: over max size limit.\n", __func__); |
1917 | return -EIO; | 1672 | return -EIO; |
1918 | } | 1673 | } |
@@ -1951,10 +1706,6 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) | |||
1951 | return -EIO; | 1706 | return -EIO; |
1952 | 1707 | ||
1953 | spin_lock_irqsave(q->queue_lock, flags); | 1708 | spin_lock_irqsave(q->queue_lock, flags); |
1954 | if (unlikely(blk_queue_dying(q))) { | ||
1955 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
1956 | return -ENODEV; | ||
1957 | } | ||
1958 | 1709 | ||
1959 | /* | 1710 | /* |
1960 | * Submitting request must be dequeued before calling this function | 1711 | * Submitting request must be dequeued before calling this function |
@@ -2296,11 +2047,9 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) | |||
2296 | error_type = "I/O"; | 2047 | error_type = "I/O"; |
2297 | break; | 2048 | break; |
2298 | } | 2049 | } |
2299 | printk_ratelimited(KERN_ERR "end_request: %s error, dev %s, sector %llu\n", | 2050 | printk(KERN_ERR "end_request: %s error, dev %s, sector %llu\n", |
2300 | error_type, req->rq_disk ? | 2051 | error_type, req->rq_disk ? req->rq_disk->disk_name : "?", |
2301 | req->rq_disk->disk_name : "?", | 2052 | (unsigned long long)blk_rq_pos(req)); |
2302 | (unsigned long long)blk_rq_pos(req)); | ||
2303 | |||
2304 | } | 2053 | } |
2305 | 2054 | ||
2306 | blk_account_io_completion(req, nr_bytes); | 2055 | blk_account_io_completion(req, nr_bytes); |
@@ -2384,7 +2133,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) | |||
2384 | req->buffer = bio_data(req->bio); | 2133 | req->buffer = bio_data(req->bio); |
2385 | 2134 | ||
2386 | /* update sector only for requests with clear definition of sector */ | 2135 | /* update sector only for requests with clear definition of sector */ |
2387 | if (req->cmd_type == REQ_TYPE_FS) | 2136 | if (req->cmd_type == REQ_TYPE_FS || (req->cmd_flags & REQ_DISCARD)) |
2388 | req->__sector += total_bytes >> 9; | 2137 | req->__sector += total_bytes >> 9; |
2389 | 2138 | ||
2390 | /* mixed attributes always follow the first bio */ | 2139 | /* mixed attributes always follow the first bio */ |
@@ -2825,10 +2574,16 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, | |||
2825 | blk_rq_init(NULL, rq); | 2574 | blk_rq_init(NULL, rq); |
2826 | 2575 | ||
2827 | __rq_for_each_bio(bio_src, rq_src) { | 2576 | __rq_for_each_bio(bio_src, rq_src) { |
2828 | bio = bio_clone_bioset(bio_src, gfp_mask, bs); | 2577 | bio = bio_alloc_bioset(gfp_mask, bio_src->bi_max_vecs, bs); |
2829 | if (!bio) | 2578 | if (!bio) |
2830 | goto free_and_out; | 2579 | goto free_and_out; |
2831 | 2580 | ||
2581 | __bio_clone(bio, bio_src); | ||
2582 | |||
2583 | if (bio_integrity(bio_src) && | ||
2584 | bio_integrity_clone(bio, bio_src, gfp_mask, bs)) | ||
2585 | goto free_and_out; | ||
2586 | |||
2832 | if (bio_ctr && bio_ctr(bio, bio_src, data)) | 2587 | if (bio_ctr && bio_ctr(bio, bio_src, data)) |
2833 | goto free_and_out; | 2588 | goto free_and_out; |
2834 | 2589 | ||
@@ -2845,7 +2600,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, | |||
2845 | 2600 | ||
2846 | free_and_out: | 2601 | free_and_out: |
2847 | if (bio) | 2602 | if (bio) |
2848 | bio_put(bio); | 2603 | bio_free(bio, bs); |
2849 | blk_rq_unprep_clone(rq); | 2604 | blk_rq_unprep_clone(rq); |
2850 | 2605 | ||
2851 | return -ENOMEM; | 2606 | return -ENOMEM; |
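blk_rq_prep_clone() above is what request-based stacking drivers (dm-multipath style) use to duplicate a request's bios before dispatching the clone to a lower queue. A hedged usage sketch; the bio_set is assumed to be driver-owned and the completion wiring is omitted.

    #include <linux/blkdev.h>

    /* Hypothetical: clone @rq_src into a pre-allocated @clone and hand it
     * to a lower-level queue. */
    static int example_dispatch_clone(struct request_queue *lower_q,
    				  struct request *clone,
    				  struct request *rq_src,
    				  struct bio_set *bs)
    {
    	int ret;

    	/* Copies the bio chain; no per-bio constructor callback here. */
    	ret = blk_rq_prep_clone(clone, rq_src, bs, GFP_ATOMIC, NULL, NULL);
    	if (ret)
    		return ret;

    	/* rq->end_io / rq->end_io_data setup is omitted in this sketch. */
    	return blk_insert_cloned_request(lower_q, clone);
    }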
@@ -2867,20 +2622,6 @@ EXPORT_SYMBOL(kblockd_schedule_delayed_work); | |||
2867 | 2622 | ||
2868 | #define PLUG_MAGIC 0x91827364 | 2623 | #define PLUG_MAGIC 0x91827364 |
2869 | 2624 | ||
2870 | /** | ||
2871 | * blk_start_plug - initialize blk_plug and track it inside the task_struct | ||
2872 | * @plug: The &struct blk_plug that needs to be initialized | ||
2873 | * | ||
2874 | * Description: | ||
2875 | * Tracking blk_plug inside the task_struct will help with auto-flushing the | ||
2876 | * pending I/O should the task end up blocking between blk_start_plug() and | ||
2877 | * blk_finish_plug(). This is important from a performance perspective, but | ||
2878 | * also ensures that we don't deadlock. For instance, if the task is blocking | ||
2879 | * for a memory allocation, memory reclaim could end up wanting to free a | ||
2880 | * page belonging to that request that is currently residing in our private | ||
2881 | * plug. By flushing the pending I/O when the process goes to sleep, we avoid | ||
2882 | * this kind of deadlock. | ||
2883 | */ | ||
2884 | void blk_start_plug(struct blk_plug *plug) | 2625 | void blk_start_plug(struct blk_plug *plug) |
2885 | { | 2626 | { |
2886 | struct task_struct *tsk = current; | 2627 | struct task_struct *tsk = current; |
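blk_start_plug()/blk_finish_plug() bracket a batch of submissions so they can be merged and dispatched together, and the task_struct tracking above lets the scheduler flush the plug if the task sleeps mid-batch. Typical on-stack usage looks like the sketch below (illustrative only).

    #include <linux/blkdev.h>

    static void example_submit_batch(struct bio **bios, int nr)
    {
    	struct blk_plug plug;
    	int i;

    	blk_start_plug(&plug);	/* plug lives on the stack, noted in current */

    	for (i = 0; i < nr; i++)
    		submit_bio(READ, bios[i]);

    	blk_finish_plug(&plug);	/* flushes the plugged list to the queues */
    }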
@@ -2909,8 +2650,7 @@ static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b) | |||
2909 | struct request *rqa = container_of(a, struct request, queuelist); | 2650 | struct request *rqa = container_of(a, struct request, queuelist); |
2910 | struct request *rqb = container_of(b, struct request, queuelist); | 2651 | struct request *rqb = container_of(b, struct request, queuelist); |
2911 | 2652 | ||
2912 | return !(rqa->q < rqb->q || | 2653 | return !(rqa->q <= rqb->q); |
2913 | (rqa->q == rqb->q && blk_rq_pos(rqa) < blk_rq_pos(rqb))); | ||
2914 | } | 2654 | } |
2915 | 2655 | ||
2916 | /* | 2656 | /* |
@@ -2925,55 +2665,39 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth, | |||
2925 | { | 2665 | { |
2926 | trace_block_unplug(q, depth, !from_schedule); | 2666 | trace_block_unplug(q, depth, !from_schedule); |
2927 | 2667 | ||
2928 | if (from_schedule) | 2668 | /* |
2669 | * If we are punting this to kblockd, then we can safely drop | ||
2670 | * the queue_lock before waking kblockd (which needs to take | ||
2671 | * this lock). | ||
2672 | */ | ||
2673 | if (from_schedule) { | ||
2674 | spin_unlock(q->queue_lock); | ||
2929 | blk_run_queue_async(q); | 2675 | blk_run_queue_async(q); |
2930 | else | 2676 | } else { |
2931 | __blk_run_queue(q); | 2677 | __blk_run_queue(q); |
2932 | spin_unlock(q->queue_lock); | 2678 | spin_unlock(q->queue_lock); |
2679 | } | ||
2680 | |||
2933 | } | 2681 | } |
2934 | 2682 | ||
2935 | static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule) | 2683 | static void flush_plug_callbacks(struct blk_plug *plug) |
2936 | { | 2684 | { |
2937 | LIST_HEAD(callbacks); | 2685 | LIST_HEAD(callbacks); |
2938 | 2686 | ||
2939 | while (!list_empty(&plug->cb_list)) { | 2687 | if (list_empty(&plug->cb_list)) |
2940 | list_splice_init(&plug->cb_list, &callbacks); | 2688 | return; |
2689 | |||
2690 | list_splice_init(&plug->cb_list, &callbacks); | ||
2941 | 2691 | ||
2942 | while (!list_empty(&callbacks)) { | 2692 | while (!list_empty(&callbacks)) { |
2943 | struct blk_plug_cb *cb = list_first_entry(&callbacks, | 2693 | struct blk_plug_cb *cb = list_first_entry(&callbacks, |
2944 | struct blk_plug_cb, | 2694 | struct blk_plug_cb, |
2945 | list); | 2695 | list); |
2946 | list_del(&cb->list); | 2696 | list_del(&cb->list); |
2947 | cb->callback(cb, from_schedule); | 2697 | cb->callback(cb); |
2948 | } | ||
2949 | } | 2698 | } |
2950 | } | 2699 | } |
2951 | 2700 | ||
2952 | struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data, | ||
2953 | int size) | ||
2954 | { | ||
2955 | struct blk_plug *plug = current->plug; | ||
2956 | struct blk_plug_cb *cb; | ||
2957 | |||
2958 | if (!plug) | ||
2959 | return NULL; | ||
2960 | |||
2961 | list_for_each_entry(cb, &plug->cb_list, list) | ||
2962 | if (cb->callback == unplug && cb->data == data) | ||
2963 | return cb; | ||
2964 | |||
2965 | /* Not currently on the callback list */ | ||
2966 | BUG_ON(size < sizeof(*cb)); | ||
2967 | cb = kzalloc(size, GFP_ATOMIC); | ||
2968 | if (cb) { | ||
2969 | cb->data = data; | ||
2970 | cb->callback = unplug; | ||
2971 | list_add(&cb->list, &plug->cb_list); | ||
2972 | } | ||
2973 | return cb; | ||
2974 | } | ||
2975 | EXPORT_SYMBOL(blk_check_plugged); | ||
2976 | |||
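blk_check_plugged(), present in only one side of this hunk, lets a driver attach a per-plug callback that runs when the plug is flushed (md uses this pattern to batch work). A hedged sketch of a caller; struct example_plug_cb and the example_* functions are invented.

    #include <linux/blkdev.h>
    #include <linux/slab.h>

    struct example_plug_cb {		/* made-up per-plug state */
    	struct blk_plug_cb cb;		/* embedded callback header */
    	unsigned int pending;		/* e.g. IOs batched under this plug */
    };

    /* Runs when the plug is flushed; @from_schedule is true when the flush
     * happens because the task is about to sleep. */
    static void example_unplug(struct blk_plug_cb *cb, bool from_schedule)
    {
    	struct example_plug_cb *ecb = container_of(cb, struct example_plug_cb, cb);

    	/* ... kick off the ecb->pending batched work here ... */
    	kfree(ecb);			/* allocation came from blk_check_plugged() */
    }

    static void example_note_plugged(void *driver_data)
    {
    	struct blk_plug_cb *cb;

    	/* Registered once per (callback, data) pair per plug; returns NULL
    	 * when the current task is not plugged. */
    	cb = blk_check_plugged(example_unplug, driver_data,
    			       sizeof(struct example_plug_cb));
    	if (cb)
    		container_of(cb, struct example_plug_cb, cb)->pending++;
    }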
2977 | void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) | 2701 | void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) |
2978 | { | 2702 | { |
2979 | struct request_queue *q; | 2703 | struct request_queue *q; |
@@ -2984,7 +2708,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) | |||
2984 | 2708 | ||
2985 | BUG_ON(plug->magic != PLUG_MAGIC); | 2709 | BUG_ON(plug->magic != PLUG_MAGIC); |
2986 | 2710 | ||
2987 | flush_plug_callbacks(plug, from_schedule); | 2711 | flush_plug_callbacks(plug); |
2988 | if (list_empty(&plug->list)) | 2712 | if (list_empty(&plug->list)) |
2989 | return; | 2713 | return; |
2990 | 2714 | ||
@@ -3017,15 +2741,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) | |||
3017 | depth = 0; | 2741 | depth = 0; |
3018 | spin_lock(q->queue_lock); | 2742 | spin_lock(q->queue_lock); |
3019 | } | 2743 | } |
3020 | |||
3021 | /* | ||
3022 | * Short-circuit if @q is dead | ||
3023 | */ | ||
3024 | if (unlikely(blk_queue_dying(q))) { | ||
3025 | __blk_end_request_all(rq, -ENODEV); | ||
3026 | continue; | ||
3027 | } | ||
3028 | |||
3029 | /* | 2744 | /* |
3030 | * rq is already accounted, so use raw insert | 2745 | * rq is already accounted, so use raw insert |
3031 | */ | 2746 | */ |
diff --git a/block/blk-exec.c b/block/blk-exec.c index 74638ec234c..a1ebceb332f 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c | |||
@@ -43,42 +43,29 @@ static void blk_end_sync_rq(struct request *rq, int error) | |||
43 | * Description: | 43 | * Description: |
44 | * Insert a fully prepared request at the back of the I/O scheduler queue | 44 | * Insert a fully prepared request at the back of the I/O scheduler queue |
45 | * for execution. Don't wait for completion. | 45 | * for execution. Don't wait for completion. |
46 | * | ||
47 | * Note: | ||
48 | * This function will invoke @done directly if the queue is dead. | ||
49 | */ | 46 | */ |
50 | void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, | 47 | void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, |
51 | struct request *rq, int at_head, | 48 | struct request *rq, int at_head, |
52 | rq_end_io_fn *done) | 49 | rq_end_io_fn *done) |
53 | { | 50 | { |
54 | int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; | 51 | int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; |
55 | bool is_pm_resume; | ||
56 | |||
57 | WARN_ON(irqs_disabled()); | ||
58 | |||
59 | rq->rq_disk = bd_disk; | ||
60 | rq->end_io = done; | ||
61 | /* | ||
62 | * need to check this before __blk_run_queue(), because rq can | ||
63 | * be freed before that returns. | ||
64 | */ | ||
65 | is_pm_resume = rq->cmd_type == REQ_TYPE_PM_RESUME; | ||
66 | |||
67 | spin_lock_irq(q->queue_lock); | ||
68 | 52 | ||
69 | if (unlikely(blk_queue_dying(q))) { | 53 | if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { |
70 | rq->errors = -ENXIO; | 54 | rq->errors = -ENXIO; |
71 | if (rq->end_io) | 55 | if (rq->end_io) |
72 | rq->end_io(rq, rq->errors); | 56 | rq->end_io(rq, rq->errors); |
73 | spin_unlock_irq(q->queue_lock); | ||
74 | return; | 57 | return; |
75 | } | 58 | } |
76 | 59 | ||
60 | rq->rq_disk = bd_disk; | ||
61 | rq->end_io = done; | ||
62 | WARN_ON(irqs_disabled()); | ||
63 | spin_lock_irq(q->queue_lock); | ||
77 | __elv_add_request(q, rq, where); | 64 | __elv_add_request(q, rq, where); |
78 | __blk_run_queue(q); | 65 | __blk_run_queue(q); |
79 | /* the queue is stopped so it won't be run */ | 66 | /* the queue is stopped so it won't be run */ |
80 | if (is_pm_resume) | 67 | if (rq->cmd_type == REQ_TYPE_PM_RESUME) |
81 | __blk_run_queue_uncond(q); | 68 | q->request_fn(q); |
82 | spin_unlock_irq(q->queue_lock); | 69 | spin_unlock_irq(q->queue_lock); |
83 | } | 70 | } |
84 | EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); | 71 | EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); |
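blk_execute_rq_nowait() queues a fully prepared request and returns immediately; completion is reported through the done callback. A hedged example of issuing one, with the request preparation abbreviated and the assumption that ->end_io runs with the queue lock held on this path (hence __blk_put_request).

    #include <linux/blkdev.h>

    static void example_done(struct request *rq, int error)
    {
    	/* Completion context, queue lock held: drop our reference. */
    	__blk_put_request(rq->q, rq);
    }

    static int example_issue_async(struct request_queue *q, struct gendisk *disk)
    {
    	struct request *rq = blk_get_request(q, READ, GFP_KERNEL);

    	if (!rq)
    		return -ENOMEM;

    	rq->cmd_type = REQ_TYPE_SPECIAL;	/* driver-interpreted request */
    	blk_execute_rq_nowait(q, disk, rq, 0 /* at tail */, example_done);
    	return 0;
    }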
diff --git a/block/blk-integrity.c b/block/blk-integrity.c index da2a818c3a9..129b9e209a3 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c | |||
@@ -24,7 +24,6 @@ | |||
24 | #include <linux/mempool.h> | 24 | #include <linux/mempool.h> |
25 | #include <linux/bio.h> | 25 | #include <linux/bio.h> |
26 | #include <linux/scatterlist.h> | 26 | #include <linux/scatterlist.h> |
27 | #include <linux/export.h> | ||
28 | #include <linux/slab.h> | 27 | #include <linux/slab.h> |
29 | 28 | ||
30 | #include "blk.h" | 29 | #include "blk.h" |
diff --git a/block/blk-ioc.c b/block/blk-ioc.c index fab4cdd3f7b..6f9bbd97865 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c | |||
@@ -16,185 +16,52 @@ | |||
16 | */ | 16 | */ |
17 | static struct kmem_cache *iocontext_cachep; | 17 | static struct kmem_cache *iocontext_cachep; |
18 | 18 | ||
19 | /** | 19 | static void cfq_dtor(struct io_context *ioc) |
20 | * get_io_context - increment reference count to io_context | ||
21 | * @ioc: io_context to get | ||
22 | * | ||
23 | * Increment reference count to @ioc. | ||
24 | */ | ||
25 | void get_io_context(struct io_context *ioc) | ||
26 | { | ||
27 | BUG_ON(atomic_long_read(&ioc->refcount) <= 0); | ||
28 | atomic_long_inc(&ioc->refcount); | ||
29 | } | ||
30 | EXPORT_SYMBOL(get_io_context); | ||
31 | |||
32 | static void icq_free_icq_rcu(struct rcu_head *head) | ||
33 | { | ||
34 | struct io_cq *icq = container_of(head, struct io_cq, __rcu_head); | ||
35 | |||
36 | kmem_cache_free(icq->__rcu_icq_cache, icq); | ||
37 | } | ||
38 | |||
39 | /* Exit an icq. Called with both ioc and q locked. */ | ||
40 | static void ioc_exit_icq(struct io_cq *icq) | ||
41 | { | ||
42 | struct elevator_type *et = icq->q->elevator->type; | ||
43 | |||
44 | if (icq->flags & ICQ_EXITED) | ||
45 | return; | ||
46 | |||
47 | if (et->ops.elevator_exit_icq_fn) | ||
48 | et->ops.elevator_exit_icq_fn(icq); | ||
49 | |||
50 | icq->flags |= ICQ_EXITED; | ||
51 | } | ||
52 | |||
53 | /* Release an icq. Called with both ioc and q locked. */ | ||
54 | static void ioc_destroy_icq(struct io_cq *icq) | ||
55 | { | ||
56 | struct io_context *ioc = icq->ioc; | ||
57 | struct request_queue *q = icq->q; | ||
58 | struct elevator_type *et = q->elevator->type; | ||
59 | |||
60 | lockdep_assert_held(&ioc->lock); | ||
61 | lockdep_assert_held(q->queue_lock); | ||
62 | |||
63 | radix_tree_delete(&ioc->icq_tree, icq->q->id); | ||
64 | hlist_del_init(&icq->ioc_node); | ||
65 | list_del_init(&icq->q_node); | ||
66 | |||
67 | /* | ||
68 | * Both setting lookup hint to and clearing it from @icq are done | ||
69 | * under queue_lock. If it's not pointing to @icq now, it never | ||
70 | * will. Hint assignment itself can race safely. | ||
71 | */ | ||
72 | if (rcu_dereference_raw(ioc->icq_hint) == icq) | ||
73 | rcu_assign_pointer(ioc->icq_hint, NULL); | ||
74 | |||
75 | ioc_exit_icq(icq); | ||
76 | |||
77 | /* | ||
78 | * @icq->q might have gone away by the time RCU callback runs | ||
79 | * making it impossible to determine icq_cache. Record it in @icq. | ||
80 | */ | ||
81 | icq->__rcu_icq_cache = et->icq_cache; | ||
82 | call_rcu(&icq->__rcu_head, icq_free_icq_rcu); | ||
83 | } | ||
84 | |||
85 | /* | ||
86 | * Slow path for ioc release in put_io_context(). Performs double-lock | ||
87 | * dancing to unlink all icq's and then frees ioc. | ||
88 | */ | ||
89 | static void ioc_release_fn(struct work_struct *work) | ||
90 | { | 20 | { |
91 | struct io_context *ioc = container_of(work, struct io_context, | 21 | if (!hlist_empty(&ioc->cic_list)) { |
92 | release_work); | 22 | struct cfq_io_context *cic; |
93 | unsigned long flags; | ||
94 | 23 | ||
95 | /* | 24 | cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context, |
96 | * Exiting icq may call into put_io_context() through elevator | 25 | cic_list); |
97 | * which will trigger lockdep warning. The ioc's are guaranteed to | 26 | cic->dtor(ioc); |
98 | * be different, use a different locking subclass here. Use | ||
99 | * irqsave variant as there's no spin_lock_irq_nested(). | ||
100 | */ | ||
101 | spin_lock_irqsave_nested(&ioc->lock, flags, 1); | ||
102 | |||
103 | while (!hlist_empty(&ioc->icq_list)) { | ||
104 | struct io_cq *icq = hlist_entry(ioc->icq_list.first, | ||
105 | struct io_cq, ioc_node); | ||
106 | struct request_queue *q = icq->q; | ||
107 | |||
108 | if (spin_trylock(q->queue_lock)) { | ||
109 | ioc_destroy_icq(icq); | ||
110 | spin_unlock(q->queue_lock); | ||
111 | } else { | ||
112 | spin_unlock_irqrestore(&ioc->lock, flags); | ||
113 | cpu_relax(); | ||
114 | spin_lock_irqsave_nested(&ioc->lock, flags, 1); | ||
115 | } | ||
116 | } | 27 | } |
117 | |||
118 | spin_unlock_irqrestore(&ioc->lock, flags); | ||
119 | |||
120 | kmem_cache_free(iocontext_cachep, ioc); | ||
121 | } | 28 | } |
122 | 29 | ||
123 | /** | 30 | /* |
124 | * put_io_context - put a reference of io_context | 31 | * IO Context helper functions. put_io_context() returns 1 if there are no |
125 | * @ioc: io_context to put | 32 | * more users of this io context, 0 otherwise. |
126 | * | ||
127 | * Decrement reference count of @ioc and release it if the count reaches | ||
128 | * zero. | ||
129 | */ | 33 | */ |
130 | void put_io_context(struct io_context *ioc) | 34 | int put_io_context(struct io_context *ioc) |
131 | { | 35 | { |
132 | unsigned long flags; | ||
133 | bool free_ioc = false; | ||
134 | |||
135 | if (ioc == NULL) | 36 | if (ioc == NULL) |
136 | return; | 37 | return 1; |
137 | 38 | ||
138 | BUG_ON(atomic_long_read(&ioc->refcount) <= 0); | 39 | BUG_ON(atomic_long_read(&ioc->refcount) == 0); |
139 | 40 | ||
140 | /* | ||
141 | * Releasing ioc requires reverse order double locking and we may | ||
142 | * already be holding a queue_lock. Do it asynchronously from wq. | ||
143 | */ | ||
144 | if (atomic_long_dec_and_test(&ioc->refcount)) { | 41 | if (atomic_long_dec_and_test(&ioc->refcount)) { |
145 | spin_lock_irqsave(&ioc->lock, flags); | 42 | rcu_read_lock(); |
146 | if (!hlist_empty(&ioc->icq_list)) | 43 | cfq_dtor(ioc); |
147 | schedule_work(&ioc->release_work); | 44 | rcu_read_unlock(); |
148 | else | ||
149 | free_ioc = true; | ||
150 | spin_unlock_irqrestore(&ioc->lock, flags); | ||
151 | } | ||
152 | 45 | ||
153 | if (free_ioc) | ||
154 | kmem_cache_free(iocontext_cachep, ioc); | 46 | kmem_cache_free(iocontext_cachep, ioc); |
47 | return 1; | ||
48 | } | ||
49 | return 0; | ||
155 | } | 50 | } |
156 | EXPORT_SYMBOL(put_io_context); | 51 | EXPORT_SYMBOL(put_io_context); |
157 | 52 | ||
158 | /** | 53 | static void cfq_exit(struct io_context *ioc) |
159 | * put_io_context_active - put active reference on ioc | ||
160 | * @ioc: ioc of interest | ||
161 | * | ||
162 | * Undo get_io_context_active(). If active reference reaches zero after | ||
163 | * put, @ioc can never issue further IOs and ioscheds are notified. | ||
164 | */ | ||
165 | void put_io_context_active(struct io_context *ioc) | ||
166 | { | 54 | { |
167 | struct hlist_node *n; | 55 | rcu_read_lock(); |
168 | unsigned long flags; | ||
169 | struct io_cq *icq; | ||
170 | 56 | ||
171 | if (!atomic_dec_and_test(&ioc->active_ref)) { | 57 | if (!hlist_empty(&ioc->cic_list)) { |
172 | put_io_context(ioc); | 58 | struct cfq_io_context *cic; |
173 | return; | ||
174 | } | ||
175 | 59 | ||
176 | /* | 60 | cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context, |
177 | * Need ioc lock to walk icq_list and q lock to exit icq. Perform | 61 | cic_list); |
178 | * reverse double locking. Read comment in ioc_release_fn() for | 62 | cic->exit(ioc); |
179 | * explanation on the nested locking annotation. | ||
180 | */ | ||
181 | retry: | ||
182 | spin_lock_irqsave_nested(&ioc->lock, flags, 1); | ||
183 | hlist_for_each_entry(icq, n, &ioc->icq_list, ioc_node) { | ||
184 | if (icq->flags & ICQ_EXITED) | ||
185 | continue; | ||
186 | if (spin_trylock(icq->q->queue_lock)) { | ||
187 | ioc_exit_icq(icq); | ||
188 | spin_unlock(icq->q->queue_lock); | ||
189 | } else { | ||
190 | spin_unlock_irqrestore(&ioc->lock, flags); | ||
191 | cpu_relax(); | ||
192 | goto retry; | ||
193 | } | ||
194 | } | 63 | } |
195 | spin_unlock_irqrestore(&ioc->lock, flags); | 64 | rcu_read_unlock(); |
196 | |||
197 | put_io_context(ioc); | ||
198 | } | 65 | } |
199 | 66 | ||
200 | /* Called by the exiting task */ | 67 | /* Called by the exiting task */ |
@@ -207,197 +74,86 @@ void exit_io_context(struct task_struct *task) | |||
207 | task->io_context = NULL; | 74 | task->io_context = NULL; |
208 | task_unlock(task); | 75 | task_unlock(task); |
209 | 76 | ||
210 | atomic_dec(&ioc->nr_tasks); | 77 | if (atomic_dec_and_test(&ioc->nr_tasks)) |
211 | put_io_context_active(ioc); | 78 | cfq_exit(ioc); |
212 | } | ||
213 | 79 | ||
214 | /** | 80 | put_io_context(ioc); |
215 | * ioc_clear_queue - break any ioc association with the specified queue | ||
216 | * @q: request_queue being cleared | ||
217 | * | ||
218 | * Walk @q->icq_list and exit all io_cq's. Must be called with @q locked. | ||
219 | */ | ||
220 | void ioc_clear_queue(struct request_queue *q) | ||
221 | { | ||
222 | lockdep_assert_held(q->queue_lock); | ||
223 | |||
224 | while (!list_empty(&q->icq_list)) { | ||
225 | struct io_cq *icq = list_entry(q->icq_list.next, | ||
226 | struct io_cq, q_node); | ||
227 | struct io_context *ioc = icq->ioc; | ||
228 | |||
229 | spin_lock(&ioc->lock); | ||
230 | ioc_destroy_icq(icq); | ||
231 | spin_unlock(&ioc->lock); | ||
232 | } | ||
233 | } | 81 | } |
234 | 82 | ||
235 | int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node) | 83 | struct io_context *alloc_io_context(gfp_t gfp_flags, int node) |
236 | { | 84 | { |
237 | struct io_context *ioc; | 85 | struct io_context *ioc; |
238 | int ret; | ||
239 | |||
240 | ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO, | ||
241 | node); | ||
242 | if (unlikely(!ioc)) | ||
243 | return -ENOMEM; | ||
244 | |||
245 | /* initialize */ | ||
246 | atomic_long_set(&ioc->refcount, 1); | ||
247 | atomic_set(&ioc->nr_tasks, 1); | ||
248 | atomic_set(&ioc->active_ref, 1); | ||
249 | spin_lock_init(&ioc->lock); | ||
250 | INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH); | ||
251 | INIT_HLIST_HEAD(&ioc->icq_list); | ||
252 | INIT_WORK(&ioc->release_work, ioc_release_fn); | ||
253 | 86 | ||
254 | /* | 87 | ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node); |
255 | * Try to install. ioc shouldn't be installed if someone else | 88 | if (ioc) { |
256 | * already did or @task, which isn't %current, is exiting. Note | 89 | atomic_long_set(&ioc->refcount, 1); |
257 | * that we need to allow ioc creation on exiting %current as exit | 90 | atomic_set(&ioc->nr_tasks, 1); |
258 | * path may issue IOs from e.g. exit_files(). The exit path is | 91 | spin_lock_init(&ioc->lock); |
259 | * responsible for not issuing IO after exit_io_context(). | 92 | ioc->ioprio_changed = 0; |
260 | */ | 93 | ioc->ioprio = 0; |
261 | task_lock(task); | 94 | ioc->last_waited = 0; /* doesn't matter... */ |
262 | if (!task->io_context && | 95 | ioc->nr_batch_requests = 0; /* because this is 0 */ |
263 | (task == current || !(task->flags & PF_EXITING))) | 96 | INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH); |
264 | task->io_context = ioc; | 97 | INIT_HLIST_HEAD(&ioc->cic_list); |
265 | else | 98 | ioc->ioc_data = NULL; |
266 | kmem_cache_free(iocontext_cachep, ioc); | 99 | #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) |
267 | 100 | ioc->cgroup_changed = 0; | |
268 | ret = task->io_context ? 0 : -EBUSY; | 101 | #endif |
269 | 102 | } | |
270 | task_unlock(task); | ||
271 | 103 | ||
272 | return ret; | 104 | return ioc; |
273 | } | 105 | } |
274 | 106 | ||
275 | /** | 107 | /* |
276 | * get_task_io_context - get io_context of a task | 108 | * If the current task has no IO context then create one and initialise it. |
277 | * @task: task of interest | 109 | * Otherwise, return its existing IO context. |
278 | * @gfp_flags: allocation flags, used if allocation is necessary | ||
279 | * @node: allocation node, used if allocation is necessary | ||
280 | * | ||
281 | * Return io_context of @task. If it doesn't exist, it is created with | ||
282 | * @gfp_flags and @node. The returned io_context has its reference count | ||
283 | * incremented. | ||
284 | * | 110 | * |
285 | * This function always goes through task_lock() and it's better to use | 111 | * This returned IO context doesn't have a specifically elevated refcount, |
286 | * %current->io_context + get_io_context() for %current. | 112 | * but since the current task itself holds a reference, the context can be |
113 | * used in general code, so long as it stays within `current` context. | ||
287 | */ | 114 | */ |
288 | struct io_context *get_task_io_context(struct task_struct *task, | 115 | struct io_context *current_io_context(gfp_t gfp_flags, int node) |
289 | gfp_t gfp_flags, int node) | ||
290 | { | 116 | { |
291 | struct io_context *ioc; | 117 | struct task_struct *tsk = current; |
292 | 118 | struct io_context *ret; | |
293 | might_sleep_if(gfp_flags & __GFP_WAIT); | 119 | |
294 | 120 | ret = tsk->io_context; | |
295 | do { | 121 | if (likely(ret)) |
296 | task_lock(task); | 122 | return ret; |
297 | ioc = task->io_context; | 123 | |
298 | if (likely(ioc)) { | 124 | ret = alloc_io_context(gfp_flags, node); |
299 | get_io_context(ioc); | 125 | if (ret) { |
300 | task_unlock(task); | 126 | /* make sure set_task_ioprio() sees the settings above */ |
301 | return ioc; | 127 | smp_wmb(); |
302 | } | 128 | tsk->io_context = ret; |
303 | task_unlock(task); | 129 | } |
304 | } while (!create_task_io_context(task, gfp_flags, node)); | ||
305 | 130 | ||
306 | return NULL; | 131 | return ret; |
307 | } | 132 | } |
308 | EXPORT_SYMBOL(get_task_io_context); | ||
309 | 133 | ||
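One side of this hunk provides get_task_io_context(), which takes the task explicitly and returns a referenced io_context, replacing the current_io_context()/get_io_context(gfp, node) pair on the other side. A hedged sketch of the newer calling convention:

    #include <linux/iocontext.h>
    #include <linux/sched.h>

    static void example_peek_ioc(struct task_struct *task)
    {
    	/* May allocate; returns with a reference held, or NULL on OOM. */
    	struct io_context *ioc = get_task_io_context(task, GFP_NOIO, NUMA_NO_NODE);

    	if (!ioc)
    		return;

    	/* ... inspect ioc (e.g. its io priority) under ioc->lock if needed ... */

    	put_io_context(ioc);	/* drop the reference taken above */
    }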
310 | /** | 134 | /* |
311 | * ioc_lookup_icq - lookup io_cq from ioc | 135 | * If the current task has no IO context then create one and initialise it. |
312 | * @ioc: the associated io_context | 136 | * If it does have a context, take a ref on it. |
313 | * @q: the associated request_queue | ||
314 | * | 137 | * |
315 | * Look up io_cq associated with @ioc - @q pair from @ioc. Must be called | 138 | * This is always called in the context of the task which submitted the I/O. |
316 | * with @q->queue_lock held. | ||
317 | */ | 139 | */ |
318 | struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q) | 140 | struct io_context *get_io_context(gfp_t gfp_flags, int node) |
319 | { | 141 | { |
320 | struct io_cq *icq; | 142 | struct io_context *ioc = NULL; |
321 | |||
322 | lockdep_assert_held(q->queue_lock); | ||
323 | 143 | ||
324 | /* | 144 | /* |
325 | * icq's are indexed from @ioc using radix tree and hint pointer, | 145 | * Check for unlikely race with exiting task. ioc ref count is |
326 | * both of which are protected with RCU. All removals are done | 146 | * zero when ioc is being detached. |
327 | * holding both q and ioc locks, and we're holding q lock - if we | ||
328 | * find a icq which points to us, it's guaranteed to be valid. | ||
329 | */ | 147 | */ |
330 | rcu_read_lock(); | 148 | do { |
331 | icq = rcu_dereference(ioc->icq_hint); | 149 | ioc = current_io_context(gfp_flags, node); |
332 | if (icq && icq->q == q) | 150 | if (unlikely(!ioc)) |
333 | goto out; | 151 | break; |
334 | 152 | } while (!atomic_long_inc_not_zero(&ioc->refcount)); | |
335 | icq = radix_tree_lookup(&ioc->icq_tree, q->id); | ||
336 | if (icq && icq->q == q) | ||
337 | rcu_assign_pointer(ioc->icq_hint, icq); /* allowed to race */ | ||
338 | else | ||
339 | icq = NULL; | ||
340 | out: | ||
341 | rcu_read_unlock(); | ||
342 | return icq; | ||
343 | } | ||
344 | EXPORT_SYMBOL(ioc_lookup_icq); | ||
345 | |||
346 | /** | ||
347 | * ioc_create_icq - create and link io_cq | ||
348 | * @ioc: io_context of interest | ||
349 | * @q: request_queue of interest | ||
350 | * @gfp_mask: allocation mask | ||
351 | * | ||
352 | * Make sure io_cq linking @ioc and @q exists. If icq doesn't exist, they | ||
353 | * will be created using @gfp_mask. | ||
354 | * | ||
355 | * The caller is responsible for ensuring @ioc won't go away and @q is | ||
356 | * alive and will stay alive until this function returns. | ||
357 | */ | ||
358 | struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, | ||
359 | gfp_t gfp_mask) | ||
360 | { | ||
361 | struct elevator_type *et = q->elevator->type; | ||
362 | struct io_cq *icq; | ||
363 | |||
364 | /* allocate stuff */ | ||
365 | icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO, | ||
366 | q->node); | ||
367 | if (!icq) | ||
368 | return NULL; | ||
369 | |||
370 | if (radix_tree_preload(gfp_mask) < 0) { | ||
371 | kmem_cache_free(et->icq_cache, icq); | ||
372 | return NULL; | ||
373 | } | ||
374 | |||
375 | icq->ioc = ioc; | ||
376 | icq->q = q; | ||
377 | INIT_LIST_HEAD(&icq->q_node); | ||
378 | INIT_HLIST_NODE(&icq->ioc_node); | ||
379 | |||
380 | /* lock both q and ioc and try to link @icq */ | ||
381 | spin_lock_irq(q->queue_lock); | ||
382 | spin_lock(&ioc->lock); | ||
383 | |||
384 | if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) { | ||
385 | hlist_add_head(&icq->ioc_node, &ioc->icq_list); | ||
386 | list_add(&icq->q_node, &q->icq_list); | ||
387 | if (et->ops.elevator_init_icq_fn) | ||
388 | et->ops.elevator_init_icq_fn(icq); | ||
389 | } else { | ||
390 | kmem_cache_free(et->icq_cache, icq); | ||
391 | icq = ioc_lookup_icq(ioc, q); | ||
392 | if (!icq) | ||
393 | printk(KERN_ERR "cfq: icq link failed!\n"); | ||
394 | } | ||
395 | 153 | ||
396 | spin_unlock(&ioc->lock); | 154 | return ioc; |
397 | spin_unlock_irq(q->queue_lock); | ||
398 | radix_tree_preload_end(); | ||
399 | return icq; | ||
400 | } | 155 | } |
156 | EXPORT_SYMBOL(get_io_context); | ||
401 | 157 | ||
402 | static int __init blk_ioc_init(void) | 158 | static int __init blk_ioc_init(void) |
403 | { | 159 | { |
diff --git a/block/blk-lib.c b/block/blk-lib.c index b3a1f2b70b3..2b461b496a7 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c | |||
@@ -43,12 +43,10 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, | |||
43 | DECLARE_COMPLETION_ONSTACK(wait); | 43 | DECLARE_COMPLETION_ONSTACK(wait); |
44 | struct request_queue *q = bdev_get_queue(bdev); | 44 | struct request_queue *q = bdev_get_queue(bdev); |
45 | int type = REQ_WRITE | REQ_DISCARD; | 45 | int type = REQ_WRITE | REQ_DISCARD; |
46 | sector_t max_discard_sectors; | 46 | unsigned int max_discard_sectors; |
47 | sector_t granularity, alignment; | ||
48 | struct bio_batch bb; | 47 | struct bio_batch bb; |
49 | struct bio *bio; | 48 | struct bio *bio; |
50 | int ret = 0; | 49 | int ret = 0; |
51 | struct blk_plug plug; | ||
52 | 50 | ||
53 | if (!q) | 51 | if (!q) |
54 | return -ENXIO; | 52 | return -ENXIO; |
@@ -56,21 +54,18 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, | |||
56 | if (!blk_queue_discard(q)) | 54 | if (!blk_queue_discard(q)) |
57 | return -EOPNOTSUPP; | 55 | return -EOPNOTSUPP; |
58 | 56 | ||
59 | /* Zero-sector (unknown) and one-sector granularities are the same. */ | ||
60 | granularity = max(q->limits.discard_granularity >> 9, 1U); | ||
61 | alignment = bdev_discard_alignment(bdev) >> 9; | ||
62 | alignment = sector_div(alignment, granularity); | ||
63 | |||
64 | /* | 57 | /* |
65 | * Ensure that max_discard_sectors is of the proper | 58 | * Ensure that max_discard_sectors is of the proper |
66 | * granularity, so that requests stay aligned after a split. | 59 | * granularity |
67 | */ | 60 | */ |
68 | max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9); | 61 | max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9); |
69 | sector_div(max_discard_sectors, granularity); | ||
70 | max_discard_sectors *= granularity; | ||
71 | if (unlikely(!max_discard_sectors)) { | 62 | if (unlikely(!max_discard_sectors)) { |
72 | /* Avoid infinite loop below. Being cautious never hurts. */ | 63 | /* Avoid infinite loop below. Being cautious never hurts. */ |
73 | return -EOPNOTSUPP; | 64 | return -EOPNOTSUPP; |
65 | } else if (q->limits.discard_granularity) { | ||
66 | unsigned int disc_sects = q->limits.discard_granularity >> 9; | ||
67 | |||
68 | max_discard_sectors &= ~(disc_sects - 1); | ||
74 | } | 69 | } |
75 | 70 | ||
76 | if (flags & BLKDEV_DISCARD_SECURE) { | 71 | if (flags & BLKDEV_DISCARD_SECURE) { |
@@ -83,119 +78,29 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, | |||
83 | bb.flags = 1 << BIO_UPTODATE; | 78 | bb.flags = 1 << BIO_UPTODATE; |
84 | bb.wait = &wait; | 79 | bb.wait = &wait; |
85 | 80 | ||
86 | blk_start_plug(&plug); | ||
87 | while (nr_sects) { | 81 | while (nr_sects) { |
88 | unsigned int req_sects; | ||
89 | sector_t end_sect, tmp; | ||
90 | |||
91 | bio = bio_alloc(gfp_mask, 1); | 82 | bio = bio_alloc(gfp_mask, 1); |
92 | if (!bio) { | 83 | if (!bio) { |
93 | ret = -ENOMEM; | 84 | ret = -ENOMEM; |
94 | break; | 85 | break; |
95 | } | 86 | } |
96 | 87 | ||
97 | req_sects = min_t(sector_t, nr_sects, max_discard_sectors); | ||
98 | |||
99 | /* | ||
100 | * If splitting a request, and the next starting sector would be | ||
101 | * misaligned, stop the discard at the previous aligned sector. | ||
102 | */ | ||
103 | end_sect = sector + req_sects; | ||
104 | tmp = end_sect; | ||
105 | if (req_sects < nr_sects && | ||
106 | sector_div(tmp, granularity) != alignment) { | ||
107 | end_sect = end_sect - alignment; | ||
108 | sector_div(end_sect, granularity); | ||
109 | end_sect = end_sect * granularity + alignment; | ||
110 | req_sects = end_sect - sector; | ||
111 | } | ||
112 | |||
113 | bio->bi_sector = sector; | 88 | bio->bi_sector = sector; |
114 | bio->bi_end_io = bio_batch_end_io; | 89 | bio->bi_end_io = bio_batch_end_io; |
115 | bio->bi_bdev = bdev; | 90 | bio->bi_bdev = bdev; |
116 | bio->bi_private = &bb; | 91 | bio->bi_private = &bb; |
117 | 92 | ||
118 | bio->bi_size = req_sects << 9; | 93 | if (nr_sects > max_discard_sectors) { |
119 | nr_sects -= req_sects; | 94 | bio->bi_size = max_discard_sectors << 9; |
120 | sector = end_sect; | 95 | nr_sects -= max_discard_sectors; |
121 | 96 | sector += max_discard_sectors; | |
122 | atomic_inc(&bb.done); | ||
123 | submit_bio(type, bio); | ||
124 | } | ||
125 | blk_finish_plug(&plug); | ||
126 | |||
127 | /* Wait for bios in-flight */ | ||
128 | if (!atomic_dec_and_test(&bb.done)) | ||
129 | wait_for_completion(&wait); | ||
130 | |||
131 | if (!test_bit(BIO_UPTODATE, &bb.flags)) | ||
132 | ret = -EIO; | ||
133 | |||
134 | return ret; | ||
135 | } | ||
136 | EXPORT_SYMBOL(blkdev_issue_discard); | ||
137 | |||
138 | /** | ||
139 | * blkdev_issue_write_same - queue a write same operation | ||
140 | * @bdev: target blockdev | ||
141 | * @sector: start sector | ||
142 | * @nr_sects: number of sectors to write | ||
143 | * @gfp_mask: memory allocation flags (for bio_alloc) | ||
144 | * @page: page containing data to write | ||
145 | * | ||
146 | * Description: | ||
147 | * Issue a write same request for the sectors in question. | ||
148 | */ | ||
149 | int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, | ||
150 | sector_t nr_sects, gfp_t gfp_mask, | ||
151 | struct page *page) | ||
152 | { | ||
153 | DECLARE_COMPLETION_ONSTACK(wait); | ||
154 | struct request_queue *q = bdev_get_queue(bdev); | ||
155 | unsigned int max_write_same_sectors; | ||
156 | struct bio_batch bb; | ||
157 | struct bio *bio; | ||
158 | int ret = 0; | ||
159 | |||
160 | if (!q) | ||
161 | return -ENXIO; | ||
162 | |||
163 | max_write_same_sectors = q->limits.max_write_same_sectors; | ||
164 | |||
165 | if (max_write_same_sectors == 0) | ||
166 | return -EOPNOTSUPP; | ||
167 | |||
168 | atomic_set(&bb.done, 1); | ||
169 | bb.flags = 1 << BIO_UPTODATE; | ||
170 | bb.wait = &wait; | ||
171 | |||
172 | while (nr_sects) { | ||
173 | bio = bio_alloc(gfp_mask, 1); | ||
174 | if (!bio) { | ||
175 | ret = -ENOMEM; | ||
176 | break; | ||
177 | } | ||
178 | |||
179 | bio->bi_sector = sector; | ||
180 | bio->bi_end_io = bio_batch_end_io; | ||
181 | bio->bi_bdev = bdev; | ||
182 | bio->bi_private = &bb; | ||
183 | bio->bi_vcnt = 1; | ||
184 | bio->bi_io_vec->bv_page = page; | ||
185 | bio->bi_io_vec->bv_offset = 0; | ||
186 | bio->bi_io_vec->bv_len = bdev_logical_block_size(bdev); | ||
187 | |||
188 | if (nr_sects > max_write_same_sectors) { | ||
189 | bio->bi_size = max_write_same_sectors << 9; | ||
190 | nr_sects -= max_write_same_sectors; | ||
191 | sector += max_write_same_sectors; | ||
192 | } else { | 97 | } else { |
193 | bio->bi_size = nr_sects << 9; | 98 | bio->bi_size = nr_sects << 9; |
194 | nr_sects = 0; | 99 | nr_sects = 0; |
195 | } | 100 | } |
196 | 101 | ||
197 | atomic_inc(&bb.done); | 102 | atomic_inc(&bb.done); |
198 | submit_bio(REQ_WRITE | REQ_WRITE_SAME, bio); | 103 | submit_bio(type, bio); |
199 | } | 104 | } |
200 | 105 | ||
201 | /* Wait for bios in-flight */ | 106 | /* Wait for bios in-flight */ |
@@ -203,11 +108,11 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, | |||
203 | wait_for_completion(&wait); | 108 | wait_for_completion(&wait); |
204 | 109 | ||
205 | if (!test_bit(BIO_UPTODATE, &bb.flags)) | 110 | if (!test_bit(BIO_UPTODATE, &bb.flags)) |
206 | ret = -ENOTSUPP; | 111 | ret = -EIO; |
207 | 112 | ||
208 | return ret; | 113 | return ret; |
209 | } | 114 | } |
210 | EXPORT_SYMBOL(blkdev_issue_write_same); | 115 | EXPORT_SYMBOL(blkdev_issue_discard); |
211 | 116 | ||
212 | /** | 117 | /** |
213 | * blkdev_issue_zeroout - generate number of zero filed write bios | 118 | * blkdev_issue_zeroout - generate number of zero filed write bios |
@@ -220,7 +125,7 @@ EXPORT_SYMBOL(blkdev_issue_write_same); | |||
220 | * Generate and issue number of bios with zerofiled pages. | 125 | * Generate and issue number of bios with zerofiled pages. |
221 | */ | 126 | */ |
222 | 127 | ||
223 | int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, | 128 | int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, |
224 | sector_t nr_sects, gfp_t gfp_mask) | 129 | sector_t nr_sects, gfp_t gfp_mask) |
225 | { | 130 | { |
226 | int ret; | 131 | int ret; |
@@ -270,32 +175,4 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, | |||
270 | 175 | ||
271 | return ret; | 176 | return ret; |
272 | } | 177 | } |
273 | |||
274 | /** | ||
275 | * blkdev_issue_zeroout - zero-fill a block range | ||
276 | * @bdev: blockdev to write | ||
277 | * @sector: start sector | ||
278 | * @nr_sects: number of sectors to write | ||
279 | * @gfp_mask: memory allocation flags (for bio_alloc) | ||
280 | * | ||
281 | * Description: | ||
282 | * Generate and issue number of bios with zerofiled pages. | ||
283 | */ | ||
284 | |||
285 | int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, | ||
286 | sector_t nr_sects, gfp_t gfp_mask) | ||
287 | { | ||
288 | if (bdev_write_same(bdev)) { | ||
289 | unsigned char bdn[BDEVNAME_SIZE]; | ||
290 | |||
291 | if (!blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, | ||
292 | ZERO_PAGE(0))) | ||
293 | return 0; | ||
294 | |||
295 | bdevname(bdev, bdn); | ||
296 | pr_err("%s: WRITE SAME failed. Manually zeroing.\n", bdn); | ||
297 | } | ||
298 | |||
299 | return __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask); | ||
300 | } | ||
301 | EXPORT_SYMBOL(blkdev_issue_zeroout); | 178 | EXPORT_SYMBOL(blkdev_issue_zeroout); |
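The blkdev_issue_discard() arithmetic in the left-hand column keeps every split point on the device's discard-granularity grid: max_discard_sectors is rounded down to a whole number of granules, and a bio that has to be split is trimmed back to the previous (granularity, alignment) boundary. Below is a small standalone C model of that arithmetic; the helper names, the main() driver and the example granularity/alignment values are invented for illustration, and sector_div() is replaced by ordinary division.

#include <stdint.h>
#include <stdio.h>

/* Round max_discard_sectors down to a whole number of granules. */
static uint64_t cap_to_granularity(uint64_t max_discard, uint64_t granularity)
{
        return (max_discard / granularity) * granularity;
}

/*
 * Size of the next discard bio: if the request must be split, trim it back
 * so the next starting sector stays on the (granularity, alignment) grid.
 */
static uint64_t next_req_sects(uint64_t sector, uint64_t nr_sects,
                               uint64_t max_discard, uint64_t granularity,
                               uint64_t alignment)
{
        uint64_t req_sects = nr_sects < max_discard ? nr_sects : max_discard;
        uint64_t end_sect = sector + req_sects;

        if (req_sects < nr_sects && (end_sect % granularity) != alignment) {
                /* step back to the previous aligned boundary */
                end_sect = ((end_sect - alignment) / granularity) * granularity
                           + alignment;
                req_sects = end_sect - sector;
        }
        return req_sects;
}

int main(void)
{
        uint64_t granularity = 8, alignment = 2;        /* in 512-byte sectors */
        uint64_t max_discard = cap_to_granularity(1022, granularity); /* -> 1016 */
        uint64_t sector = 13, nr_sects = 2500;

        while (nr_sects) {
                uint64_t req = next_req_sects(sector, nr_sects, max_discard,
                                              granularity, alignment);

                printf("discard bio: sector=%llu sectors=%llu\n",
                       (unsigned long long)sector, (unsigned long long)req);
                sector += req;
                nr_sects -= req;
        }
        return 0;
}

With these numbers the request is carved into bios of 1013, 1016 and 471 sectors, and every split point after the first lands on a sector whose offset within a granule equals the alignment.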
diff --git a/block/blk-map.c b/block/blk-map.c index 623e1cd4cff..164cd005970 100644 --- a/block/blk-map.c +++ b/block/blk-map.c | |||
@@ -311,7 +311,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, | |||
311 | if (IS_ERR(bio)) | 311 | if (IS_ERR(bio)) |
312 | return PTR_ERR(bio); | 312 | return PTR_ERR(bio); |
313 | 313 | ||
314 | if (!reading) | 314 | if (rq_data_dir(rq) == WRITE) |
315 | bio->bi_rw |= REQ_WRITE; | 315 | bio->bi_rw |= REQ_WRITE; |
316 | 316 | ||
317 | if (do_copy) | 317 | if (do_copy) |
diff --git a/block/blk-merge.c b/block/blk-merge.c index 936a110de0b..cfcc37cb222 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c | |||
@@ -110,49 +110,6 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, | |||
110 | return 0; | 110 | return 0; |
111 | } | 111 | } |
112 | 112 | ||
113 | static void | ||
114 | __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec, | ||
115 | struct scatterlist *sglist, struct bio_vec **bvprv, | ||
116 | struct scatterlist **sg, int *nsegs, int *cluster) | ||
117 | { | ||
118 | |||
119 | int nbytes = bvec->bv_len; | ||
120 | |||
121 | if (*bvprv && *cluster) { | ||
122 | if ((*sg)->length + nbytes > queue_max_segment_size(q)) | ||
123 | goto new_segment; | ||
124 | |||
125 | if (!BIOVEC_PHYS_MERGEABLE(*bvprv, bvec)) | ||
126 | goto new_segment; | ||
127 | if (!BIOVEC_SEG_BOUNDARY(q, *bvprv, bvec)) | ||
128 | goto new_segment; | ||
129 | |||
130 | (*sg)->length += nbytes; | ||
131 | } else { | ||
132 | new_segment: | ||
133 | if (!*sg) | ||
134 | *sg = sglist; | ||
135 | else { | ||
136 | /* | ||
137 | * If the driver previously mapped a shorter | ||
138 | * list, we could see a termination bit | ||
139 | * prematurely unless it fully inits the sg | ||
140 | * table on each mapping. We KNOW that there | ||
141 | * must be more entries here or the driver | ||
142 | * would be buggy, so force clear the | ||
143 | * termination bit to avoid doing a full | ||
144 | * sg_init_table() in drivers for each command. | ||
145 | */ | ||
146 | (*sg)->page_link &= ~0x02; | ||
147 | *sg = sg_next(*sg); | ||
148 | } | ||
149 | |||
150 | sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset); | ||
151 | (*nsegs)++; | ||
152 | } | ||
153 | *bvprv = bvec; | ||
154 | } | ||
155 | |||
156 | /* | 113 | /* |
157 | * map a request to scatterlist, return number of sg entries setup. Caller | 114 | * map a request to scatterlist, return number of sg entries setup. Caller |
158 | * must make sure sg can hold rq->nr_phys_segments entries | 115 | * must make sure sg can hold rq->nr_phys_segments entries |
@@ -174,8 +131,41 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, | |||
174 | bvprv = NULL; | 131 | bvprv = NULL; |
175 | sg = NULL; | 132 | sg = NULL; |
176 | rq_for_each_segment(bvec, rq, iter) { | 133 | rq_for_each_segment(bvec, rq, iter) { |
177 | __blk_segment_map_sg(q, bvec, sglist, &bvprv, &sg, | 134 | int nbytes = bvec->bv_len; |
178 | &nsegs, &cluster); | 135 | |
136 | if (bvprv && cluster) { | ||
137 | if (sg->length + nbytes > queue_max_segment_size(q)) | ||
138 | goto new_segment; | ||
139 | |||
140 | if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) | ||
141 | goto new_segment; | ||
142 | if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec)) | ||
143 | goto new_segment; | ||
144 | |||
145 | sg->length += nbytes; | ||
146 | } else { | ||
147 | new_segment: | ||
148 | if (!sg) | ||
149 | sg = sglist; | ||
150 | else { | ||
151 | /* | ||
152 | * If the driver previously mapped a shorter | ||
153 | * list, we could see a termination bit | ||
154 | * prematurely unless it fully inits the sg | ||
155 | * table on each mapping. We KNOW that there | ||
156 | * must be more entries here or the driver | ||
157 | * would be buggy, so force clear the | ||
158 | * termination bit to avoid doing a full | ||
159 | * sg_init_table() in drivers for each command. | ||
160 | */ | ||
161 | sg->page_link &= ~0x02; | ||
162 | sg = sg_next(sg); | ||
163 | } | ||
164 | |||
165 | sg_set_page(sg, bvec->bv_page, nbytes, bvec->bv_offset); | ||
166 | nsegs++; | ||
167 | } | ||
168 | bvprv = bvec; | ||
179 | } /* segments in rq */ | 169 | } /* segments in rq */ |
180 | 170 | ||
181 | 171 | ||
@@ -209,43 +199,6 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, | |||
209 | } | 199 | } |
210 | EXPORT_SYMBOL(blk_rq_map_sg); | 200 | EXPORT_SYMBOL(blk_rq_map_sg); |
211 | 201 | ||
212 | /** | ||
213 | * blk_bio_map_sg - map a bio to a scatterlist | ||
214 | * @q: request_queue in question | ||
215 | * @bio: bio being mapped | ||
216 | * @sglist: scatterlist being mapped | ||
217 | * | ||
218 | * Note: | ||
219 | * Caller must make sure sg can hold bio->bi_phys_segments entries | ||
220 | * | ||
221 | * Will return the number of sg entries setup | ||
222 | */ | ||
223 | int blk_bio_map_sg(struct request_queue *q, struct bio *bio, | ||
224 | struct scatterlist *sglist) | ||
225 | { | ||
226 | struct bio_vec *bvec, *bvprv; | ||
227 | struct scatterlist *sg; | ||
228 | int nsegs, cluster; | ||
229 | unsigned long i; | ||
230 | |||
231 | nsegs = 0; | ||
232 | cluster = blk_queue_cluster(q); | ||
233 | |||
234 | bvprv = NULL; | ||
235 | sg = NULL; | ||
236 | bio_for_each_segment(bvec, bio, i) { | ||
237 | __blk_segment_map_sg(q, bvec, sglist, &bvprv, &sg, | ||
238 | &nsegs, &cluster); | ||
239 | } /* segments in bio */ | ||
240 | |||
241 | if (sg) | ||
242 | sg_mark_end(sg); | ||
243 | |||
244 | BUG_ON(bio->bi_phys_segments && nsegs > bio->bi_phys_segments); | ||
245 | return nsegs; | ||
246 | } | ||
247 | EXPORT_SYMBOL(blk_bio_map_sg); | ||
248 | |||
249 | static inline int ll_new_hw_segment(struct request_queue *q, | 202 | static inline int ll_new_hw_segment(struct request_queue *q, |
250 | struct request *req, | 203 | struct request *req, |
251 | struct bio *bio) | 204 | struct bio *bio) |
@@ -275,8 +228,14 @@ no_merge: | |||
275 | int ll_back_merge_fn(struct request_queue *q, struct request *req, | 228 | int ll_back_merge_fn(struct request_queue *q, struct request *req, |
276 | struct bio *bio) | 229 | struct bio *bio) |
277 | { | 230 | { |
278 | if (blk_rq_sectors(req) + bio_sectors(bio) > | 231 | unsigned short max_sectors; |
279 | blk_rq_get_max_sectors(req)) { | 232 | |
233 | if (unlikely(req->cmd_type == REQ_TYPE_BLOCK_PC)) | ||
234 | max_sectors = queue_max_hw_sectors(q); | ||
235 | else | ||
236 | max_sectors = queue_max_sectors(q); | ||
237 | |||
238 | if (blk_rq_sectors(req) + bio_sectors(bio) > max_sectors) { | ||
280 | req->cmd_flags |= REQ_NOMERGE; | 239 | req->cmd_flags |= REQ_NOMERGE; |
281 | if (req == q->last_merge) | 240 | if (req == q->last_merge) |
282 | q->last_merge = NULL; | 241 | q->last_merge = NULL; |
@@ -293,8 +252,15 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req, | |||
293 | int ll_front_merge_fn(struct request_queue *q, struct request *req, | 252 | int ll_front_merge_fn(struct request_queue *q, struct request *req, |
294 | struct bio *bio) | 253 | struct bio *bio) |
295 | { | 254 | { |
296 | if (blk_rq_sectors(req) + bio_sectors(bio) > | 255 | unsigned short max_sectors; |
297 | blk_rq_get_max_sectors(req)) { | 256 | |
257 | if (unlikely(req->cmd_type == REQ_TYPE_BLOCK_PC)) | ||
258 | max_sectors = queue_max_hw_sectors(q); | ||
259 | else | ||
260 | max_sectors = queue_max_sectors(q); | ||
261 | |||
262 | |||
263 | if (blk_rq_sectors(req) + bio_sectors(bio) > max_sectors) { | ||
298 | req->cmd_flags |= REQ_NOMERGE; | 264 | req->cmd_flags |= REQ_NOMERGE; |
299 | if (req == q->last_merge) | 265 | if (req == q->last_merge) |
300 | q->last_merge = NULL; | 266 | q->last_merge = NULL; |
@@ -325,8 +291,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, | |||
325 | /* | 291 | /* |
326 | * Will it become too large? | 292 | * Will it become too large? |
327 | */ | 293 | */ |
328 | if ((blk_rq_sectors(req) + blk_rq_sectors(next)) > | 294 | if ((blk_rq_sectors(req) + blk_rq_sectors(next)) > queue_max_sectors(q)) |
329 | blk_rq_get_max_sectors(req)) | ||
330 | return 0; | 295 | return 0; |
331 | 296 | ||
332 | total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; | 297 | total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; |
@@ -405,7 +370,16 @@ static int attempt_merge(struct request_queue *q, struct request *req, | |||
405 | if (!rq_mergeable(req) || !rq_mergeable(next)) | 370 | if (!rq_mergeable(req) || !rq_mergeable(next)) |
406 | return 0; | 371 | return 0; |
407 | 372 | ||
408 | if (!blk_check_merge_flags(req->cmd_flags, next->cmd_flags)) | 373 | /* |
374 | * Don't merge file system requests and discard requests | ||
375 | */ | ||
376 | if ((req->cmd_flags & REQ_DISCARD) != (next->cmd_flags & REQ_DISCARD)) | ||
377 | return 0; | ||
378 | |||
379 | /* | ||
380 | * Don't merge discard requests and secure discard requests | ||
381 | */ | ||
382 | if ((req->cmd_flags & REQ_SECURE) != (next->cmd_flags & REQ_SECURE)) | ||
409 | return 0; | 383 | return 0; |
410 | 384 | ||
411 | /* | 385 | /* |
@@ -419,10 +393,6 @@ static int attempt_merge(struct request_queue *q, struct request *req, | |||
419 | || next->special) | 393 | || next->special) |
420 | return 0; | 394 | return 0; |
421 | 395 | ||
422 | if (req->cmd_flags & REQ_WRITE_SAME && | ||
423 | !blk_write_same_mergeable(req->bio, next->bio)) | ||
424 | return 0; | ||
425 | |||
426 | /* | 396 | /* |
427 | * If we are allowed to merge, then append bio list | 397 | * If we are allowed to merge, then append bio list |
428 | * from next to rq and release next. merge_requests_fn | 398 | * from next to rq and release next. merge_requests_fn |
@@ -501,40 +471,3 @@ int blk_attempt_req_merge(struct request_queue *q, struct request *rq, | |||
501 | { | 471 | { |
502 | return attempt_merge(q, rq, next); | 472 | return attempt_merge(q, rq, next); |
503 | } | 473 | } |
504 | |||
505 | bool blk_rq_merge_ok(struct request *rq, struct bio *bio) | ||
506 | { | ||
507 | if (!rq_mergeable(rq) || !bio_mergeable(bio)) | ||
508 | return false; | ||
509 | |||
510 | if (!blk_check_merge_flags(rq->cmd_flags, bio->bi_rw)) | ||
511 | return false; | ||
512 | |||
513 | /* different data direction or already started, don't merge */ | ||
514 | if (bio_data_dir(bio) != rq_data_dir(rq)) | ||
515 | return false; | ||
516 | |||
517 | /* must be same device and not a special request */ | ||
518 | if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special) | ||
519 | return false; | ||
520 | |||
521 | /* only merge integrity protected bio into ditto rq */ | ||
522 | if (bio_integrity(bio) != blk_integrity_rq(rq)) | ||
523 | return false; | ||
524 | |||
525 | /* must be using the same buffer */ | ||
526 | if (rq->cmd_flags & REQ_WRITE_SAME && | ||
527 | !blk_write_same_mergeable(rq->bio, bio)) | ||
528 | return false; | ||
529 | |||
530 | return true; | ||
531 | } | ||
532 | |||
533 | int blk_try_merge(struct request *rq, struct bio *bio) | ||
534 | { | ||
535 | if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_sector) | ||
536 | return ELEVATOR_BACK_MERGE; | ||
537 | else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_sector) | ||
538 | return ELEVATOR_FRONT_MERGE; | ||
539 | return ELEVATOR_NO_MERGE; | ||
540 | } | ||
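The scatterlist mapping loop shown above, whether in its factored-out __blk_segment_map_sg() form or the older inline form, folds adjacent bio_vecs into a single scatterlist entry when clustering is enabled, the buffers are physically contiguous, the combined length stays within the queue's maximum segment size, and the merged span does not cross a segment boundary. The standalone C sketch below models that decision only; the limits, the struct layouts and the "physically mergeable" test are simplifications, not the real BIOVEC_* macros.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_SEGMENT_SIZE  4096U         /* stand-in for queue_max_segment_size() */
#define SEG_BOUNDARY_MASK 0xffffULL     /* stand-in for queue_segment_boundary() */

struct vec { uint64_t addr; uint32_t len; };    /* models a struct bio_vec */
struct seg { uint64_t addr; uint32_t len; };    /* models a scatterlist entry */

/* true if [start, end] stays inside one segment-boundary window */
static int same_boundary_window(uint64_t start, uint64_t end)
{
        return (start | SEG_BOUNDARY_MASK) == (end | SEG_BOUNDARY_MASK);
}

static size_t map_to_segments(const struct vec *v, size_t n,
                              struct seg *out, size_t out_max)
{
        size_t nsegs = 0;

        for (size_t i = 0; i < n; i++) {
                if (nsegs &&
                    out[nsegs - 1].addr + out[nsegs - 1].len == v[i].addr &&
                    out[nsegs - 1].len + v[i].len <= MAX_SEGMENT_SIZE &&
                    same_boundary_window(out[nsegs - 1].addr,
                                         v[i].addr + v[i].len - 1)) {
                        /* contiguous, small enough, same window: extend it */
                        out[nsegs - 1].len += v[i].len;
                        continue;
                }
                if (nsegs == out_max)
                        break;          /* caller sized the table too small */
                out[nsegs].addr = v[i].addr;
                out[nsegs].len = v[i].len;
                nsegs++;
        }
        return nsegs;
}

int main(void)
{
        struct vec bvecs[] = {
                { 0x10000, 1024 }, { 0x10400, 1024 },   /* contiguous: merge */
                { 0x20000, 2048 },                      /* gap: new segment  */
        };
        struct seg sg[8];
        size_t n = map_to_segments(bvecs, 3, sg, 8);

        for (size_t i = 0; i < n; i++)
                printf("sg[%zu]: addr=0x%llx len=%u\n", i,
                       (unsigned long long)sg[i].addr, sg[i].len);
        return 0;
}

The three example buffers collapse into two segments: the first pair merges into one 2048-byte entry, the third starts a new one because of the address gap.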
diff --git a/block/blk-settings.c b/block/blk-settings.c index c50ecf0ea3b..fa1eb0449a0 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c | |||
@@ -104,7 +104,9 @@ EXPORT_SYMBOL_GPL(blk_queue_lld_busy); | |||
104 | * @lim: the queue_limits structure to reset | 104 | * @lim: the queue_limits structure to reset |
105 | * | 105 | * |
106 | * Description: | 106 | * Description: |
107 | * Returns a queue_limit struct to its default state. | 107 | * Returns a queue_limit struct to its default state. Can be used by |
108 | * stacking drivers like DM that stage table swaps and reuse an | ||
109 | * existing device queue. | ||
108 | */ | 110 | */ |
109 | void blk_set_default_limits(struct queue_limits *lim) | 111 | void blk_set_default_limits(struct queue_limits *lim) |
110 | { | 112 | { |
@@ -112,13 +114,13 @@ void blk_set_default_limits(struct queue_limits *lim) | |||
112 | lim->max_integrity_segments = 0; | 114 | lim->max_integrity_segments = 0; |
113 | lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; | 115 | lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; |
114 | lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; | 116 | lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; |
115 | lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; | 117 | lim->max_sectors = BLK_DEF_MAX_SECTORS; |
116 | lim->max_write_same_sectors = 0; | 118 | lim->max_hw_sectors = INT_MAX; |
117 | lim->max_discard_sectors = 0; | 119 | lim->max_discard_sectors = 0; |
118 | lim->discard_granularity = 0; | 120 | lim->discard_granularity = 0; |
119 | lim->discard_alignment = 0; | 121 | lim->discard_alignment = 0; |
120 | lim->discard_misaligned = 0; | 122 | lim->discard_misaligned = 0; |
121 | lim->discard_zeroes_data = 0; | 123 | lim->discard_zeroes_data = 1; |
122 | lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; | 124 | lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; |
123 | lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); | 125 | lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); |
124 | lim->alignment_offset = 0; | 126 | lim->alignment_offset = 0; |
@@ -129,27 +131,6 @@ void blk_set_default_limits(struct queue_limits *lim) | |||
129 | EXPORT_SYMBOL(blk_set_default_limits); | 131 | EXPORT_SYMBOL(blk_set_default_limits); |
130 | 132 | ||
131 | /** | 133 | /** |
132 | * blk_set_stacking_limits - set default limits for stacking devices | ||
133 | * @lim: the queue_limits structure to reset | ||
134 | * | ||
135 | * Description: | ||
136 | * Returns a queue_limit struct to its default state. Should be used | ||
137 | * by stacking drivers like DM that have no internal limits. | ||
138 | */ | ||
139 | void blk_set_stacking_limits(struct queue_limits *lim) | ||
140 | { | ||
141 | blk_set_default_limits(lim); | ||
142 | |||
143 | /* Inherit limits from component devices */ | ||
144 | lim->discard_zeroes_data = 1; | ||
145 | lim->max_segments = USHRT_MAX; | ||
146 | lim->max_hw_sectors = UINT_MAX; | ||
147 | lim->max_sectors = UINT_MAX; | ||
148 | lim->max_write_same_sectors = UINT_MAX; | ||
149 | } | ||
150 | EXPORT_SYMBOL(blk_set_stacking_limits); | ||
151 | |||
152 | /** | ||
153 | * blk_queue_make_request - define an alternate make_request function for a device | 134 | * blk_queue_make_request - define an alternate make_request function for a device |
154 | * @q: the request queue for the device to be affected | 135 | * @q: the request queue for the device to be affected |
155 | * @mfn: the alternate make_request function | 136 | * @mfn: the alternate make_request function |
@@ -184,6 +165,8 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) | |||
184 | q->nr_batching = BLK_BATCH_REQ; | 165 | q->nr_batching = BLK_BATCH_REQ; |
185 | 166 | ||
186 | blk_set_default_limits(&q->limits); | 167 | blk_set_default_limits(&q->limits); |
168 | blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS); | ||
169 | q->limits.discard_zeroes_data = 0; | ||
187 | 170 | ||
188 | /* | 171 | /* |
189 | * by default assume old behaviour and bounce for any highmem page | 172 | * by default assume old behaviour and bounce for any highmem page |
@@ -288,18 +271,6 @@ void blk_queue_max_discard_sectors(struct request_queue *q, | |||
288 | EXPORT_SYMBOL(blk_queue_max_discard_sectors); | 271 | EXPORT_SYMBOL(blk_queue_max_discard_sectors); |
289 | 272 | ||
290 | /** | 273 | /** |
291 | * blk_queue_max_write_same_sectors - set max sectors for a single write same | ||
292 | * @q: the request queue for the device | ||
293 | * @max_write_same_sectors: maximum number of sectors to write per command | ||
294 | **/ | ||
295 | void blk_queue_max_write_same_sectors(struct request_queue *q, | ||
296 | unsigned int max_write_same_sectors) | ||
297 | { | ||
298 | q->limits.max_write_same_sectors = max_write_same_sectors; | ||
299 | } | ||
300 | EXPORT_SYMBOL(blk_queue_max_write_same_sectors); | ||
301 | |||
302 | /** | ||
303 | * blk_queue_max_segments - set max hw segments for a request for this queue | 274 | * blk_queue_max_segments - set max hw segments for a request for this queue |
304 | * @q: the request queue for the device | 275 | * @q: the request queue for the device |
305 | * @max_segments: max number of segments | 276 | * @max_segments: max number of segments |
@@ -524,8 +495,6 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, | |||
524 | 495 | ||
525 | t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); | 496 | t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); |
526 | t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); | 497 | t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); |
527 | t->max_write_same_sectors = min(t->max_write_same_sectors, | ||
528 | b->max_write_same_sectors); | ||
529 | t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn); | 498 | t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn); |
530 | 499 | ||
531 | t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, | 500 | t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, |
@@ -611,7 +580,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, | |||
611 | bottom = b->discard_granularity + alignment; | 580 | bottom = b->discard_granularity + alignment; |
612 | 581 | ||
613 | /* Verify that top and bottom intervals line up */ | 582 | /* Verify that top and bottom intervals line up */ |
614 | if ((max(top, bottom) % min(top, bottom)) != 0) | 583 | if (max(top, bottom) & (min(top, bottom) - 1)) |
615 | t->discard_misaligned = 1; | 584 | t->discard_misaligned = 1; |
616 | } | 585 | } |
617 | 586 | ||
@@ -619,8 +588,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, | |||
619 | b->max_discard_sectors); | 588 | b->max_discard_sectors); |
620 | t->discard_granularity = max(t->discard_granularity, | 589 | t->discard_granularity = max(t->discard_granularity, |
621 | b->discard_granularity); | 590 | b->discard_granularity); |
622 | t->discard_alignment = lcm(t->discard_alignment, alignment) % | 591 | t->discard_alignment = lcm(t->discard_alignment, alignment) & |
623 | t->discard_granularity; | 592 | (t->discard_granularity - 1); |
624 | } | 593 | } |
625 | 594 | ||
626 | return ret; | 595 | return ret; |
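The change from "& (min(top, bottom) - 1)" and "& (t->discard_granularity - 1)" to a plain modulo in the stacked discard checks matters only when a granularity is not a power of two: the mask form silently assumes it is. The short standalone program below (values purely illustrative) shows where the two expressions diverge.

#include <stdio.h>

int main(void)
{
        unsigned int offsets[]       = { 768, 1536, 3072 };
        unsigned int granularities[] = { 512, 768 };    /* power of two vs. not */

        for (int g = 0; g < 2; g++) {
                for (int i = 0; i < 3; i++) {
                        unsigned int gran = granularities[g];
                        unsigned int off = offsets[i];

                        printf("offset %4u, granularity %3u: %% -> %3u, & (g-1) -> %3u\n",
                               off, gran, off % gran, off & (gran - 1));
                }
        }
        return 0;
}

For granularity 512 the two columns always agree; for 768 an offset of 768 or 1536 is reported as aligned by the modulo but as misaligned by the mask.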
diff --git a/block/blk-softirq.c b/block/blk-softirq.c index 467c8de8864..1366a89d8e6 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c | |||
@@ -8,7 +8,6 @@ | |||
8 | #include <linux/blkdev.h> | 8 | #include <linux/blkdev.h> |
9 | #include <linux/interrupt.h> | 9 | #include <linux/interrupt.h> |
10 | #include <linux/cpu.h> | 10 | #include <linux/cpu.h> |
11 | #include <linux/sched.h> | ||
12 | 11 | ||
13 | #include "blk.h" | 12 | #include "blk.h" |
14 | 13 | ||
@@ -104,10 +103,9 @@ static struct notifier_block __cpuinitdata blk_cpu_notifier = { | |||
104 | 103 | ||
105 | void __blk_complete_request(struct request *req) | 104 | void __blk_complete_request(struct request *req) |
106 | { | 105 | { |
107 | int ccpu, cpu; | 106 | int ccpu, cpu, group_cpu = NR_CPUS; |
108 | struct request_queue *q = req->q; | 107 | struct request_queue *q = req->q; |
109 | unsigned long flags; | 108 | unsigned long flags; |
110 | bool shared = false; | ||
111 | 109 | ||
112 | BUG_ON(!q->softirq_done_fn); | 110 | BUG_ON(!q->softirq_done_fn); |
113 | 111 | ||
@@ -119,20 +117,22 @@ void __blk_complete_request(struct request *req) | |||
119 | */ | 117 | */ |
120 | if (req->cpu != -1) { | 118 | if (req->cpu != -1) { |
121 | ccpu = req->cpu; | 119 | ccpu = req->cpu; |
122 | if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) | 120 | if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) { |
123 | shared = cpus_share_cache(cpu, ccpu); | 121 | ccpu = blk_cpu_to_group(ccpu); |
122 | group_cpu = blk_cpu_to_group(cpu); | ||
123 | } | ||
124 | } else | 124 | } else |
125 | ccpu = cpu; | 125 | ccpu = cpu; |
126 | 126 | ||
127 | /* | 127 | /* |
128 | * If current CPU and requested CPU share a cache, run the softirq on | 128 | * If current CPU and requested CPU are in the same group, running |
129 | * the current CPU. One might concern this is just like | 129 | * softirq in current CPU. One might concern this is just like |
130 | * QUEUE_FLAG_SAME_FORCE, but actually not. blk_complete_request() is | 130 | * QUEUE_FLAG_SAME_FORCE, but actually not. blk_complete_request() is |
131 | * running in interrupt handler, and currently I/O controller doesn't | 131 | * running in interrupt handler, and currently I/O controller doesn't |
132 | * support multiple interrupts, so current CPU is unique actually. This | 132 | * support multiple interrupts, so current CPU is unique actually. This |
133 | * avoids IPI sending from current CPU to the first CPU of a group. | 133 | * avoids IPI sending from current CPU to the first CPU of a group. |
134 | */ | 134 | */ |
135 | if (ccpu == cpu || shared) { | 135 | if (ccpu == cpu || ccpu == group_cpu) { |
136 | struct list_head *list; | 136 | struct list_head *list; |
137 | do_local: | 137 | do_local: |
138 | list = &__get_cpu_var(blk_cpu_done); | 138 | list = &__get_cpu_var(blk_cpu_done); |
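Both versions of __blk_complete_request() shown above make the same basic decision: complete locally when the completion CPU is the requesting CPU, or is considered close enough to it (shared cache in the left column, same blk CPU group in the right) and QUEUE_FLAG_SAME_FORCE is not set; otherwise raise the softirq on the remote CPU. A minimal userspace model of that choice follows, with the topology helper stubbed out and all names invented.

#include <stdbool.h>
#include <stdio.h>

/* pretend CPUs in the same block of four share a last-level cache */
static bool cpus_share_cache_stub(int a, int b)
{
        return (a / 4) == (b / 4);
}

/* returns the CPU whose completion list should receive the request */
static int pick_completion_cpu(int cur_cpu, int req_cpu, bool same_force)
{
        if (req_cpu < 0)
                return cur_cpu;         /* no completion affinity recorded */
        if (req_cpu == cur_cpu)
                return cur_cpu;         /* already local */
        if (!same_force && cpus_share_cache_stub(cur_cpu, req_cpu))
                return cur_cpu;         /* close enough: skip the IPI */
        return req_cpu;                 /* raise the softirq remotely */
}

int main(void)
{
        printf("%d\n", pick_completion_cpu(1, 3, false));      /* 1: same group   */
        printf("%d\n", pick_completion_cpu(1, 6, false));      /* 6: remote group */
        printf("%d\n", pick_completion_cpu(1, 3, true));       /* 3: forced       */
        return 0;
}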
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 788147797a7..60fda88c57f 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c | |||
@@ -9,7 +9,6 @@ | |||
9 | #include <linux/blktrace_api.h> | 9 | #include <linux/blktrace_api.h> |
10 | 10 | ||
11 | #include "blk.h" | 11 | #include "blk.h" |
12 | #include "blk-cgroup.h" | ||
13 | 12 | ||
14 | struct queue_sysfs_entry { | 13 | struct queue_sysfs_entry { |
15 | struct attribute attr; | 14 | struct attribute attr; |
@@ -26,15 +25,9 @@ queue_var_show(unsigned long var, char *page) | |||
26 | static ssize_t | 25 | static ssize_t |
27 | queue_var_store(unsigned long *var, const char *page, size_t count) | 26 | queue_var_store(unsigned long *var, const char *page, size_t count) |
28 | { | 27 | { |
29 | int err; | 28 | char *p = (char *) page; |
30 | unsigned long v; | ||
31 | |||
32 | err = strict_strtoul(page, 10, &v); | ||
33 | if (err || v > UINT_MAX) | ||
34 | return -EINVAL; | ||
35 | |||
36 | *var = v; | ||
37 | 29 | ||
30 | *var = simple_strtoul(p, &p, 10); | ||
38 | return count; | 31 | return count; |
39 | } | 32 | } |
40 | 33 | ||
@@ -46,7 +39,7 @@ static ssize_t queue_requests_show(struct request_queue *q, char *page) | |||
46 | static ssize_t | 39 | static ssize_t |
47 | queue_requests_store(struct request_queue *q, const char *page, size_t count) | 40 | queue_requests_store(struct request_queue *q, const char *page, size_t count) |
48 | { | 41 | { |
49 | struct request_list *rl; | 42 | struct request_list *rl = &q->rq; |
50 | unsigned long nr; | 43 | unsigned long nr; |
51 | int ret; | 44 | int ret; |
52 | 45 | ||
@@ -54,9 +47,6 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) | |||
54 | return -EINVAL; | 47 | return -EINVAL; |
55 | 48 | ||
56 | ret = queue_var_store(&nr, page, count); | 49 | ret = queue_var_store(&nr, page, count); |
57 | if (ret < 0) | ||
58 | return ret; | ||
59 | |||
60 | if (nr < BLKDEV_MIN_RQ) | 50 | if (nr < BLKDEV_MIN_RQ) |
61 | nr = BLKDEV_MIN_RQ; | 51 | nr = BLKDEV_MIN_RQ; |
62 | 52 | ||
@@ -64,9 +54,6 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) | |||
64 | q->nr_requests = nr; | 54 | q->nr_requests = nr; |
65 | blk_queue_congestion_threshold(q); | 55 | blk_queue_congestion_threshold(q); |
66 | 56 | ||
67 | /* congestion isn't cgroup aware and follows root blkcg for now */ | ||
68 | rl = &q->root_rl; | ||
69 | |||
70 | if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q)) | 57 | if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q)) |
71 | blk_set_queue_congested(q, BLK_RW_SYNC); | 58 | blk_set_queue_congested(q, BLK_RW_SYNC); |
72 | else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q)) | 59 | else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q)) |
@@ -77,22 +64,19 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) | |||
77 | else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q)) | 64 | else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q)) |
78 | blk_clear_queue_congested(q, BLK_RW_ASYNC); | 65 | blk_clear_queue_congested(q, BLK_RW_ASYNC); |
79 | 66 | ||
80 | blk_queue_for_each_rl(rl, q) { | 67 | if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { |
81 | if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { | 68 | blk_set_queue_full(q, BLK_RW_SYNC); |
82 | blk_set_rl_full(rl, BLK_RW_SYNC); | 69 | } else { |
83 | } else { | 70 | blk_clear_queue_full(q, BLK_RW_SYNC); |
84 | blk_clear_rl_full(rl, BLK_RW_SYNC); | 71 | wake_up(&rl->wait[BLK_RW_SYNC]); |
85 | wake_up(&rl->wait[BLK_RW_SYNC]); | ||
86 | } | ||
87 | |||
88 | if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { | ||
89 | blk_set_rl_full(rl, BLK_RW_ASYNC); | ||
90 | } else { | ||
91 | blk_clear_rl_full(rl, BLK_RW_ASYNC); | ||
92 | wake_up(&rl->wait[BLK_RW_ASYNC]); | ||
93 | } | ||
94 | } | 72 | } |
95 | 73 | ||
74 | if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { | ||
75 | blk_set_queue_full(q, BLK_RW_ASYNC); | ||
76 | } else { | ||
77 | blk_clear_queue_full(q, BLK_RW_ASYNC); | ||
78 | wake_up(&rl->wait[BLK_RW_ASYNC]); | ||
79 | } | ||
96 | spin_unlock_irq(q->queue_lock); | 80 | spin_unlock_irq(q->queue_lock); |
97 | return ret; | 81 | return ret; |
98 | } | 82 | } |
@@ -111,9 +95,6 @@ queue_ra_store(struct request_queue *q, const char *page, size_t count) | |||
111 | unsigned long ra_kb; | 95 | unsigned long ra_kb; |
112 | ssize_t ret = queue_var_store(&ra_kb, page, count); | 96 | ssize_t ret = queue_var_store(&ra_kb, page, count); |
113 | 97 | ||
114 | if (ret < 0) | ||
115 | return ret; | ||
116 | |||
117 | q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10); | 98 | q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10); |
118 | 99 | ||
119 | return ret; | 100 | return ret; |
@@ -180,13 +161,6 @@ static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *pag | |||
180 | return queue_var_show(queue_discard_zeroes_data(q), page); | 161 | return queue_var_show(queue_discard_zeroes_data(q), page); |
181 | } | 162 | } |
182 | 163 | ||
183 | static ssize_t queue_write_same_max_show(struct request_queue *q, char *page) | ||
184 | { | ||
185 | return sprintf(page, "%llu\n", | ||
186 | (unsigned long long)q->limits.max_write_same_sectors << 9); | ||
187 | } | ||
188 | |||
189 | |||
190 | static ssize_t | 164 | static ssize_t |
191 | queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) | 165 | queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) |
192 | { | 166 | { |
@@ -195,9 +169,6 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) | |||
195 | page_kb = 1 << (PAGE_CACHE_SHIFT - 10); | 169 | page_kb = 1 << (PAGE_CACHE_SHIFT - 10); |
196 | ssize_t ret = queue_var_store(&max_sectors_kb, page, count); | 170 | ssize_t ret = queue_var_store(&max_sectors_kb, page, count); |
197 | 171 | ||
198 | if (ret < 0) | ||
199 | return ret; | ||
200 | |||
201 | if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb) | 172 | if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb) |
202 | return -EINVAL; | 173 | return -EINVAL; |
203 | 174 | ||
@@ -258,9 +229,6 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page, | |||
258 | unsigned long nm; | 229 | unsigned long nm; |
259 | ssize_t ret = queue_var_store(&nm, page, count); | 230 | ssize_t ret = queue_var_store(&nm, page, count); |
260 | 231 | ||
261 | if (ret < 0) | ||
262 | return ret; | ||
263 | |||
264 | spin_lock_irq(q->queue_lock); | 232 | spin_lock_irq(q->queue_lock); |
265 | queue_flag_clear(QUEUE_FLAG_NOMERGES, q); | 233 | queue_flag_clear(QUEUE_FLAG_NOMERGES, q); |
266 | queue_flag_clear(QUEUE_FLAG_NOXMERGES, q); | 234 | queue_flag_clear(QUEUE_FLAG_NOXMERGES, q); |
@@ -289,9 +257,6 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) | |||
289 | unsigned long val; | 257 | unsigned long val; |
290 | 258 | ||
291 | ret = queue_var_store(&val, page, count); | 259 | ret = queue_var_store(&val, page, count); |
292 | if (ret < 0) | ||
293 | return ret; | ||
294 | |||
295 | spin_lock_irq(q->queue_lock); | 260 | spin_lock_irq(q->queue_lock); |
296 | if (val == 2) { | 261 | if (val == 2) { |
297 | queue_flag_set(QUEUE_FLAG_SAME_COMP, q); | 262 | queue_flag_set(QUEUE_FLAG_SAME_COMP, q); |
@@ -392,11 +357,6 @@ static struct queue_sysfs_entry queue_discard_zeroes_data_entry = { | |||
392 | .show = queue_discard_zeroes_data_show, | 357 | .show = queue_discard_zeroes_data_show, |
393 | }; | 358 | }; |
394 | 359 | ||
395 | static struct queue_sysfs_entry queue_write_same_max_entry = { | ||
396 | .attr = {.name = "write_same_max_bytes", .mode = S_IRUGO }, | ||
397 | .show = queue_write_same_max_show, | ||
398 | }; | ||
399 | |||
400 | static struct queue_sysfs_entry queue_nonrot_entry = { | 360 | static struct queue_sysfs_entry queue_nonrot_entry = { |
401 | .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR }, | 361 | .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR }, |
402 | .show = queue_show_nonrot, | 362 | .show = queue_show_nonrot, |
@@ -444,7 +404,6 @@ static struct attribute *default_attrs[] = { | |||
444 | &queue_discard_granularity_entry.attr, | 404 | &queue_discard_granularity_entry.attr, |
445 | &queue_discard_max_entry.attr, | 405 | &queue_discard_max_entry.attr, |
446 | &queue_discard_zeroes_data_entry.attr, | 406 | &queue_discard_zeroes_data_entry.attr, |
447 | &queue_write_same_max_entry.attr, | ||
448 | &queue_nonrot_entry.attr, | 407 | &queue_nonrot_entry.attr, |
449 | &queue_nomerges_entry.attr, | 408 | &queue_nomerges_entry.attr, |
450 | &queue_rq_affinity_entry.attr, | 409 | &queue_rq_affinity_entry.attr, |
@@ -466,7 +425,7 @@ queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) | |||
466 | if (!entry->show) | 425 | if (!entry->show) |
467 | return -EIO; | 426 | return -EIO; |
468 | mutex_lock(&q->sysfs_lock); | 427 | mutex_lock(&q->sysfs_lock); |
469 | if (blk_queue_dying(q)) { | 428 | if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) { |
470 | mutex_unlock(&q->sysfs_lock); | 429 | mutex_unlock(&q->sysfs_lock); |
471 | return -ENOENT; | 430 | return -ENOENT; |
472 | } | 431 | } |
@@ -488,7 +447,7 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, | |||
488 | 447 | ||
489 | q = container_of(kobj, struct request_queue, kobj); | 448 | q = container_of(kobj, struct request_queue, kobj); |
490 | mutex_lock(&q->sysfs_lock); | 449 | mutex_lock(&q->sysfs_lock); |
491 | if (blk_queue_dying(q)) { | 450 | if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) { |
492 | mutex_unlock(&q->sysfs_lock); | 451 | mutex_unlock(&q->sysfs_lock); |
493 | return -ENOENT; | 452 | return -ENOENT; |
494 | } | 453 | } |
@@ -498,11 +457,11 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, | |||
498 | } | 457 | } |
499 | 458 | ||
500 | /** | 459 | /** |
501 | * blk_release_queue: - release a &struct request_queue when it is no longer needed | 460 | * blk_cleanup_queue: - release a &struct request_queue when it is no longer needed |
502 | * @kobj: the kobj belonging to the request queue to be released | 461 | * @kobj: the kobj belonging of the request queue to be released |
503 | * | 462 | * |
504 | * Description: | 463 | * Description: |
505 | * blk_release_queue is the pair to blk_init_queue() or | 464 | * blk_cleanup_queue is the pair to blk_init_queue() or |
506 | * blk_queue_make_request(). It should be called when a request queue is | 465 | * blk_queue_make_request(). It should be called when a request queue is |
507 | * being released; typically when a block device is being de-registered. | 466 | * being released; typically when a block device is being de-registered. |
508 | * Currently, its primary task it to free all the &struct request | 467 | * Currently, its primary task it to free all the &struct request |
@@ -516,19 +475,17 @@ static void blk_release_queue(struct kobject *kobj) | |||
516 | { | 475 | { |
517 | struct request_queue *q = | 476 | struct request_queue *q = |
518 | container_of(kobj, struct request_queue, kobj); | 477 | container_of(kobj, struct request_queue, kobj); |
478 | struct request_list *rl = &q->rq; | ||
519 | 479 | ||
520 | blk_sync_queue(q); | 480 | blk_sync_queue(q); |
521 | 481 | ||
522 | blkcg_exit_queue(q); | 482 | if (q->elevator) |
523 | |||
524 | if (q->elevator) { | ||
525 | spin_lock_irq(q->queue_lock); | ||
526 | ioc_clear_queue(q); | ||
527 | spin_unlock_irq(q->queue_lock); | ||
528 | elevator_exit(q->elevator); | 483 | elevator_exit(q->elevator); |
529 | } | ||
530 | 484 | ||
531 | blk_exit_rl(&q->root_rl); | 485 | blk_throtl_exit(q); |
486 | |||
487 | if (rl->rq_pool) | ||
488 | mempool_destroy(rl->rq_pool); | ||
532 | 489 | ||
533 | if (q->queue_tags) | 490 | if (q->queue_tags) |
534 | __blk_queue_free_tags(q); | 491 | __blk_queue_free_tags(q); |
@@ -536,8 +493,6 @@ static void blk_release_queue(struct kobject *kobj) | |||
536 | blk_trace_shutdown(q); | 493 | blk_trace_shutdown(q); |
537 | 494 | ||
538 | bdi_destroy(&q->backing_dev_info); | 495 | bdi_destroy(&q->backing_dev_info); |
539 | |||
540 | ida_simple_remove(&blk_queue_ida, q->id); | ||
541 | kmem_cache_free(blk_requestq_cachep, q); | 496 | kmem_cache_free(blk_requestq_cachep, q); |
542 | } | 497 | } |
543 | 498 | ||
@@ -561,12 +516,6 @@ int blk_register_queue(struct gendisk *disk) | |||
561 | if (WARN_ON(!q)) | 516 | if (WARN_ON(!q)) |
562 | return -ENXIO; | 517 | return -ENXIO; |
563 | 518 | ||
564 | /* | ||
565 | * Initialization must be complete by now. Finish the initial | ||
566 | * bypass from queue allocation. | ||
567 | */ | ||
568 | blk_queue_bypass_end(q); | ||
569 | |||
570 | ret = blk_trace_init_sysfs(dev); | 519 | ret = blk_trace_init_sysfs(dev); |
571 | if (ret) | 520 | if (ret) |
572 | return ret; | 521 | return ret; |
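The left-hand queue_var_store() validates the sysfs input, rejecting parse failures and values above UINT_MAX with -EINVAL instead of storing whatever simple_strtoul() produced, which is why its callers above regain their "if (ret < 0) return ret;" checks. Here is a userspace sketch of the same validation, slightly simplified in that trailing garbage after the digits is not rejected; the function name is illustrative.

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

static int store_var(unsigned long *var, const char *buf)
{
        char *end;
        unsigned long v;

        errno = 0;
        v = strtoul(buf, &end, 10);
        if (errno || end == buf || v > UINT_MAX)
                return -EINVAL;         /* reject instead of storing garbage */

        *var = v;
        return 0;
}

int main(void)
{
        unsigned long val = 0;

        printf("%d\n", store_var(&val, "128"));                 /* 0, val = 128    */
        printf("%d\n", store_var(&val, "not-a-number"));        /* -EINVAL         */
        printf("%d\n", store_var(&val, "99999999999999"));      /* -EINVAL (range) */
        return 0;
}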
diff --git a/block/blk-tag.c b/block/blk-tag.c index cc345e1d8d4..ece65fc4c79 100644 --- a/block/blk-tag.c +++ b/block/blk-tag.c | |||
@@ -186,8 +186,7 @@ int blk_queue_init_tags(struct request_queue *q, int depth, | |||
186 | tags = __blk_queue_init_tags(q, depth); | 186 | tags = __blk_queue_init_tags(q, depth); |
187 | 187 | ||
188 | if (!tags) | 188 | if (!tags) |
189 | return -ENOMEM; | 189 | goto fail; |
190 | |||
191 | } else if (q->queue_tags) { | 190 | } else if (q->queue_tags) { |
192 | rc = blk_queue_resize_tags(q, depth); | 191 | rc = blk_queue_resize_tags(q, depth); |
193 | if (rc) | 192 | if (rc) |
@@ -204,6 +203,9 @@ int blk_queue_init_tags(struct request_queue *q, int depth, | |||
204 | queue_flag_set_unlocked(QUEUE_FLAG_QUEUED, q); | 203 | queue_flag_set_unlocked(QUEUE_FLAG_QUEUED, q); |
205 | INIT_LIST_HEAD(&q->tag_busy_list); | 204 | INIT_LIST_HEAD(&q->tag_busy_list); |
206 | return 0; | 205 | return 0; |
206 | fail: | ||
207 | kfree(tags); | ||
208 | return -ENOMEM; | ||
207 | } | 209 | } |
208 | EXPORT_SYMBOL(blk_queue_init_tags); | 210 | EXPORT_SYMBOL(blk_queue_init_tags); |
209 | 211 | ||
@@ -280,9 +282,16 @@ EXPORT_SYMBOL(blk_queue_resize_tags); | |||
280 | void blk_queue_end_tag(struct request_queue *q, struct request *rq) | 282 | void blk_queue_end_tag(struct request_queue *q, struct request *rq) |
281 | { | 283 | { |
282 | struct blk_queue_tag *bqt = q->queue_tags; | 284 | struct blk_queue_tag *bqt = q->queue_tags; |
283 | unsigned tag = rq->tag; /* negative tags invalid */ | 285 | int tag = rq->tag; |
286 | |||
287 | BUG_ON(tag == -1); | ||
284 | 288 | ||
285 | BUG_ON(tag >= bqt->real_max_depth); | 289 | if (unlikely(tag >= bqt->real_max_depth)) |
290 | /* | ||
291 | * This can happen after tag depth has been reduced. | ||
292 | * FIXME: how about a warning or info message here? | ||
293 | */ | ||
294 | return; | ||
286 | 295 | ||
287 | list_del_init(&rq->queuelist); | 296 | list_del_init(&rq->queuelist); |
288 | rq->cmd_flags &= ~REQ_QUEUED; | 297 | rq->cmd_flags &= ~REQ_QUEUED; |
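The right-hand blk_queue_end_tag() keeps a defensive path for a completion whose tag index is now beyond real_max_depth, which can happen after the tag depth has been shrunk; the left-hand version turns the same condition into a BUG_ON(). A small model of the defensive variant, with an invented bitmap-based tag map:

#include <stdio.h>

struct tag_map {
        unsigned int real_max_depth;
        unsigned long busy;             /* one bit per tag, depth <= 32 here */
};

static void end_tag(struct tag_map *map, int tag)
{
        if (tag < 0)
                return;                 /* request was never tagged */
        if ((unsigned int)tag >= map->real_max_depth) {
                /* can happen after the depth was reduced; just drop it */
                fprintf(stderr, "stale tag %d ignored\n", tag);
                return;
        }
        map->busy &= ~(1UL << tag);
}

int main(void)
{
        struct tag_map map = { .real_max_depth = 8, .busy = 0xffUL };

        end_tag(&map, 3);       /* normal completion */
        end_tag(&map, 20);      /* stale tag from before a resize */
        printf("busy mask: 0x%lx\n", map.busy);         /* 0xf7 */
        return 0;
}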
diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 31146225f3d..a19f58c6fc3 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c | |||
@@ -10,7 +10,6 @@ | |||
10 | #include <linux/bio.h> | 10 | #include <linux/bio.h> |
11 | #include <linux/blktrace_api.h> | 11 | #include <linux/blktrace_api.h> |
12 | #include "blk-cgroup.h" | 12 | #include "blk-cgroup.h" |
13 | #include "blk.h" | ||
14 | 13 | ||
15 | /* Max dispatch from a group in 1 round */ | 14 | /* Max dispatch from a group in 1 round */ |
16 | static int throtl_grp_quantum = 8; | 15 | static int throtl_grp_quantum = 8; |
@@ -21,8 +20,6 @@ static int throtl_quantum = 32; | |||
21 | /* Throttling is performed over 100ms slice and after that slice is renewed */ | 20 | /* Throttling is performed over 100ms slice and after that slice is renewed */ |
22 | static unsigned long throtl_slice = HZ/10; /* 100 ms */ | 21 | static unsigned long throtl_slice = HZ/10; /* 100 ms */ |
23 | 22 | ||
24 | static struct blkcg_policy blkcg_policy_throtl; | ||
25 | |||
26 | /* A workqueue to queue throttle related work */ | 23 | /* A workqueue to queue throttle related work */ |
27 | static struct workqueue_struct *kthrotld_workqueue; | 24 | static struct workqueue_struct *kthrotld_workqueue; |
28 | static void throtl_schedule_delayed_work(struct throtl_data *td, | 25 | static void throtl_schedule_delayed_work(struct throtl_data *td, |
@@ -40,17 +37,9 @@ struct throtl_rb_root { | |||
40 | 37 | ||
41 | #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) | 38 | #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) |
42 | 39 | ||
43 | /* Per-cpu group stats */ | ||
44 | struct tg_stats_cpu { | ||
45 | /* total bytes transferred */ | ||
46 | struct blkg_rwstat service_bytes; | ||
47 | /* total IOs serviced, post merge */ | ||
48 | struct blkg_rwstat serviced; | ||
49 | }; | ||
50 | |||
51 | struct throtl_grp { | 40 | struct throtl_grp { |
52 | /* must be the first member */ | 41 | /* List of throtl groups on the request queue*/ |
53 | struct blkg_policy_data pd; | 42 | struct hlist_node tg_node; |
54 | 43 | ||
55 | /* active throtl group service_tree member */ | 44 | /* active throtl group service_tree member */ |
56 | struct rb_node rb_node; | 45 | struct rb_node rb_node; |
@@ -62,6 +51,8 @@ struct throtl_grp { | |||
62 | */ | 51 | */ |
63 | unsigned long disptime; | 52 | unsigned long disptime; |
64 | 53 | ||
54 | struct blkio_group blkg; | ||
55 | atomic_t ref; | ||
65 | unsigned int flags; | 56 | unsigned int flags; |
66 | 57 | ||
67 | /* Two lists for READ and WRITE */ | 58 | /* Two lists for READ and WRITE */ |
@@ -88,18 +79,18 @@ struct throtl_grp { | |||
88 | /* Some throttle limits got updated for the group */ | 79 | /* Some throttle limits got updated for the group */ |
89 | int limits_changed; | 80 | int limits_changed; |
90 | 81 | ||
91 | /* Per cpu stats pointer */ | 82 | struct rcu_head rcu_head; |
92 | struct tg_stats_cpu __percpu *stats_cpu; | ||
93 | |||
94 | /* List of tgs waiting for per cpu stats memory to be allocated */ | ||
95 | struct list_head stats_alloc_node; | ||
96 | }; | 83 | }; |
97 | 84 | ||
98 | struct throtl_data | 85 | struct throtl_data |
99 | { | 86 | { |
87 | /* List of throtl groups */ | ||
88 | struct hlist_head tg_list; | ||
89 | |||
100 | /* service tree for active throtl groups */ | 90 | /* service tree for active throtl groups */ |
101 | struct throtl_rb_root tg_service_tree; | 91 | struct throtl_rb_root tg_service_tree; |
102 | 92 | ||
93 | struct throtl_grp *root_tg; | ||
103 | struct request_queue *queue; | 94 | struct request_queue *queue; |
104 | 95 | ||
105 | /* Total Number of queued bios on READ and WRITE lists */ | 96 | /* Total Number of queued bios on READ and WRITE lists */ |
@@ -116,33 +107,6 @@ struct throtl_data | |||
116 | int limits_changed; | 107 | int limits_changed; |
117 | }; | 108 | }; |
118 | 109 | ||
119 | /* list and work item to allocate percpu group stats */ | ||
120 | static DEFINE_SPINLOCK(tg_stats_alloc_lock); | ||
121 | static LIST_HEAD(tg_stats_alloc_list); | ||
122 | |||
123 | static void tg_stats_alloc_fn(struct work_struct *); | ||
124 | static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn); | ||
125 | |||
126 | static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) | ||
127 | { | ||
128 | return pd ? container_of(pd, struct throtl_grp, pd) : NULL; | ||
129 | } | ||
130 | |||
131 | static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg) | ||
132 | { | ||
133 | return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl)); | ||
134 | } | ||
135 | |||
136 | static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg) | ||
137 | { | ||
138 | return pd_to_blkg(&tg->pd); | ||
139 | } | ||
140 | |||
141 | static inline struct throtl_grp *td_root_tg(struct throtl_data *td) | ||
142 | { | ||
143 | return blkg_to_tg(td->queue->root_blkg); | ||
144 | } | ||
145 | |||
146 | enum tg_state_flags { | 110 | enum tg_state_flags { |
147 | THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */ | 111 | THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */ |
148 | }; | 112 | }; |
@@ -163,149 +127,254 @@ static inline int throtl_tg_##name(const struct throtl_grp *tg) \ | |||
163 | 127 | ||
164 | THROTL_TG_FNS(on_rr); | 128 | THROTL_TG_FNS(on_rr); |
165 | 129 | ||
166 | #define throtl_log_tg(td, tg, fmt, args...) do { \ | 130 | #define throtl_log_tg(td, tg, fmt, args...) \ |
167 | char __pbuf[128]; \ | 131 | blk_add_trace_msg((td)->queue, "throtl %s " fmt, \ |
168 | \ | 132 | blkg_path(&(tg)->blkg), ##args); \ |
169 | blkg_path(tg_to_blkg(tg), __pbuf, sizeof(__pbuf)); \ | ||
170 | blk_add_trace_msg((td)->queue, "throtl %s " fmt, __pbuf, ##args); \ | ||
171 | } while (0) | ||
172 | 133 | ||
173 | #define throtl_log(td, fmt, args...) \ | 134 | #define throtl_log(td, fmt, args...) \ |
174 | blk_add_trace_msg((td)->queue, "throtl " fmt, ##args) | 135 | blk_add_trace_msg((td)->queue, "throtl " fmt, ##args) |
175 | 136 | ||
137 | static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg) | ||
138 | { | ||
139 | if (blkg) | ||
140 | return container_of(blkg, struct throtl_grp, blkg); | ||
141 | |||
142 | return NULL; | ||
143 | } | ||
144 | |||
176 | static inline unsigned int total_nr_queued(struct throtl_data *td) | 145 | static inline unsigned int total_nr_queued(struct throtl_data *td) |
177 | { | 146 | { |
178 | return td->nr_queued[0] + td->nr_queued[1]; | 147 | return td->nr_queued[0] + td->nr_queued[1]; |
179 | } | 148 | } |
180 | 149 | ||
181 | /* | 150 | static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg) |
182 | * Worker for allocating per cpu stat for tgs. This is scheduled on the | ||
183 | * system_wq once there are some groups on the alloc_list waiting for | ||
184 | * allocation. | ||
185 | */ | ||
186 | static void tg_stats_alloc_fn(struct work_struct *work) | ||
187 | { | 151 | { |
188 | static struct tg_stats_cpu *stats_cpu; /* this fn is non-reentrant */ | 152 | atomic_inc(&tg->ref); |
189 | struct delayed_work *dwork = to_delayed_work(work); | 153 | return tg; |
190 | bool empty = false; | 154 | } |
191 | |||
192 | alloc_stats: | ||
193 | if (!stats_cpu) { | ||
194 | stats_cpu = alloc_percpu(struct tg_stats_cpu); | ||
195 | if (!stats_cpu) { | ||
196 | /* allocation failed, try again after some time */ | ||
197 | schedule_delayed_work(dwork, msecs_to_jiffies(10)); | ||
198 | return; | ||
199 | } | ||
200 | } | ||
201 | |||
202 | spin_lock_irq(&tg_stats_alloc_lock); | ||
203 | 155 | ||
204 | if (!list_empty(&tg_stats_alloc_list)) { | 156 | static void throtl_free_tg(struct rcu_head *head) |
205 | struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list, | 157 | { |
206 | struct throtl_grp, | 158 | struct throtl_grp *tg; |
207 | stats_alloc_node); | ||
208 | swap(tg->stats_cpu, stats_cpu); | ||
209 | list_del_init(&tg->stats_alloc_node); | ||
210 | } | ||
211 | 159 | ||
212 | empty = list_empty(&tg_stats_alloc_list); | 160 | tg = container_of(head, struct throtl_grp, rcu_head); |
213 | spin_unlock_irq(&tg_stats_alloc_lock); | 161 | free_percpu(tg->blkg.stats_cpu); |
214 | if (!empty) | 162 | kfree(tg); |
215 | goto alloc_stats; | ||
216 | } | 163 | } |
217 | 164 | ||
218 | static void throtl_pd_init(struct blkcg_gq *blkg) | 165 | static void throtl_put_tg(struct throtl_grp *tg) |
219 | { | 166 | { |
220 | struct throtl_grp *tg = blkg_to_tg(blkg); | 167 | BUG_ON(atomic_read(&tg->ref) <= 0); |
221 | unsigned long flags; | 168 | if (!atomic_dec_and_test(&tg->ref)) |
169 | return; | ||
222 | 170 | ||
171 | /* | ||
172 | * A group is freed in rcu manner. But having an rcu lock does not | ||
173 | * mean that one can access all the fields of blkg and assume these | ||
174 | * are valid. For example, don't try to follow throtl_data and | ||
175 | * request queue links. | ||
176 | * | ||
177 | * Having a reference to blkg under an rcu allows access to only | ||
178 | * values local to groups like group stats and group rate limits | ||
179 | */ | ||
180 | call_rcu(&tg->rcu_head, throtl_free_tg); | ||
181 | } | ||
182 | |||
183 | static void throtl_init_group(struct throtl_grp *tg) | ||
184 | { | ||
185 | INIT_HLIST_NODE(&tg->tg_node); | ||
223 | RB_CLEAR_NODE(&tg->rb_node); | 186 | RB_CLEAR_NODE(&tg->rb_node); |
224 | bio_list_init(&tg->bio_lists[0]); | 187 | bio_list_init(&tg->bio_lists[0]); |
225 | bio_list_init(&tg->bio_lists[1]); | 188 | bio_list_init(&tg->bio_lists[1]); |
226 | tg->limits_changed = false; | 189 | tg->limits_changed = false; |
227 | 190 | ||
228 | tg->bps[READ] = -1; | 191 | /* Practically unlimited BW */ |
229 | tg->bps[WRITE] = -1; | 192 | tg->bps[0] = tg->bps[1] = -1; |
230 | tg->iops[READ] = -1; | 193 | tg->iops[0] = tg->iops[1] = -1; |
231 | tg->iops[WRITE] = -1; | ||
232 | 194 | ||
233 | /* | 195 | /* |
234 | * Ugh... We need to perform per-cpu allocation for tg->stats_cpu | 196 | * Take the initial reference that will be released on destroy |
235 | * but percpu allocator can't be called from IO path. Queue tg on | 197 | * This can be thought of a joint reference by cgroup and |
236 | * tg_stats_alloc_list and allocate from work item. | 198 | * request queue which will be dropped by either request queue |
199 | * exit or cgroup deletion path depending on who is exiting first. | ||
237 | */ | 200 | */ |
238 | spin_lock_irqsave(&tg_stats_alloc_lock, flags); | 201 | atomic_set(&tg->ref, 1); |
239 | list_add(&tg->stats_alloc_node, &tg_stats_alloc_list); | ||
240 | schedule_delayed_work(&tg_stats_alloc_work, 0); | ||
241 | spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); | ||
242 | } | 202 | } |
243 | 203 | ||
244 | static void throtl_pd_exit(struct blkcg_gq *blkg) | 204 | /* Should be called with rcu read lock held (needed for blkcg) */ |
205 | static void | ||
206 | throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg) | ||
245 | { | 207 | { |
246 | struct throtl_grp *tg = blkg_to_tg(blkg); | 208 | hlist_add_head(&tg->tg_node, &td->tg_list); |
247 | unsigned long flags; | 209 | td->nr_undestroyed_grps++; |
210 | } | ||
248 | 211 | ||
249 | spin_lock_irqsave(&tg_stats_alloc_lock, flags); | 212 | static void |
250 | list_del_init(&tg->stats_alloc_node); | 213 | __throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg) |
251 | spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); | 214 | { |
215 | struct backing_dev_info *bdi = &td->queue->backing_dev_info; | ||
216 | unsigned int major, minor; | ||
217 | |||
218 | if (!tg || tg->blkg.dev) | ||
219 | return; | ||
252 | 220 | ||
253 | free_percpu(tg->stats_cpu); | 221 | /* |
222 | * Fill in device details for a group which might not have been | ||
223 | * filled at group creation time as queue was being instantiated | ||
224 | * and driver had not attached a device yet | ||
225 | */ | ||
226 | if (bdi->dev && dev_name(bdi->dev)) { | ||
227 | sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); | ||
228 | tg->blkg.dev = MKDEV(major, minor); | ||
229 | } | ||
254 | } | 230 | } |
255 | 231 | ||
256 | static void throtl_pd_reset_stats(struct blkcg_gq *blkg) | 232 | /* |
233 | * Should be called without queue lock held. Here queue lock will be | ||
234 | * taken rarely. It will be taken only once during life time of a group | ||
235 | * if need be | ||
236 | */ | ||
237 | static void | ||
238 | throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg) | ||
257 | { | 239 | { |
258 | struct throtl_grp *tg = blkg_to_tg(blkg); | 240 | if (!tg || tg->blkg.dev) |
259 | int cpu; | ||
260 | |||
261 | if (tg->stats_cpu == NULL) | ||
262 | return; | 241 | return; |
263 | 242 | ||
264 | for_each_possible_cpu(cpu) { | 243 | spin_lock_irq(td->queue->queue_lock); |
265 | struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); | 244 | __throtl_tg_fill_dev_details(td, tg); |
245 | spin_unlock_irq(td->queue->queue_lock); | ||
246 | } | ||
247 | |||
248 | static void throtl_init_add_tg_lists(struct throtl_data *td, | ||
249 | struct throtl_grp *tg, struct blkio_cgroup *blkcg) | ||
250 | { | ||
251 | __throtl_tg_fill_dev_details(td, tg); | ||
252 | |||
253 | /* Add group onto cgroup list */ | ||
254 | blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td, | ||
255 | tg->blkg.dev, BLKIO_POLICY_THROTL); | ||
266 | 256 | ||
267 | blkg_rwstat_reset(&sc->service_bytes); | 257 | tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev); |
268 | blkg_rwstat_reset(&sc->serviced); | 258 | tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev); |
259 | tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev); | ||
260 | tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev); | ||
261 | |||
262 | throtl_add_group_to_td_list(td, tg); | ||
263 | } | ||
264 | |||
265 | /* Should be called without queue lock and outside of rcu period */ | ||
266 | static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td) | ||
267 | { | ||
268 | struct throtl_grp *tg = NULL; | ||
269 | int ret; | ||
270 | |||
271 | tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node); | ||
272 | if (!tg) | ||
273 | return NULL; | ||
274 | |||
275 | ret = blkio_alloc_blkg_stats(&tg->blkg); | ||
276 | |||
277 | if (ret) { | ||
278 | kfree(tg); | ||
279 | return NULL; | ||
269 | } | 280 | } |
281 | |||
282 | throtl_init_group(tg); | ||
283 | return tg; | ||
270 | } | 284 | } |
271 | 285 | ||
272 | static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td, | 286 | static struct |
273 | struct blkcg *blkcg) | 287 | throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg) |
274 | { | 288 | { |
289 | struct throtl_grp *tg = NULL; | ||
290 | void *key = td; | ||
291 | |||
275 | /* | 292 | /* |
276 | * This is the common case when there are no blkcgs. Avoid lookup | 293 | * This is the common case when there are no blkio cgroups. |
277 | * in this case | 294 | * Avoid lookup in this case |
278 | */ | 295 | */ |
279 | if (blkcg == &blkcg_root) | 296 | if (blkcg == &blkio_root_cgroup) |
280 | return td_root_tg(td); | 297 | tg = td->root_tg; |
298 | else | ||
299 | tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key)); | ||
281 | 300 | ||
282 | return blkg_to_tg(blkg_lookup(blkcg, td->queue)); | 301 | __throtl_tg_fill_dev_details(td, tg); |
302 | return tg; | ||
283 | } | 303 | } |
284 | 304 | ||
285 | static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td, | 305 | /*
286 | struct blkcg *blkcg) | 306 | * This function returns with the queue lock unlocked in case of error,
307 | * e.g. when the request queue is already gone. | ||
308 | */ | ||
309 | static struct throtl_grp * throtl_get_tg(struct throtl_data *td) | ||
287 | { | 310 | { |
311 | struct throtl_grp *tg = NULL, *__tg = NULL; | ||
312 | struct blkio_cgroup *blkcg; | ||
288 | struct request_queue *q = td->queue; | 313 | struct request_queue *q = td->queue; |
289 | struct throtl_grp *tg = NULL; | ||
290 | 314 | ||
315 | rcu_read_lock(); | ||
316 | blkcg = task_blkio_cgroup(current); | ||
317 | tg = throtl_find_tg(td, blkcg); | ||
318 | if (tg) { | ||
319 | rcu_read_unlock(); | ||
320 | return tg; | ||
321 | } | ||
322 | |||
323 | /* | ||
324 | * Need to allocate a group. Group allocation also needs allocation | ||
325 | * of per-cpu stats, which in turn takes a mutex and can block. Hence | ||
326 | * we need to drop the rcu lock and queue_lock before we call the allocator. | ||
327 | * | ||
328 | * Take the request queue reference to make sure queue does not | ||
329 | * go away once we return from allocation. | ||
330 | */ | ||
331 | blk_get_queue(q); | ||
332 | rcu_read_unlock(); | ||
333 | spin_unlock_irq(q->queue_lock); | ||
334 | |||
335 | tg = throtl_alloc_tg(td); | ||
291 | /* | 336 | /* |
292 | * This is the common case when there are no blkcgs. Avoid lookup | 337 | * We might have slept in group allocation. Make sure queue is not |
293 | * in this case | 338 | * dead |
294 | */ | 339 | */ |
295 | if (blkcg == &blkcg_root) { | 340 | if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { |
296 | tg = td_root_tg(td); | 341 | blk_put_queue(q); |
297 | } else { | 342 | if (tg) |
298 | struct blkcg_gq *blkg; | 343 | kfree(tg); |
299 | 344 | ||
300 | blkg = blkg_lookup_create(blkcg, q); | 345 | return ERR_PTR(-ENODEV); |
301 | |||
302 | /* if %NULL and @q is alive, fall back to root_tg */ | ||
303 | if (!IS_ERR(blkg)) | ||
304 | tg = blkg_to_tg(blkg); | ||
305 | else if (!blk_queue_dying(q)) | ||
306 | tg = td_root_tg(td); | ||
307 | } | 346 | } |
347 | blk_put_queue(q); | ||
348 | |||
349 | /* Group allocated and queue is still alive. Take the lock. */ | ||
350 | spin_lock_irq(q->queue_lock); | ||
308 | 351 | ||
352 | /* | ||
353 | * Initialize the new group. After sleeping, read the blkcg again. | ||
354 | */ | ||
355 | rcu_read_lock(); | ||
356 | blkcg = task_blkio_cgroup(current); | ||
357 | |||
358 | /* | ||
359 | * If some other thread already allocated the group while we were | ||
360 | * not holding queue lock, free up the group | ||
361 | */ | ||
362 | __tg = throtl_find_tg(td, blkcg); | ||
363 | |||
364 | if (__tg) { | ||
365 | kfree(tg); | ||
366 | rcu_read_unlock(); | ||
367 | return __tg; | ||
368 | } | ||
369 | |||
370 | /* Group allocation failed. Account the IO to root group */ | ||
371 | if (!tg) { | ||
372 | tg = td->root_tg; | ||
373 | return tg; | ||
374 | } | ||
375 | |||
376 | throtl_init_add_tg_lists(td, tg, blkcg); | ||
377 | rcu_read_unlock(); | ||
309 | return tg; | 378 | return tg; |
310 | } | 379 | } |
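The restored throtl_get_tg() above follows a drop-locks / allocate / retake-locks / re-check pattern: the per-cpu stats allocation can sleep, so both the rcu read lock and the queue lock are released before it, and after relocking the lookup is repeated in case another thread created the group in the meantime. A minimal sketch of that pattern in isolation; struct ctx, struct obj, obj_lookup(), obj_alloc() and obj_insert() are hypothetical stand-ins for the throttle-specific types and helpers, not part of the patch:

	/* Sketch only: all obj_* helpers and types are made up. */
	static struct obj *get_or_create_obj(struct ctx *c)
	{
		struct obj *o, *existing;

		spin_lock_irq(&c->lock);
		o = obj_lookup(c);
		spin_unlock_irq(&c->lock);
		if (o)
			return o;

		/* Allocation may sleep, so no spinlocks may be held here. */
		o = obj_alloc(GFP_KERNEL);

		spin_lock_irq(&c->lock);
		existing = obj_lookup(c);	/* did somebody beat us to it? */
		if (existing) {
			kfree(o);		/* kfree(NULL) is a no-op */
			o = existing;
		} else if (o) {
			obj_insert(c, o);
		}
		spin_unlock_irq(&c->lock);

		return o;			/* NULL only if the allocation failed */
	}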
311 | 380 | ||
@@ -674,41 +743,16 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, | |||
674 | return 0; | 743 | return 0; |
675 | } | 744 | } |
676 | 745 | ||
677 | static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes, | ||
678 | int rw) | ||
679 | { | ||
680 | struct throtl_grp *tg = blkg_to_tg(blkg); | ||
681 | struct tg_stats_cpu *stats_cpu; | ||
682 | unsigned long flags; | ||
683 | |||
684 | /* If per cpu stats are not allocated yet, don't do any accounting. */ | ||
685 | if (tg->stats_cpu == NULL) | ||
686 | return; | ||
687 | |||
688 | /* | ||
689 | * Disabling interrupts to provide mutual exclusion between two | ||
690 | * writes on same cpu. It probably is not needed for 64bit. Not | ||
691 | * optimizing that case yet. | ||
692 | */ | ||
693 | local_irq_save(flags); | ||
694 | |||
695 | stats_cpu = this_cpu_ptr(tg->stats_cpu); | ||
696 | |||
697 | blkg_rwstat_add(&stats_cpu->serviced, rw, 1); | ||
698 | blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes); | ||
699 | |||
700 | local_irq_restore(flags); | ||
701 | } | ||
702 | |||
703 | static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) | 746 | static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) |
704 | { | 747 | { |
705 | bool rw = bio_data_dir(bio); | 748 | bool rw = bio_data_dir(bio); |
749 | bool sync = rw_is_sync(bio->bi_rw); | ||
706 | 750 | ||
707 | /* Charge the bio to the group */ | 751 | /* Charge the bio to the group */ |
708 | tg->bytes_disp[rw] += bio->bi_size; | 752 | tg->bytes_disp[rw] += bio->bi_size; |
709 | tg->io_disp[rw]++; | 753 | tg->io_disp[rw]++; |
710 | 754 | ||
711 | throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw); | 755 | blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync); |
712 | } | 756 | } |
713 | 757 | ||
714 | static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, | 758 | static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, |
@@ -718,7 +762,7 @@ static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, | |||
718 | 762 | ||
719 | bio_list_add(&tg->bio_lists[rw], bio); | 763 | bio_list_add(&tg->bio_lists[rw], bio); |
720 | /* Take a bio reference on tg */ | 764 | /* Take a bio reference on tg */ |
721 | blkg_get(tg_to_blkg(tg)); | 765 | throtl_ref_get_tg(tg); |
722 | tg->nr_queued[rw]++; | 766 | tg->nr_queued[rw]++; |
723 | td->nr_queued[rw]++; | 767 | td->nr_queued[rw]++; |
724 | throtl_enqueue_tg(td, tg); | 768 | throtl_enqueue_tg(td, tg); |
@@ -751,8 +795,8 @@ static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg, | |||
751 | 795 | ||
752 | bio = bio_list_pop(&tg->bio_lists[rw]); | 796 | bio = bio_list_pop(&tg->bio_lists[rw]); |
753 | tg->nr_queued[rw]--; | 797 | tg->nr_queued[rw]--; |
754 | /* Drop bio reference on blkg */ | 798 | /* Drop bio reference on tg */ |
755 | blkg_put(tg_to_blkg(tg)); | 799 | throtl_put_tg(tg); |
756 | 800 | ||
757 | BUG_ON(td->nr_queued[rw] <= 0); | 801 | BUG_ON(td->nr_queued[rw] <= 0); |
758 | td->nr_queued[rw]--; | 802 | td->nr_queued[rw]--; |
@@ -830,8 +874,8 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl) | |||
830 | 874 | ||
831 | static void throtl_process_limit_change(struct throtl_data *td) | 875 | static void throtl_process_limit_change(struct throtl_data *td) |
832 | { | 876 | { |
833 | struct request_queue *q = td->queue; | 877 | struct throtl_grp *tg; |
834 | struct blkcg_gq *blkg, *n; | 878 | struct hlist_node *pos, *n; |
835 | 879 | ||
836 | if (!td->limits_changed) | 880 | if (!td->limits_changed) |
837 | return; | 881 | return; |
@@ -840,9 +884,7 @@ static void throtl_process_limit_change(struct throtl_data *td) | |||
840 | 884 | ||
841 | throtl_log(td, "limits changed"); | 885 | throtl_log(td, "limits changed"); |
842 | 886 | ||
843 | list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { | 887 | hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { |
844 | struct throtl_grp *tg = blkg_to_tg(blkg); | ||
845 | |||
846 | if (!tg->limits_changed) | 888 | if (!tg->limits_changed) |
847 | continue; | 889 | continue; |
848 | 890 | ||
@@ -929,164 +971,135 @@ throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay) | |||
929 | 971 | ||
930 | /* schedule work if limits changed even if no bio is queued */ | 972 | /* schedule work if limits changed even if no bio is queued */ |
931 | if (total_nr_queued(td) || td->limits_changed) { | 973 | if (total_nr_queued(td) || td->limits_changed) { |
932 | mod_delayed_work(kthrotld_workqueue, dwork, delay); | 974 | /* |
975 | * We might have a work scheduled to be executed in future. | ||
976 | * Cancel that and schedule a new one. | ||
977 | */ | ||
978 | __cancel_delayed_work(dwork); | ||
979 | queue_delayed_work(kthrotld_workqueue, dwork, delay); | ||
933 | throtl_log(td, "schedule work. delay=%lu jiffies=%lu", | 980 | throtl_log(td, "schedule work. delay=%lu jiffies=%lu", |
934 | delay, jiffies); | 981 | delay, jiffies); |
935 | } | 982 | } |
936 | } | 983 | } |
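The code being removed here relied on a single mod_delayed_work() call to move an already pending work item to a new expiry; the restored code emulates the same behaviour with __cancel_delayed_work() followed by queue_delayed_work(). A small sketch of that idiom, assuming the delayed_work item was initialized elsewhere with INIT_DELAYED_WORK():

	/* Reschedule dwork to run 'delay' jiffies from now. __cancel_delayed_work()
	 * only removes a pending timer (it does not wait for a running callback),
	 * which is what the throttle work rescheduling above relies on. */
	static void requeue_delayed(struct workqueue_struct *wq,
				    struct delayed_work *dwork, unsigned long delay)
	{
		__cancel_delayed_work(dwork);
		queue_delayed_work(wq, dwork, delay);
	}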
937 | 984 | ||
938 | static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, | 985 | static void |
939 | struct blkg_policy_data *pd, int off) | 986 | throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg) |
940 | { | 987 | { |
941 | struct throtl_grp *tg = pd_to_tg(pd); | 988 | /* Something wrong if we are trying to remove same group twice */ |
942 | struct blkg_rwstat rwstat = { }, tmp; | 989 | BUG_ON(hlist_unhashed(&tg->tg_node)); |
943 | int i, cpu; | ||
944 | 990 | ||
945 | for_each_possible_cpu(cpu) { | 991 | hlist_del_init(&tg->tg_node); |
946 | struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); | ||
947 | |||
948 | tmp = blkg_rwstat_read((void *)sc + off); | ||
949 | for (i = 0; i < BLKG_RWSTAT_NR; i++) | ||
950 | rwstat.cnt[i] += tmp.cnt[i]; | ||
951 | } | ||
952 | 992 | ||
953 | return __blkg_prfill_rwstat(sf, pd, &rwstat); | 993 | /* |
994 | * Put the reference taken at the time of creation so that when all | ||
995 | * queues are gone, group can be destroyed. | ||
996 | */ | ||
997 | throtl_put_tg(tg); | ||
998 | td->nr_undestroyed_grps--; | ||
954 | } | 999 | } |
955 | 1000 | ||
956 | static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft, | 1001 | static void throtl_release_tgs(struct throtl_data *td) |
957 | struct seq_file *sf) | ||
958 | { | 1002 | { |
959 | struct blkcg *blkcg = cgroup_to_blkcg(cgrp); | 1003 | struct hlist_node *pos, *n; |
1004 | struct throtl_grp *tg; | ||
960 | 1005 | ||
961 | blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl, | 1006 | hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { |
962 | cft->private, true); | 1007 | /* |
963 | return 0; | 1008 | * If cgroup removal path got to blk_group first and removed |
1009 | * it from cgroup list, then it will take care of destroying | ||
1010 | * cfqg also. | ||
1011 | */ | ||
1012 | if (!blkiocg_del_blkio_group(&tg->blkg)) | ||
1013 | throtl_destroy_tg(td, tg); | ||
1014 | } | ||
964 | } | 1015 | } |
965 | 1016 | ||
966 | static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, | 1017 | static void throtl_td_free(struct throtl_data *td) |
967 | int off) | ||
968 | { | 1018 | { |
969 | struct throtl_grp *tg = pd_to_tg(pd); | 1019 | kfree(td); |
970 | u64 v = *(u64 *)((void *)tg + off); | ||
971 | |||
972 | if (v == -1) | ||
973 | return 0; | ||
974 | return __blkg_prfill_u64(sf, pd, v); | ||
975 | } | 1020 | } |
976 | 1021 | ||
977 | static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd, | 1022 | /* |
978 | int off) | 1023 | * Blk cgroup controller notification saying that blkio_group object is being |
1024 | * delinked as associated cgroup object is going away. That also means that | ||
1025 | * no new IO will come in this group. So get rid of this group as soon as | ||
1026 | * any pending IO in the group is finished. | ||
1027 | * | ||
1028 | * This function is called under rcu_read_lock(). key is the rcu protected | ||
1029 | * pointer. That means "key" is a valid throtl_data pointer as long as we are | ||
1030 | * rcu read lock. | ||
1031 | * | ||
1032 | * "key" was fetched from blkio_group under blkio_cgroup->lock. That means | ||
1033 | * it should not be NULL as even if queue was going away, cgroup deltion | ||
1034 | * path got to it first. | ||
1035 | */ | ||
1036 | void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg) | ||
979 | { | 1037 | { |
980 | struct throtl_grp *tg = pd_to_tg(pd); | 1038 | unsigned long flags; |
981 | unsigned int v = *(unsigned int *)((void *)tg + off); | 1039 | struct throtl_data *td = key; |
982 | 1040 | ||
983 | if (v == -1) | 1041 | spin_lock_irqsave(td->queue->queue_lock, flags); |
984 | return 0; | 1042 | throtl_destroy_tg(td, tg_of_blkg(blkg)); |
985 | return __blkg_prfill_u64(sf, pd, v); | 1043 | spin_unlock_irqrestore(td->queue->queue_lock, flags); |
986 | } | 1044 | } |
987 | 1045 | ||
988 | static int tg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft, | 1046 | static void throtl_update_blkio_group_common(struct throtl_data *td, |
989 | struct seq_file *sf) | 1047 | struct throtl_grp *tg) |
990 | { | 1048 | { |
991 | blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_u64, | 1049 | xchg(&tg->limits_changed, true); |
992 | &blkcg_policy_throtl, cft->private, false); | 1050 | xchg(&td->limits_changed, true); |
993 | return 0; | 1051 | /* Schedule a work now to process the limit change */ |
1052 | throtl_schedule_delayed_work(td, 0); | ||
994 | } | 1053 | } |
995 | 1054 | ||
996 | static int tg_print_conf_uint(struct cgroup *cgrp, struct cftype *cft, | 1055 | /* |
997 | struct seq_file *sf) | 1056 | * For all update functions, key should be a valid pointer because these |
1057 | * update functions are called under blkcg_lock, that means, blkg is | ||
1058 | * valid and in turn key is valid. queue exit path can not race because | ||
1059 | * of blkcg_lock | ||
1060 | * | ||
1061 | * Can not take queue lock in update functions as queue lock under blkcg_lock | ||
1062 | * is not allowed. Under other paths we take blkcg_lock under queue_lock. | ||
1063 | */ | ||
1064 | static void throtl_update_blkio_group_read_bps(void *key, | ||
1065 | struct blkio_group *blkg, u64 read_bps) | ||
998 | { | 1066 | { |
999 | blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_uint, | 1067 | struct throtl_data *td = key; |
1000 | &blkcg_policy_throtl, cft->private, false); | 1068 | struct throtl_grp *tg = tg_of_blkg(blkg); |
1001 | return 0; | 1069 | |
1070 | tg->bps[READ] = read_bps; | ||
1071 | throtl_update_blkio_group_common(td, tg); | ||
1002 | } | 1072 | } |
1003 | 1073 | ||
1004 | static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf, | 1074 | static void throtl_update_blkio_group_write_bps(void *key, |
1005 | bool is_u64) | 1075 | struct blkio_group *blkg, u64 write_bps) |
1006 | { | 1076 | { |
1007 | struct blkcg *blkcg = cgroup_to_blkcg(cgrp); | 1077 | struct throtl_data *td = key; |
1008 | struct blkg_conf_ctx ctx; | 1078 | struct throtl_grp *tg = tg_of_blkg(blkg); |
1009 | struct throtl_grp *tg; | ||
1010 | struct throtl_data *td; | ||
1011 | int ret; | ||
1012 | |||
1013 | ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); | ||
1014 | if (ret) | ||
1015 | return ret; | ||
1016 | |||
1017 | tg = blkg_to_tg(ctx.blkg); | ||
1018 | td = ctx.blkg->q->td; | ||
1019 | |||
1020 | if (!ctx.v) | ||
1021 | ctx.v = -1; | ||
1022 | |||
1023 | if (is_u64) | ||
1024 | *(u64 *)((void *)tg + cft->private) = ctx.v; | ||
1025 | else | ||
1026 | *(unsigned int *)((void *)tg + cft->private) = ctx.v; | ||
1027 | |||
1028 | /* XXX: we don't need the following deferred processing */ | ||
1029 | xchg(&tg->limits_changed, true); | ||
1030 | xchg(&td->limits_changed, true); | ||
1031 | throtl_schedule_delayed_work(td, 0); | ||
1032 | 1079 | ||
1033 | blkg_conf_finish(&ctx); | 1080 | tg->bps[WRITE] = write_bps; |
1034 | return 0; | 1081 | throtl_update_blkio_group_common(td, tg); |
1035 | } | 1082 | } |
1036 | 1083 | ||
1037 | static int tg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft, | 1084 | static void throtl_update_blkio_group_read_iops(void *key, |
1038 | const char *buf) | 1085 | struct blkio_group *blkg, unsigned int read_iops) |
1039 | { | 1086 | { |
1040 | return tg_set_conf(cgrp, cft, buf, true); | 1087 | struct throtl_data *td = key; |
1088 | struct throtl_grp *tg = tg_of_blkg(blkg); | ||
1089 | |||
1090 | tg->iops[READ] = read_iops; | ||
1091 | throtl_update_blkio_group_common(td, tg); | ||
1041 | } | 1092 | } |
1042 | 1093 | ||
1043 | static int tg_set_conf_uint(struct cgroup *cgrp, struct cftype *cft, | 1094 | static void throtl_update_blkio_group_write_iops(void *key, |
1044 | const char *buf) | 1095 | struct blkio_group *blkg, unsigned int write_iops) |
1045 | { | 1096 | { |
1046 | return tg_set_conf(cgrp, cft, buf, false); | 1097 | struct throtl_data *td = key; |
1047 | } | 1098 | struct throtl_grp *tg = tg_of_blkg(blkg); |
1048 | 1099 | ||
1049 | static struct cftype throtl_files[] = { | 1100 | tg->iops[WRITE] = write_iops; |
1050 | { | 1101 | throtl_update_blkio_group_common(td, tg); |
1051 | .name = "throttle.read_bps_device", | 1102 | } |
1052 | .private = offsetof(struct throtl_grp, bps[READ]), | ||
1053 | .read_seq_string = tg_print_conf_u64, | ||
1054 | .write_string = tg_set_conf_u64, | ||
1055 | .max_write_len = 256, | ||
1056 | }, | ||
1057 | { | ||
1058 | .name = "throttle.write_bps_device", | ||
1059 | .private = offsetof(struct throtl_grp, bps[WRITE]), | ||
1060 | .read_seq_string = tg_print_conf_u64, | ||
1061 | .write_string = tg_set_conf_u64, | ||
1062 | .max_write_len = 256, | ||
1063 | }, | ||
1064 | { | ||
1065 | .name = "throttle.read_iops_device", | ||
1066 | .private = offsetof(struct throtl_grp, iops[READ]), | ||
1067 | .read_seq_string = tg_print_conf_uint, | ||
1068 | .write_string = tg_set_conf_uint, | ||
1069 | .max_write_len = 256, | ||
1070 | }, | ||
1071 | { | ||
1072 | .name = "throttle.write_iops_device", | ||
1073 | .private = offsetof(struct throtl_grp, iops[WRITE]), | ||
1074 | .read_seq_string = tg_print_conf_uint, | ||
1075 | .write_string = tg_set_conf_uint, | ||
1076 | .max_write_len = 256, | ||
1077 | }, | ||
1078 | { | ||
1079 | .name = "throttle.io_service_bytes", | ||
1080 | .private = offsetof(struct tg_stats_cpu, service_bytes), | ||
1081 | .read_seq_string = tg_print_cpu_rwstat, | ||
1082 | }, | ||
1083 | { | ||
1084 | .name = "throttle.io_serviced", | ||
1085 | .private = offsetof(struct tg_stats_cpu, serviced), | ||
1086 | .read_seq_string = tg_print_cpu_rwstat, | ||
1087 | }, | ||
1088 | { } /* terminate */ | ||
1089 | }; | ||
1090 | 1103 | ||
1091 | static void throtl_shutdown_wq(struct request_queue *q) | 1104 | static void throtl_shutdown_wq(struct request_queue *q) |
1092 | { | 1105 | { |
@@ -1095,26 +1108,32 @@ static void throtl_shutdown_wq(struct request_queue *q) | |||
1095 | cancel_delayed_work_sync(&td->throtl_work); | 1108 | cancel_delayed_work_sync(&td->throtl_work); |
1096 | } | 1109 | } |
1097 | 1110 | ||
1098 | static struct blkcg_policy blkcg_policy_throtl = { | 1111 | static struct blkio_policy_type blkio_policy_throtl = { |
1099 | .pd_size = sizeof(struct throtl_grp), | 1112 | .ops = { |
1100 | .cftypes = throtl_files, | 1113 | .blkio_unlink_group_fn = throtl_unlink_blkio_group, |
1101 | 1114 | .blkio_update_group_read_bps_fn = | |
1102 | .pd_init_fn = throtl_pd_init, | 1115 | throtl_update_blkio_group_read_bps, |
1103 | .pd_exit_fn = throtl_pd_exit, | 1116 | .blkio_update_group_write_bps_fn = |
1104 | .pd_reset_stats_fn = throtl_pd_reset_stats, | 1117 | throtl_update_blkio_group_write_bps, |
1118 | .blkio_update_group_read_iops_fn = | ||
1119 | throtl_update_blkio_group_read_iops, | ||
1120 | .blkio_update_group_write_iops_fn = | ||
1121 | throtl_update_blkio_group_write_iops, | ||
1122 | }, | ||
1123 | .plid = BLKIO_POLICY_THROTL, | ||
1105 | }; | 1124 | }; |
1106 | 1125 | ||
1107 | bool blk_throtl_bio(struct request_queue *q, struct bio *bio) | 1126 | int blk_throtl_bio(struct request_queue *q, struct bio **biop) |
1108 | { | 1127 | { |
1109 | struct throtl_data *td = q->td; | 1128 | struct throtl_data *td = q->td; |
1110 | struct throtl_grp *tg; | 1129 | struct throtl_grp *tg; |
1130 | struct bio *bio = *biop; | ||
1111 | bool rw = bio_data_dir(bio), update_disptime = true; | 1131 | bool rw = bio_data_dir(bio), update_disptime = true; |
1112 | struct blkcg *blkcg; | 1132 | struct blkio_cgroup *blkcg; |
1113 | bool throttled = false; | ||
1114 | 1133 | ||
1115 | if (bio->bi_rw & REQ_THROTTLED) { | 1134 | if (bio->bi_rw & REQ_THROTTLED) { |
1116 | bio->bi_rw &= ~REQ_THROTTLED; | 1135 | bio->bi_rw &= ~REQ_THROTTLED; |
1117 | goto out; | 1136 | return 0; |
1118 | } | 1137 | } |
1119 | 1138 | ||
1120 | /* | 1139 | /* |
@@ -1122,25 +1141,38 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) | |||
1122 | * basic fields like stats and io rates. If a group has no rules, | 1141 | * basic fields like stats and io rates. If a group has no rules, |
1123 | * just update the dispatch stats in lockless manner and return. | 1142 | * just update the dispatch stats in lockless manner and return. |
1124 | */ | 1143 | */ |
1144 | |||
1125 | rcu_read_lock(); | 1145 | rcu_read_lock(); |
1126 | blkcg = bio_blkcg(bio); | 1146 | blkcg = task_blkio_cgroup(current); |
1127 | tg = throtl_lookup_tg(td, blkcg); | 1147 | tg = throtl_find_tg(td, blkcg); |
1128 | if (tg) { | 1148 | if (tg) { |
1149 | throtl_tg_fill_dev_details(td, tg); | ||
1150 | |||
1129 | if (tg_no_rule_group(tg, rw)) { | 1151 | if (tg_no_rule_group(tg, rw)) { |
1130 | throtl_update_dispatch_stats(tg_to_blkg(tg), | 1152 | blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, |
1131 | bio->bi_size, bio->bi_rw); | 1153 | rw, rw_is_sync(bio->bi_rw)); |
1132 | goto out_unlock_rcu; | 1154 | rcu_read_unlock(); |
1155 | return 0; | ||
1133 | } | 1156 | } |
1134 | } | 1157 | } |
1158 | rcu_read_unlock(); | ||
1135 | 1159 | ||
1136 | /* | 1160 | /* |
1137 | * Either group has not been allocated yet or it is not an unlimited | 1161 | * Either group has not been allocated yet or it is not an unlimited |
1138 | * IO group | 1162 | * IO group |
1139 | */ | 1163 | */ |
1164 | |||
1140 | spin_lock_irq(q->queue_lock); | 1165 | spin_lock_irq(q->queue_lock); |
1141 | tg = throtl_lookup_create_tg(td, blkcg); | 1166 | tg = throtl_get_tg(td); |
1142 | if (unlikely(!tg)) | 1167 | |
1143 | goto out_unlock; | 1168 | if (IS_ERR(tg)) { |
1169 | if (PTR_ERR(tg) == -ENODEV) { | ||
1170 | /* | ||
1171 | * Queue is gone. No queue lock held here. | ||
1172 | */ | ||
1173 | return -ENODEV; | ||
1174 | } | ||
1175 | } | ||
1144 | 1176 | ||
1145 | if (tg->nr_queued[rw]) { | 1177 | if (tg->nr_queued[rw]) { |
1146 | /* | 1178 | /* |
@@ -1168,7 +1200,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) | |||
1168 | * So keep on trimming slice even if bio is not queued. | 1200 | * So keep on trimming slice even if bio is not queued. |
1169 | */ | 1201 | */ |
1170 | throtl_trim_slice(td, tg, rw); | 1202 | throtl_trim_slice(td, tg, rw); |
1171 | goto out_unlock; | 1203 | goto out; |
1172 | } | 1204 | } |
1173 | 1205 | ||
1174 | queue_bio: | 1206 | queue_bio: |
@@ -1179,87 +1211,92 @@ queue_bio: | |||
1179 | tg->io_disp[rw], tg->iops[rw], | 1211 | tg->io_disp[rw], tg->iops[rw], |
1180 | tg->nr_queued[READ], tg->nr_queued[WRITE]); | 1212 | tg->nr_queued[READ], tg->nr_queued[WRITE]); |
1181 | 1213 | ||
1182 | bio_associate_current(bio); | ||
1183 | throtl_add_bio_tg(q->td, tg, bio); | 1214 | throtl_add_bio_tg(q->td, tg, bio); |
1184 | throttled = true; | 1215 | *biop = NULL; |
1185 | 1216 | ||
1186 | if (update_disptime) { | 1217 | if (update_disptime) { |
1187 | tg_update_disptime(td, tg); | 1218 | tg_update_disptime(td, tg); |
1188 | throtl_schedule_next_dispatch(td); | 1219 | throtl_schedule_next_dispatch(td); |
1189 | } | 1220 | } |
1190 | 1221 | ||
1191 | out_unlock: | ||
1192 | spin_unlock_irq(q->queue_lock); | ||
1193 | out_unlock_rcu: | ||
1194 | rcu_read_unlock(); | ||
1195 | out: | 1222 | out: |
1196 | return throttled; | ||
1197 | } | ||
1198 | |||
1199 | /** | ||
1200 | * blk_throtl_drain - drain throttled bios | ||
1201 | * @q: request_queue to drain throttled bios for | ||
1202 | * | ||
1203 | * Dispatch all currently throttled bios on @q through ->make_request_fn(). | ||
1204 | */ | ||
1205 | void blk_throtl_drain(struct request_queue *q) | ||
1206 | __releases(q->queue_lock) __acquires(q->queue_lock) | ||
1207 | { | ||
1208 | struct throtl_data *td = q->td; | ||
1209 | struct throtl_rb_root *st = &td->tg_service_tree; | ||
1210 | struct throtl_grp *tg; | ||
1211 | struct bio_list bl; | ||
1212 | struct bio *bio; | ||
1213 | |||
1214 | queue_lockdep_assert_held(q); | ||
1215 | |||
1216 | bio_list_init(&bl); | ||
1217 | |||
1218 | while ((tg = throtl_rb_first(st))) { | ||
1219 | throtl_dequeue_tg(td, tg); | ||
1220 | |||
1221 | while ((bio = bio_list_peek(&tg->bio_lists[READ]))) | ||
1222 | tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl); | ||
1223 | while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))) | ||
1224 | tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl); | ||
1225 | } | ||
1226 | spin_unlock_irq(q->queue_lock); | 1223 | spin_unlock_irq(q->queue_lock); |
1227 | 1224 | return 0; | |
1228 | while ((bio = bio_list_pop(&bl))) | ||
1229 | generic_make_request(bio); | ||
1230 | |||
1231 | spin_lock_irq(q->queue_lock); | ||
1232 | } | 1225 | } |
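With the restored signature, blk_throtl_bio() communicates "bio was throttled and queued" by clearing the caller's bio pointer instead of returning a bool, and signals a dead queue with a negative errno. A hedged sketch of how a submission path might consume it; the caller shown is purely illustrative, not the actual make_request code:

	/* Illustrative caller only; error handling reduced to the essentials. */
	static int submit_with_throttle(struct request_queue *q, struct bio *bio)
	{
		int ret = blk_throtl_bio(q, &bio);

		if (ret)
			return ret;	/* e.g. -ENODEV: queue went away */
		if (!bio)
			return 0;	/* bio was queued by the throttler */

		/* Not throttled: continue down the normal submission path here. */
		return 0;
	}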
1233 | 1226 | ||
1234 | int blk_throtl_init(struct request_queue *q) | 1227 | int blk_throtl_init(struct request_queue *q) |
1235 | { | 1228 | { |
1236 | struct throtl_data *td; | 1229 | struct throtl_data *td; |
1237 | int ret; | 1230 | struct throtl_grp *tg; |
1238 | 1231 | ||
1239 | td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); | 1232 | td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); |
1240 | if (!td) | 1233 | if (!td) |
1241 | return -ENOMEM; | 1234 | return -ENOMEM; |
1242 | 1235 | ||
1236 | INIT_HLIST_HEAD(&td->tg_list); | ||
1243 | td->tg_service_tree = THROTL_RB_ROOT; | 1237 | td->tg_service_tree = THROTL_RB_ROOT; |
1244 | td->limits_changed = false; | 1238 | td->limits_changed = false; |
1245 | INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); | 1239 | INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); |
1246 | 1240 | ||
1247 | q->td = td; | 1241 | /* Allocate and init the root group. */
1248 | td->queue = q; | 1242 | td->queue = q; |
1243 | tg = throtl_alloc_tg(td); | ||
1249 | 1244 | ||
1250 | /* activate policy */ | 1245 | if (!tg) { |
1251 | ret = blkcg_activate_policy(q, &blkcg_policy_throtl); | ||
1252 | if (ret) | ||
1253 | kfree(td); | 1246 | kfree(td); |
1254 | return ret; | 1247 | return -ENOMEM; |
1248 | } | ||
1249 | |||
1250 | td->root_tg = tg; | ||
1251 | |||
1252 | rcu_read_lock(); | ||
1253 | throtl_init_add_tg_lists(td, tg, &blkio_root_cgroup); | ||
1254 | rcu_read_unlock(); | ||
1255 | |||
1256 | /* Attach throtl data to request queue */ | ||
1257 | q->td = td; | ||
1258 | return 0; | ||
1255 | } | 1259 | } |
1256 | 1260 | ||
1257 | void blk_throtl_exit(struct request_queue *q) | 1261 | void blk_throtl_exit(struct request_queue *q) |
1258 | { | 1262 | { |
1259 | BUG_ON(!q->td); | 1263 | struct throtl_data *td = q->td; |
1264 | bool wait = false; | ||
1265 | |||
1266 | BUG_ON(!td); | ||
1267 | |||
1260 | throtl_shutdown_wq(q); | 1268 | throtl_shutdown_wq(q); |
1261 | blkcg_deactivate_policy(q, &blkcg_policy_throtl); | 1269 | |
1262 | kfree(q->td); | 1270 | spin_lock_irq(q->queue_lock); |
1271 | throtl_release_tgs(td); | ||
1272 | |||
1273 | /* If there are other groups */ | ||
1274 | if (td->nr_undestroyed_grps > 0) | ||
1275 | wait = true; | ||
1276 | |||
1277 | spin_unlock_irq(q->queue_lock); | ||
1278 | |||
1279 | /* | ||
1280 | * Wait for tg->blkg->key accessors to exit their grace periods. | ||
1281 | * Do this wait only if there are other undestroyed groups out | ||
1282 | * there (other than the root group). This can happen if the cgroup deletion | ||
1283 | * path claimed the responsibility of cleaning up a group before the | ||
1284 | * queue cleanup code gets to the group. | ||
1285 | * | ||
1286 | * Do not call synchronize_rcu() unconditionally as there are drivers | ||
1287 | * which create/delete request queues hundreds of times during scan/boot | ||
1288 | * and synchronize_rcu() can take significant time and slow down boot. | ||
1289 | */ | ||
1290 | if (wait) | ||
1291 | synchronize_rcu(); | ||
1292 | |||
1293 | /* | ||
1294 | * Just to be safe: if somebody updated limits through the cgroup after | ||
1295 | * the previous flush and another work item got queued, cancel | ||
1296 | * it. | ||
1297 | */ | ||
1298 | throtl_shutdown_wq(q); | ||
1299 | throtl_td_free(td); | ||
1263 | } | 1300 | } |
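blk_throtl_exit() above only pays for a full RCU grace period when groups other than the root survived until queue teardown; otherwise the potentially slow synchronize_rcu() is skipped. The same "conditional grace period before free" shape, abstracted away from the throttle structures (struct ctx, release_children() and nr_undestroyed are hypothetical):

	/* Sketch: wait for RCU readers only when something may still reference us. */
	static void teardown_ctx(struct ctx *c)
	{
		bool wait;

		spin_lock_irq(&c->lock);
		release_children(c);			/* hypothetical helper */
		wait = c->nr_undestroyed > 0;		/* anything left for readers? */
		spin_unlock_irq(&c->lock);

		if (wait)
			synchronize_rcu();		/* let rcu_read_lock() sections drain */

		kfree(c);
	}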
1264 | 1301 | ||
1265 | static int __init throtl_init(void) | 1302 | static int __init throtl_init(void) |
@@ -1268,7 +1305,8 @@ static int __init throtl_init(void) | |||
1268 | if (!kthrotld_workqueue) | 1305 | if (!kthrotld_workqueue) |
1269 | panic("Failed to create kthrotld\n"); | 1306 | panic("Failed to create kthrotld\n"); |
1270 | 1307 | ||
1271 | return blkcg_policy_register(&blkcg_policy_throtl); | 1308 | blkio_policy_register(&blkio_policy_throtl); |
1309 | return 0; | ||
1272 | } | 1310 | } |
1273 | 1311 | ||
1274 | module_init(throtl_init); | 1312 | module_init(throtl_init); |
diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 6e4744cbfb5..78035488895 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c | |||
@@ -197,3 +197,44 @@ void blk_add_timer(struct request *req) | |||
197 | mod_timer(&q->timeout, expiry); | 197 | mod_timer(&q->timeout, expiry); |
198 | } | 198 | } |
199 | 199 | ||
200 | /** | ||
201 | * blk_abort_queue -- Abort all request on given queue | ||
202 | * @queue: pointer to queue | ||
203 | * | ||
204 | */ | ||
205 | void blk_abort_queue(struct request_queue *q) | ||
206 | { | ||
207 | unsigned long flags; | ||
208 | struct request *rq, *tmp; | ||
209 | LIST_HEAD(list); | ||
210 | |||
211 | /* | ||
212 | * Not a request based block device, nothing to abort | ||
213 | */ | ||
214 | if (!q->request_fn) | ||
215 | return; | ||
216 | |||
217 | spin_lock_irqsave(q->queue_lock, flags); | ||
218 | |||
219 | elv_abort_queue(q); | ||
220 | |||
221 | /* | ||
222 | * Splice entries to local list, to avoid deadlocking if entries | ||
223 | * get re-added to the timeout list by error handling | ||
224 | */ | ||
225 | list_splice_init(&q->timeout_list, &list); | ||
226 | |||
227 | list_for_each_entry_safe(rq, tmp, &list, timeout_list) | ||
228 | blk_abort_request(rq); | ||
229 | |||
230 | /* | ||
231 | * Occasionally, blk_abort_request() will return without | ||
232 | * deleting the element from the list. Make sure we add those back | ||
233 | * instead of leaving them on the local stack list. | ||
234 | */ | ||
235 | list_splice(&list, &q->timeout_list); | ||
236 | |||
237 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
238 | |||
239 | } | ||
240 | EXPORT_SYMBOL_GPL(blk_abort_queue); | ||
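blk_abort_queue() splices q->timeout_list onto a stack-local list before walking it, so that error handling which re-adds requests to the timeout list cannot turn the walk into an endless loop, and it splices any leftovers back afterwards. A compact sketch of that splice-and-walk idiom; struct item, its node member and process_item() are hypothetical:

	/* Sketch: process_item() may remove 'it' from the local list or leave it. */
	static void drain_resilient(spinlock_t *lock, struct list_head *src)
	{
		LIST_HEAD(local);
		struct item *it, *tmp;
		unsigned long flags;

		spin_lock_irqsave(lock, flags);

		list_splice_init(src, &local);		/* detach everything first */

		list_for_each_entry_safe(it, tmp, &local, node)
			process_item(it);

		list_splice(&local, src);		/* put leftovers back */

		spin_unlock_irqrestore(lock, flags);
	}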
diff --git a/block/blk.h b/block/blk.h index 47fdfdd4152..20b900a377c 100644 --- a/block/blk.h +++ b/block/blk.h | |||
@@ -1,8 +1,6 @@ | |||
1 | #ifndef BLK_INTERNAL_H | 1 | #ifndef BLK_INTERNAL_H |
2 | #define BLK_INTERNAL_H | 2 | #define BLK_INTERNAL_H |
3 | 3 | ||
4 | #include <linux/idr.h> | ||
5 | |||
6 | /* Amount of time in which a process may batch requests */ | 4 | /* Amount of time in which a process may batch requests */ |
7 | #define BLK_BATCH_TIME (HZ/50UL) | 5 | #define BLK_BATCH_TIME (HZ/50UL) |
8 | 6 | ||
@@ -11,23 +9,12 @@ | |||
11 | 9 | ||
12 | extern struct kmem_cache *blk_requestq_cachep; | 10 | extern struct kmem_cache *blk_requestq_cachep; |
13 | extern struct kobj_type blk_queue_ktype; | 11 | extern struct kobj_type blk_queue_ktype; |
14 | extern struct ida blk_queue_ida; | ||
15 | |||
16 | static inline void __blk_get_queue(struct request_queue *q) | ||
17 | { | ||
18 | kobject_get(&q->kobj); | ||
19 | } | ||
20 | 12 | ||
21 | int blk_init_rl(struct request_list *rl, struct request_queue *q, | ||
22 | gfp_t gfp_mask); | ||
23 | void blk_exit_rl(struct request_list *rl); | ||
24 | void init_request_from_bio(struct request *req, struct bio *bio); | 13 | void init_request_from_bio(struct request *req, struct bio *bio); |
25 | void blk_rq_bio_prep(struct request_queue *q, struct request *rq, | 14 | void blk_rq_bio_prep(struct request_queue *q, struct request *rq, |
26 | struct bio *bio); | 15 | struct bio *bio); |
27 | int blk_rq_append_bio(struct request_queue *q, struct request *rq, | 16 | int blk_rq_append_bio(struct request_queue *q, struct request *rq, |
28 | struct bio *bio); | 17 | struct bio *bio); |
29 | void blk_queue_bypass_start(struct request_queue *q); | ||
30 | void blk_queue_bypass_end(struct request_queue *q); | ||
31 | void blk_dequeue_request(struct request *rq); | 18 | void blk_dequeue_request(struct request *rq); |
32 | void __blk_queue_free_tags(struct request_queue *q); | 19 | void __blk_queue_free_tags(struct request_queue *q); |
33 | bool __blk_end_bidi_request(struct request *rq, int error, | 20 | bool __blk_end_bidi_request(struct request *rq, int error, |
@@ -36,6 +23,7 @@ bool __blk_end_bidi_request(struct request *rq, int error, | |||
36 | void blk_rq_timed_out_timer(unsigned long data); | 23 | void blk_rq_timed_out_timer(unsigned long data); |
37 | void blk_delete_timer(struct request *); | 24 | void blk_delete_timer(struct request *); |
38 | void blk_add_timer(struct request *); | 25 | void blk_add_timer(struct request *); |
26 | void __generic_unplug_device(struct request_queue *); | ||
39 | 27 | ||
40 | /* | 28 | /* |
41 | * Internal atomic flags for request handling | 29 | * Internal atomic flags for request handling |
@@ -96,8 +84,8 @@ static inline struct request *__elv_next_request(struct request_queue *q) | |||
96 | q->flush_queue_delayed = 1; | 84 | q->flush_queue_delayed = 1; |
97 | return NULL; | 85 | return NULL; |
98 | } | 86 | } |
99 | if (unlikely(blk_queue_dying(q)) || | 87 | if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags) || |
100 | !q->elevator->type->ops.elevator_dispatch_fn(q, 0)) | 88 | !q->elevator->ops->elevator_dispatch_fn(q, 0)) |
101 | return NULL; | 89 | return NULL; |
102 | } | 90 | } |
103 | } | 91 | } |
@@ -106,16 +94,16 @@ static inline void elv_activate_rq(struct request_queue *q, struct request *rq) | |||
106 | { | 94 | { |
107 | struct elevator_queue *e = q->elevator; | 95 | struct elevator_queue *e = q->elevator; |
108 | 96 | ||
109 | if (e->type->ops.elevator_activate_req_fn) | 97 | if (e->ops->elevator_activate_req_fn) |
110 | e->type->ops.elevator_activate_req_fn(q, rq); | 98 | e->ops->elevator_activate_req_fn(q, rq); |
111 | } | 99 | } |
112 | 100 | ||
113 | static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq) | 101 | static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq) |
114 | { | 102 | { |
115 | struct elevator_queue *e = q->elevator; | 103 | struct elevator_queue *e = q->elevator; |
116 | 104 | ||
117 | if (e->type->ops.elevator_deactivate_req_fn) | 105 | if (e->ops->elevator_deactivate_req_fn) |
118 | e->type->ops.elevator_deactivate_req_fn(q, rq); | 106 | e->ops->elevator_deactivate_req_fn(q, rq); |
119 | } | 107 | } |
120 | 108 | ||
121 | #ifdef CONFIG_FAIL_IO_TIMEOUT | 109 | #ifdef CONFIG_FAIL_IO_TIMEOUT |
@@ -130,6 +118,8 @@ static inline int blk_should_fake_timeout(struct request_queue *q) | |||
130 | } | 118 | } |
131 | #endif | 119 | #endif |
132 | 120 | ||
121 | struct io_context *current_io_context(gfp_t gfp_flags, int node); | ||
122 | |||
133 | int ll_back_merge_fn(struct request_queue *q, struct request *req, | 123 | int ll_back_merge_fn(struct request_queue *q, struct request *req, |
134 | struct bio *bio); | 124 | struct bio *bio); |
135 | int ll_front_merge_fn(struct request_queue *q, struct request *req, | 125 | int ll_front_merge_fn(struct request_queue *q, struct request *req, |
@@ -140,15 +130,14 @@ int blk_attempt_req_merge(struct request_queue *q, struct request *rq, | |||
140 | struct request *next); | 130 | struct request *next); |
141 | void blk_recalc_rq_segments(struct request *rq); | 131 | void blk_recalc_rq_segments(struct request *rq); |
142 | void blk_rq_set_mixed_merge(struct request *rq); | 132 | void blk_rq_set_mixed_merge(struct request *rq); |
143 | bool blk_rq_merge_ok(struct request *rq, struct bio *bio); | ||
144 | int blk_try_merge(struct request *rq, struct bio *bio); | ||
145 | 133 | ||
146 | void blk_queue_congestion_threshold(struct request_queue *q); | 134 | void blk_queue_congestion_threshold(struct request_queue *q); |
147 | 135 | ||
148 | void __blk_run_queue_uncond(struct request_queue *q); | ||
149 | |||
150 | int blk_dev_init(void); | 136 | int blk_dev_init(void); |
151 | 137 | ||
138 | void elv_quiesce_start(struct request_queue *q); | ||
139 | void elv_quiesce_end(struct request_queue *q); | ||
140 | |||
152 | 141 | ||
153 | /* | 142 | /* |
154 | * Return the threshold (number of used requests) at which the queue is | 143 | * Return the threshold (number of used requests) at which the queue is |
@@ -168,67 +157,35 @@ static inline int queue_congestion_off_threshold(struct request_queue *q) | |||
168 | return q->nr_congestion_off; | 157 | return q->nr_congestion_off; |
169 | } | 158 | } |
170 | 159 | ||
160 | static inline int blk_cpu_to_group(int cpu) | ||
161 | { | ||
162 | int group = NR_CPUS; | ||
163 | #ifdef CONFIG_SCHED_MC | ||
164 | const struct cpumask *mask = cpu_coregroup_mask(cpu); | ||
165 | group = cpumask_first(mask); | ||
166 | #elif defined(CONFIG_SCHED_SMT) | ||
167 | group = cpumask_first(topology_thread_cpumask(cpu)); | ||
168 | #else | ||
169 | return cpu; | ||
170 | #endif | ||
171 | if (likely(group < NR_CPUS)) | ||
172 | return group; | ||
173 | return cpu; | ||
174 | } | ||
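The restored blk_cpu_to_group() maps a CPU to its core/SMT group so completion code can decide whether another CPU counts as "local enough". A one-line illustrative use; the wrapper below is hypothetical and not part of the patch:

	/* Hypothetical helper: is 'cpu' in the same cache/SMT group as this CPU? */
	static inline bool blk_cpu_is_local_group(int cpu)
	{
		return blk_cpu_to_group(cpu) == blk_cpu_to_group(smp_processor_id());
	}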
175 | |||
171 | /* | 176 | /* |
172 | * Contribute to IO statistics IFF: | 177 | * Contribute to IO statistics IFF: |
173 | * | 178 | * |
174 | * a) it's attached to a gendisk, and | 179 | * a) it's attached to a gendisk, and |
175 | * b) the queue had IO stats enabled when this request was started, and | 180 | * b) the queue had IO stats enabled when this request was started, and |
176 | * c) it's a file system request | 181 | * c) it's a file system request or a discard request |
177 | */ | 182 | */ |
178 | static inline int blk_do_io_stat(struct request *rq) | 183 | static inline int blk_do_io_stat(struct request *rq) |
179 | { | 184 | { |
180 | return rq->rq_disk && | 185 | return rq->rq_disk && |
181 | (rq->cmd_flags & REQ_IO_STAT) && | 186 | (rq->cmd_flags & REQ_IO_STAT) && |
182 | (rq->cmd_type == REQ_TYPE_FS); | 187 | (rq->cmd_type == REQ_TYPE_FS || |
183 | } | 188 | (rq->cmd_flags & REQ_DISCARD)); |
184 | |||
185 | /* | ||
186 | * Internal io_context interface | ||
187 | */ | ||
188 | void get_io_context(struct io_context *ioc); | ||
189 | struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q); | ||
190 | struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, | ||
191 | gfp_t gfp_mask); | ||
192 | void ioc_clear_queue(struct request_queue *q); | ||
193 | |||
194 | int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node); | ||
195 | |||
196 | /** | ||
197 | * create_io_context - try to create task->io_context | ||
198 | * @gfp_mask: allocation mask | ||
199 | * @node: allocation node | ||
200 | * | ||
201 | * If %current->io_context is %NULL, allocate a new io_context and install | ||
202 | * it. Returns the current %current->io_context which may be %NULL if | ||
203 | * allocation failed. | ||
204 | * | ||
205 | * Note that this function can't be called with IRQ disabled because | ||
206 | * task_lock which protects %current->io_context is IRQ-unsafe. | ||
207 | */ | ||
208 | static inline struct io_context *create_io_context(gfp_t gfp_mask, int node) | ||
209 | { | ||
210 | WARN_ON_ONCE(irqs_disabled()); | ||
211 | if (unlikely(!current->io_context)) | ||
212 | create_task_io_context(current, gfp_mask, node); | ||
213 | return current->io_context; | ||
214 | } | 189 | } |
215 | 190 | ||
216 | /* | 191 | #endif |
217 | * Internal throttling interface | ||
218 | */ | ||
219 | #ifdef CONFIG_BLK_DEV_THROTTLING | ||
220 | extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio); | ||
221 | extern void blk_throtl_drain(struct request_queue *q); | ||
222 | extern int blk_throtl_init(struct request_queue *q); | ||
223 | extern void blk_throtl_exit(struct request_queue *q); | ||
224 | #else /* CONFIG_BLK_DEV_THROTTLING */ | ||
225 | static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio) | ||
226 | { | ||
227 | return false; | ||
228 | } | ||
229 | static inline void blk_throtl_drain(struct request_queue *q) { } | ||
230 | static inline int blk_throtl_init(struct request_queue *q) { return 0; } | ||
231 | static inline void blk_throtl_exit(struct request_queue *q) { } | ||
232 | #endif /* CONFIG_BLK_DEV_THROTTLING */ | ||
233 | |||
234 | #endif /* BLK_INTERNAL_H */ | ||
diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 650f427d915..6690e6e4103 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c | |||
@@ -25,7 +25,7 @@ | |||
25 | #include <linux/delay.h> | 25 | #include <linux/delay.h> |
26 | #include <linux/scatterlist.h> | 26 | #include <linux/scatterlist.h> |
27 | #include <linux/bsg-lib.h> | 27 | #include <linux/bsg-lib.h> |
28 | #include <linux/export.h> | 28 | #include <linux/module.h> |
29 | #include <scsi/scsi_cmnd.h> | 29 | #include <scsi/scsi_cmnd.h> |
30 | 30 | ||
31 | /** | 31 | /** |
@@ -151,6 +151,19 @@ failjob_rls_job: | |||
151 | return -ENOMEM; | 151 | return -ENOMEM; |
152 | } | 152 | } |
153 | 153 | ||
154 | /* | ||
155 | * bsg_goose_queue - restart queue in case it was stopped | ||
156 | * @q: request q to be restarted | ||
157 | */ | ||
158 | void bsg_goose_queue(struct request_queue *q) | ||
159 | { | ||
160 | if (!q) | ||
161 | return; | ||
162 | |||
163 | blk_run_queue_async(q); | ||
164 | } | ||
165 | EXPORT_SYMBOL_GPL(bsg_goose_queue); | ||
166 | |||
154 | /** | 167 | /** |
155 | * bsg_request_fn - generic handler for bsg requests | 168 | * bsg_request_fn - generic handler for bsg requests |
156 | * @q: request queue to manage | 169 | * @q: request queue to manage |
@@ -230,3 +243,56 @@ int bsg_setup_queue(struct device *dev, struct request_queue *q, | |||
230 | return 0; | 243 | return 0; |
231 | } | 244 | } |
232 | EXPORT_SYMBOL_GPL(bsg_setup_queue); | 245 | EXPORT_SYMBOL_GPL(bsg_setup_queue); |
246 | |||
247 | /** | ||
248 | * bsg_remove_queue - Deletes the bsg dev from the q | ||
249 | * @q: the request_queue that is to be torn down. | ||
250 | * | ||
251 | * Notes: | ||
252 | * Before unregistering the queue, empty any requests that are blocked | ||
253 | */ | ||
254 | void bsg_remove_queue(struct request_queue *q) | ||
255 | { | ||
256 | struct request *req; /* block request */ | ||
257 | int counts; /* totals for request_list count and starved */ | ||
258 | |||
259 | if (!q) | ||
260 | return; | ||
261 | |||
262 | /* Stop taking in new requests */ | ||
263 | spin_lock_irq(q->queue_lock); | ||
264 | blk_stop_queue(q); | ||
265 | |||
266 | /* drain all requests in the queue */ | ||
267 | while (1) { | ||
268 | /* need the lock to fetch a request | ||
269 | * this may fetch the same request as the previous pass | ||
270 | */ | ||
271 | req = blk_fetch_request(q); | ||
272 | /* save requests in use and starved */ | ||
273 | counts = q->rq.count[0] + q->rq.count[1] + | ||
274 | q->rq.starved[0] + q->rq.starved[1]; | ||
275 | spin_unlock_irq(q->queue_lock); | ||
276 | /* any requests still outstanding? */ | ||
277 | if (counts == 0) | ||
278 | break; | ||
279 | |||
280 | /* This may be the same req as the previous iteration, | ||
281 | * so always call blk_end_request_all() after a fetch. | ||
282 | * The request must be ended here because the fetch | ||
283 | * above started it. | ||
284 | */ | ||
285 | if (req) { | ||
286 | /* return -ENXIO to indicate that this queue is | ||
287 | * going away | ||
288 | */ | ||
289 | req->errors = -ENXIO; | ||
290 | blk_end_request_all(req, -ENXIO); | ||
291 | } | ||
292 | |||
293 | msleep(200); /* allow bsg to possibly finish */ | ||
294 | spin_lock_irq(q->queue_lock); | ||
295 | } | ||
296 | bsg_unregister_queue(q); | ||
297 | } | ||
298 | EXPORT_SYMBOL_GPL(bsg_remove_queue); | ||
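bsg_remove_queue() is intended to be called by the transport driver during teardown, after which the queue itself can be released. A hedged sketch of such a teardown path; struct my_transport and its bsg_q field are illustrative only, and only the call ordering is the point:

	/* Illustrative teardown for a driver that owns a bsg request queue. */
	static void my_transport_remove(struct my_transport *t)
	{
		if (t->bsg_q) {
			bsg_remove_queue(t->bsg_q);	/* drain and unregister bsg */
			blk_cleanup_queue(t->bsg_q);	/* then drop the queue itself */
			t->bsg_q = NULL;
		}
	}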
diff --git a/block/bsg.c b/block/bsg.c index ff64ae3bace..702f1316bb8 100644 --- a/block/bsg.c +++ b/block/bsg.c | |||
@@ -769,10 +769,12 @@ static struct bsg_device *bsg_add_device(struct inode *inode, | |||
769 | struct file *file) | 769 | struct file *file) |
770 | { | 770 | { |
771 | struct bsg_device *bd; | 771 | struct bsg_device *bd; |
772 | int ret; | ||
772 | #ifdef BSG_DEBUG | 773 | #ifdef BSG_DEBUG |
773 | unsigned char buf[32]; | 774 | unsigned char buf[32]; |
774 | #endif | 775 | #endif |
775 | if (!blk_get_queue(rq)) | 776 | ret = blk_get_queue(rq); |
777 | if (ret) | ||
776 | return ERR_PTR(-ENXIO); | 778 | return ERR_PTR(-ENXIO); |
777 | 779 | ||
778 | bd = bsg_alloc_device(); | 780 | bd = bsg_alloc_device(); |
@@ -983,8 +985,7 @@ void bsg_unregister_queue(struct request_queue *q) | |||
983 | 985 | ||
984 | mutex_lock(&bsg_mutex); | 986 | mutex_lock(&bsg_mutex); |
985 | idr_remove(&bsg_minor_idr, bcd->minor); | 987 | idr_remove(&bsg_minor_idr, bcd->minor); |
986 | if (q->kobj.sd) | 988 | sysfs_remove_link(&q->kobj, "bsg"); |
987 | sysfs_remove_link(&q->kobj, "bsg"); | ||
988 | device_unregister(bcd->class_dev); | 989 | device_unregister(bcd->class_dev); |
989 | bcd->class_dev = NULL; | 990 | bcd->class_dev = NULL; |
990 | kref_put(&bcd->ref, bsg_kref_release_function); | 991 | kref_put(&bcd->ref, bsg_kref_release_function); |
@@ -1069,7 +1070,7 @@ EXPORT_SYMBOL_GPL(bsg_register_queue); | |||
1069 | 1070 | ||
1070 | static struct cdev bsg_cdev; | 1071 | static struct cdev bsg_cdev; |
1071 | 1072 | ||
1072 | static char *bsg_devnode(struct device *dev, umode_t *mode) | 1073 | static char *bsg_devnode(struct device *dev, mode_t *mode) |
1073 | { | 1074 | { |
1074 | return kasprintf(GFP_KERNEL, "bsg/%s", dev_name(dev)); | 1075 | return kasprintf(GFP_KERNEL, "bsg/%s", dev_name(dev)); |
1075 | } | 1076 | } |
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index e62e9205b80..4c12869fcf7 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c | |||
@@ -14,8 +14,7 @@ | |||
14 | #include <linux/rbtree.h> | 14 | #include <linux/rbtree.h> |
15 | #include <linux/ioprio.h> | 15 | #include <linux/ioprio.h> |
16 | #include <linux/blktrace_api.h> | 16 | #include <linux/blktrace_api.h> |
17 | #include "blk.h" | 17 | #include "cfq.h" |
18 | #include "blk-cgroup.h" | ||
19 | 18 | ||
20 | /* | 19 | /* |
21 | * tunables | 20 | * tunables |
@@ -54,11 +53,20 @@ static const int cfq_hist_divisor = 4; | |||
54 | #define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32) | 53 | #define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32) |
55 | #define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) | 54 | #define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) |
56 | 55 | ||
57 | #define RQ_CIC(rq) icq_to_cic((rq)->elv.icq) | 56 | #define RQ_CIC(rq) \ |
58 | #define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elv.priv[0]) | 57 | ((struct cfq_io_context *) (rq)->elevator_private[0]) |
59 | #define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elv.priv[1]) | 58 | #define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private[1]) |
59 | #define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private[2]) | ||
60 | 60 | ||
61 | static struct kmem_cache *cfq_pool; | 61 | static struct kmem_cache *cfq_pool; |
62 | static struct kmem_cache *cfq_ioc_pool; | ||
63 | |||
64 | static DEFINE_PER_CPU(unsigned long, cfq_ioc_count); | ||
65 | static struct completion *ioc_gone; | ||
66 | static DEFINE_SPINLOCK(ioc_gone_lock); | ||
67 | |||
68 | static DEFINE_SPINLOCK(cic_index_lock); | ||
69 | static DEFINE_IDA(cic_index_ida); | ||
62 | 70 | ||
63 | #define CFQ_PRIO_LISTS IOPRIO_BE_NR | 71 | #define CFQ_PRIO_LISTS IOPRIO_BE_NR |
64 | #define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) | 72 | #define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) |
@@ -67,14 +75,6 @@ static struct kmem_cache *cfq_pool; | |||
67 | #define sample_valid(samples) ((samples) > 80) | 75 | #define sample_valid(samples) ((samples) > 80) |
68 | #define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) | 76 | #define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) |
69 | 77 | ||
70 | struct cfq_ttime { | ||
71 | unsigned long last_end_request; | ||
72 | |||
73 | unsigned long ttime_total; | ||
74 | unsigned long ttime_samples; | ||
75 | unsigned long ttime_mean; | ||
76 | }; | ||
77 | |||
78 | /* | 78 | /* |
79 | * Most of our rbtree usage is for sorting with min extraction, so | 79 | * Most of our rbtree usage is for sorting with min extraction, so |
80 | * if we cache the leftmost node we don't have to walk down the tree | 80 | * if we cache the leftmost node we don't have to walk down the tree |
@@ -171,53 +171,8 @@ enum wl_type_t { | |||
171 | SYNC_WORKLOAD = 2 | 171 | SYNC_WORKLOAD = 2 |
172 | }; | 172 | }; |
173 | 173 | ||
174 | struct cfqg_stats { | ||
175 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
176 | /* total bytes transferred */ | ||
177 | struct blkg_rwstat service_bytes; | ||
178 | /* total IOs serviced, post merge */ | ||
179 | struct blkg_rwstat serviced; | ||
180 | /* number of ios merged */ | ||
181 | struct blkg_rwstat merged; | ||
182 | /* total time spent on device in ns, may not be accurate w/ queueing */ | ||
183 | struct blkg_rwstat service_time; | ||
184 | /* total time spent waiting in scheduler queue in ns */ | ||
185 | struct blkg_rwstat wait_time; | ||
186 | /* number of IOs queued up */ | ||
187 | struct blkg_rwstat queued; | ||
188 | /* total sectors transferred */ | ||
189 | struct blkg_stat sectors; | ||
190 | /* total disk time and nr sectors dispatched by this group */ | ||
191 | struct blkg_stat time; | ||
192 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
193 | /* time not charged to this cgroup */ | ||
194 | struct blkg_stat unaccounted_time; | ||
195 | /* sum of number of ios queued across all samples */ | ||
196 | struct blkg_stat avg_queue_size_sum; | ||
197 | /* count of samples taken for average */ | ||
198 | struct blkg_stat avg_queue_size_samples; | ||
199 | /* how many times this group has been removed from service tree */ | ||
200 | struct blkg_stat dequeue; | ||
201 | /* total time spent waiting for it to be assigned a timeslice. */ | ||
202 | struct blkg_stat group_wait_time; | ||
203 | /* time spent idling for this blkcg_gq */ | ||
204 | struct blkg_stat idle_time; | ||
205 | /* total time with empty current active q with other requests queued */ | ||
206 | struct blkg_stat empty_time; | ||
207 | /* fields after this shouldn't be cleared on stat reset */ | ||
208 | uint64_t start_group_wait_time; | ||
209 | uint64_t start_idle_time; | ||
210 | uint64_t start_empty_time; | ||
211 | uint16_t flags; | ||
212 | #endif /* CONFIG_DEBUG_BLK_CGROUP */ | ||
213 | #endif /* CONFIG_CFQ_GROUP_IOSCHED */ | ||
214 | }; | ||
215 | |||
216 | /* This is per cgroup per device grouping structure */ | 174 | /* This is per cgroup per device grouping structure */ |
217 | struct cfq_group { | 175 | struct cfq_group { |
218 | /* must be the first member */ | ||
219 | struct blkg_policy_data pd; | ||
220 | |||
221 | /* group service_tree member */ | 176 | /* group service_tree member */ |
222 | struct rb_node rb_node; | 177 | struct rb_node rb_node; |
223 | 178 | ||
@@ -225,7 +180,7 @@ struct cfq_group { | |||
225 | u64 vdisktime; | 180 | u64 vdisktime; |
226 | unsigned int weight; | 181 | unsigned int weight; |
227 | unsigned int new_weight; | 182 | unsigned int new_weight; |
228 | unsigned int dev_weight; | 183 | bool needs_update; |
229 | 184 | ||
230 | /* number of cfqq currently on this group */ | 185 | /* number of cfqq currently on this group */ |
231 | int nr_cfqq; | 186 | int nr_cfqq; |
@@ -251,21 +206,14 @@ struct cfq_group { | |||
251 | unsigned long saved_workload_slice; | 206 | unsigned long saved_workload_slice; |
252 | enum wl_type_t saved_workload; | 207 | enum wl_type_t saved_workload; |
253 | enum wl_prio_t saved_serving_prio; | 208 | enum wl_prio_t saved_serving_prio; |
254 | 209 | struct blkio_group blkg; | |
210 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
211 | struct hlist_node cfqd_node; | ||
212 | int ref; | ||
213 | #endif | ||
255 | /* number of requests that are on the dispatch list or inside driver */ | 214 | /* number of requests that are on the dispatch list or inside driver */ |
256 | int dispatched; | 215 | int dispatched; |
257 | struct cfq_ttime ttime; | 216 | struct cfq_ttime ttime; |
258 | struct cfqg_stats stats; | ||
259 | }; | ||
260 | |||
261 | struct cfq_io_cq { | ||
262 | struct io_cq icq; /* must be the first member */ | ||
263 | struct cfq_queue *cfqq[2]; | ||
264 | struct cfq_ttime ttime; | ||
265 | int ioprio; /* the current ioprio */ | ||
266 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
267 | uint64_t blkcg_id; /* the current blkcg ID */ | ||
268 | #endif | ||
269 | }; | 217 | }; |
270 | 218 | ||
271 | /* | 219 | /* |
@@ -275,7 +223,7 @@ struct cfq_data { | |||
275 | struct request_queue *queue; | 223 | struct request_queue *queue; |
276 | /* Root service tree for cfq_groups */ | 224 | /* Root service tree for cfq_groups */ |
277 | struct cfq_rb_root grp_service_tree; | 225 | struct cfq_rb_root grp_service_tree; |
278 | struct cfq_group *root_group; | 226 | struct cfq_group root_group; |
279 | 227 | ||
280 | /* | 228 | /* |
281 | * The priority currently being served | 229 | * The priority currently being served |
@@ -319,7 +267,7 @@ struct cfq_data { | |||
319 | struct work_struct unplug_work; | 267 | struct work_struct unplug_work; |
320 | 268 | ||
321 | struct cfq_queue *active_queue; | 269 | struct cfq_queue *active_queue; |
322 | struct cfq_io_cq *active_cic; | 270 | struct cfq_io_context *active_cic; |
323 | 271 | ||
324 | /* | 272 | /* |
325 | * async queue for each priority case | 273 | * async queue for each priority case |
@@ -341,7 +289,9 @@ struct cfq_data { | |||
341 | unsigned int cfq_slice_idle; | 289 | unsigned int cfq_slice_idle; |
342 | unsigned int cfq_group_idle; | 290 | unsigned int cfq_group_idle; |
343 | unsigned int cfq_latency; | 291 | unsigned int cfq_latency; |
344 | unsigned int cfq_target_latency; | 292 | |
293 | unsigned int cic_index; | ||
294 | struct list_head cic_list; | ||
345 | 295 | ||
346 | /* | 296 | /* |
347 | * Fallback dummy cfqq for extreme OOM conditions | 297 | * Fallback dummy cfqq for extreme OOM conditions |
@@ -349,6 +299,12 @@ struct cfq_data { | |||
349 | struct cfq_queue oom_cfqq; | 299 | struct cfq_queue oom_cfqq; |
350 | 300 | ||
351 | unsigned long last_delayed_sync; | 301 | unsigned long last_delayed_sync; |
302 | |||
303 | /* List of cfq groups being managed on this device*/ | ||
304 | struct hlist_head cfqg_list; | ||
305 | |||
306 | /* Number of groups which are on blkcg->blkg_list */ | ||
307 | unsigned int nr_blkcg_linked_grps; | ||
352 | }; | 308 | }; |
353 | 309 | ||
354 | static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); | 310 | static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); |
@@ -411,286 +367,21 @@ CFQ_CFQQ_FNS(deep); | |||
411 | CFQ_CFQQ_FNS(wait_busy); | 367 | CFQ_CFQQ_FNS(wait_busy); |
412 | #undef CFQ_CFQQ_FNS | 368 | #undef CFQ_CFQQ_FNS |
413 | 369 | ||
414 | static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd) | ||
415 | { | ||
416 | return pd ? container_of(pd, struct cfq_group, pd) : NULL; | ||
417 | } | ||
418 | |||
419 | static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg) | ||
420 | { | ||
421 | return pd_to_blkg(&cfqg->pd); | ||
422 | } | ||
423 | |||
424 | #if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) | ||
425 | |||
426 | /* cfqg stats flags */ | ||
427 | enum cfqg_stats_flags { | ||
428 | CFQG_stats_waiting = 0, | ||
429 | CFQG_stats_idling, | ||
430 | CFQG_stats_empty, | ||
431 | }; | ||
432 | |||
433 | #define CFQG_FLAG_FNS(name) \ | ||
434 | static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats) \ | ||
435 | { \ | ||
436 | stats->flags |= (1 << CFQG_stats_##name); \ | ||
437 | } \ | ||
438 | static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats) \ | ||
439 | { \ | ||
440 | stats->flags &= ~(1 << CFQG_stats_##name); \ | ||
441 | } \ | ||
442 | static inline int cfqg_stats_##name(struct cfqg_stats *stats) \ | ||
443 | { \ | ||
444 | return (stats->flags & (1 << CFQG_stats_##name)) != 0; \ | ||
445 | } \ | ||
446 | |||
447 | CFQG_FLAG_FNS(waiting) | ||
448 | CFQG_FLAG_FNS(idling) | ||
449 | CFQG_FLAG_FNS(empty) | ||
450 | #undef CFQG_FLAG_FNS | ||
451 | |||
452 | /* This should be called with the queue_lock held. */ | ||
453 | static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats) | ||
454 | { | ||
455 | unsigned long long now; | ||
456 | |||
457 | if (!cfqg_stats_waiting(stats)) | ||
458 | return; | ||
459 | |||
460 | now = sched_clock(); | ||
461 | if (time_after64(now, stats->start_group_wait_time)) | ||
462 | blkg_stat_add(&stats->group_wait_time, | ||
463 | now - stats->start_group_wait_time); | ||
464 | cfqg_stats_clear_waiting(stats); | ||
465 | } | ||
466 | |||
467 | /* This should be called with the queue_lock held. */ | ||
468 | static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, | ||
469 | struct cfq_group *curr_cfqg) | ||
470 | { | ||
471 | struct cfqg_stats *stats = &cfqg->stats; | ||
472 | |||
473 | if (cfqg_stats_waiting(stats)) | ||
474 | return; | ||
475 | if (cfqg == curr_cfqg) | ||
476 | return; | ||
477 | stats->start_group_wait_time = sched_clock(); | ||
478 | cfqg_stats_mark_waiting(stats); | ||
479 | } | ||
480 | |||
481 | /* This should be called with the queue_lock held. */ | ||
482 | static void cfqg_stats_end_empty_time(struct cfqg_stats *stats) | ||
483 | { | ||
484 | unsigned long long now; | ||
485 | |||
486 | if (!cfqg_stats_empty(stats)) | ||
487 | return; | ||
488 | |||
489 | now = sched_clock(); | ||
490 | if (time_after64(now, stats->start_empty_time)) | ||
491 | blkg_stat_add(&stats->empty_time, | ||
492 | now - stats->start_empty_time); | ||
493 | cfqg_stats_clear_empty(stats); | ||
494 | } | ||
495 | |||
496 | static void cfqg_stats_update_dequeue(struct cfq_group *cfqg) | ||
497 | { | ||
498 | blkg_stat_add(&cfqg->stats.dequeue, 1); | ||
499 | } | ||
500 | |||
501 | static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) | ||
502 | { | ||
503 | struct cfqg_stats *stats = &cfqg->stats; | ||
504 | |||
505 | if (blkg_rwstat_sum(&stats->queued)) | ||
506 | return; | ||
507 | |||
508 | /* | ||
509 | * group is already marked empty. This can happen if cfqq got new | ||
510 | * request in parent group and moved to this group while being added | ||
511 | * to service tree. Just ignore the event and move on. | ||
512 | */ | ||
513 | if (cfqg_stats_empty(stats)) | ||
514 | return; | ||
515 | |||
516 | stats->start_empty_time = sched_clock(); | ||
517 | cfqg_stats_mark_empty(stats); | ||
518 | } | ||
519 | |||
520 | static void cfqg_stats_update_idle_time(struct cfq_group *cfqg) | ||
521 | { | ||
522 | struct cfqg_stats *stats = &cfqg->stats; | ||
523 | |||
524 | if (cfqg_stats_idling(stats)) { | ||
525 | unsigned long long now = sched_clock(); | ||
526 | |||
527 | if (time_after64(now, stats->start_idle_time)) | ||
528 | blkg_stat_add(&stats->idle_time, | ||
529 | now - stats->start_idle_time); | ||
530 | cfqg_stats_clear_idling(stats); | ||
531 | } | ||
532 | } | ||
533 | |||
534 | static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) | ||
535 | { | ||
536 | struct cfqg_stats *stats = &cfqg->stats; | ||
537 | |||
538 | BUG_ON(cfqg_stats_idling(stats)); | ||
539 | |||
540 | stats->start_idle_time = sched_clock(); | ||
541 | cfqg_stats_mark_idling(stats); | ||
542 | } | ||
543 | |||
544 | static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) | ||
545 | { | ||
546 | struct cfqg_stats *stats = &cfqg->stats; | ||
547 | |||
548 | blkg_stat_add(&stats->avg_queue_size_sum, | ||
549 | blkg_rwstat_sum(&stats->queued)); | ||
550 | blkg_stat_add(&stats->avg_queue_size_samples, 1); | ||
551 | cfqg_stats_update_group_wait_time(stats); | ||
552 | } | ||
553 | |||
554 | #else /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ | ||
555 | |||
556 | static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, struct cfq_group *curr_cfqg) { } | ||
557 | static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { } | ||
558 | static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { } | ||
559 | static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { } | ||
560 | static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { } | ||
561 | static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { } | ||
562 | static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { } | ||
563 | |||
564 | #endif /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ | ||
565 | |||
566 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | 370 | #ifdef CONFIG_CFQ_GROUP_IOSCHED |
567 | 371 | #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ | |
568 | static struct blkcg_policy blkcg_policy_cfq; | ||
569 | |||
570 | static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg) | ||
571 | { | ||
572 | return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq)); | ||
573 | } | ||
574 | |||
575 | static inline void cfqg_get(struct cfq_group *cfqg) | ||
576 | { | ||
577 | return blkg_get(cfqg_to_blkg(cfqg)); | ||
578 | } | ||
579 | |||
580 | static inline void cfqg_put(struct cfq_group *cfqg) | ||
581 | { | ||
582 | return blkg_put(cfqg_to_blkg(cfqg)); | ||
583 | } | ||
584 | |||
585 | #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) do { \ | ||
586 | char __pbuf[128]; \ | ||
587 | \ | ||
588 | blkg_path(cfqg_to_blkg((cfqq)->cfqg), __pbuf, sizeof(__pbuf)); \ | ||
589 | blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ | 372 | blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ |
590 | cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ | 373 | cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ |
591 | __pbuf, ##args); \ | 374 | blkg_path(&(cfqq)->cfqg->blkg), ##args) |
592 | } while (0) | ||
593 | |||
594 | #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do { \ | ||
595 | char __pbuf[128]; \ | ||
596 | \ | ||
597 | blkg_path(cfqg_to_blkg(cfqg), __pbuf, sizeof(__pbuf)); \ | ||
598 | blk_add_trace_msg((cfqd)->queue, "%s " fmt, __pbuf, ##args); \ | ||
599 | } while (0) | ||
600 | |||
601 | static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, | ||
602 | struct cfq_group *curr_cfqg, int rw) | ||
603 | { | ||
604 | blkg_rwstat_add(&cfqg->stats.queued, rw, 1); | ||
605 | cfqg_stats_end_empty_time(&cfqg->stats); | ||
606 | cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg); | ||
607 | } | ||
608 | 375 | ||
609 | static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, | 376 | #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \ |
610 | unsigned long time, unsigned long unaccounted_time) | 377 | blk_add_trace_msg((cfqd)->queue, "%s " fmt, \ |
611 | { | 378 | blkg_path(&(cfqg)->blkg), ##args) \ |
612 | blkg_stat_add(&cfqg->stats.time, time); | ||
613 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
614 | blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time); | ||
615 | #endif | ||
616 | } | ||
617 | |||
618 | static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) | ||
619 | { | ||
620 | blkg_rwstat_add(&cfqg->stats.queued, rw, -1); | ||
621 | } | ||
622 | |||
623 | static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) | ||
624 | { | ||
625 | blkg_rwstat_add(&cfqg->stats.merged, rw, 1); | ||
626 | } | ||
627 | |||
628 | static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg, | ||
629 | uint64_t bytes, int rw) | ||
630 | { | ||
631 | blkg_stat_add(&cfqg->stats.sectors, bytes >> 9); | ||
632 | blkg_rwstat_add(&cfqg->stats.serviced, rw, 1); | ||
633 | blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes); | ||
634 | } | ||
635 | |||
636 | static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, | ||
637 | uint64_t start_time, uint64_t io_start_time, int rw) | ||
638 | { | ||
639 | struct cfqg_stats *stats = &cfqg->stats; | ||
640 | unsigned long long now = sched_clock(); | ||
641 | |||
642 | if (time_after64(now, io_start_time)) | ||
643 | blkg_rwstat_add(&stats->service_time, rw, now - io_start_time); | ||
644 | if (time_after64(io_start_time, start_time)) | ||
645 | blkg_rwstat_add(&stats->wait_time, rw, | ||
646 | io_start_time - start_time); | ||
647 | } | ||
648 | |||
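cfqg_stats_update_completion splits a request's total latency into two buckets from three timestamps: wait_time is io_start_time - start_time (time spent queued in the scheduler) and service_time is now - io_start_time (time spent at the device), with the time_after64() checks guarding against the clock going backwards. A tiny sketch of that arithmetic with made-up nanosecond timestamps:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* illustrative nanosecond timestamps, not real measurements */
    uint64_t start_time    = 1000000;  /* request entered the elevator */
    uint64_t io_start_time = 4000000;  /* request dispatched to the device */
    uint64_t now           = 9000000;  /* completion time */

    uint64_t wait_time    = io_start_time > start_time ? io_start_time - start_time : 0;
    uint64_t service_time = now > io_start_time ? now - io_start_time : 0;

    printf("wait_time    = %llu ns\n", (unsigned long long)wait_time);    /* 3000000 */
    printf("service_time = %llu ns\n", (unsigned long long)service_time); /* 5000000 */
    return 0;
}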
649 | static void cfq_pd_reset_stats(struct blkcg_gq *blkg) | ||
650 | { | ||
651 | struct cfq_group *cfqg = blkg_to_cfqg(blkg); | ||
652 | struct cfqg_stats *stats = &cfqg->stats; | ||
653 | |||
654 | /* queued stats shouldn't be cleared */ | ||
655 | blkg_rwstat_reset(&stats->service_bytes); | ||
656 | blkg_rwstat_reset(&stats->serviced); | ||
657 | blkg_rwstat_reset(&stats->merged); | ||
658 | blkg_rwstat_reset(&stats->service_time); | ||
659 | blkg_rwstat_reset(&stats->wait_time); | ||
660 | blkg_stat_reset(&stats->time); | ||
661 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
662 | blkg_stat_reset(&stats->unaccounted_time); | ||
663 | blkg_stat_reset(&stats->avg_queue_size_sum); | ||
664 | blkg_stat_reset(&stats->avg_queue_size_samples); | ||
665 | blkg_stat_reset(&stats->dequeue); | ||
666 | blkg_stat_reset(&stats->group_wait_time); | ||
667 | blkg_stat_reset(&stats->idle_time); | ||
668 | blkg_stat_reset(&stats->empty_time); | ||
669 | #endif | ||
670 | } | ||
671 | |||
672 | #else /* CONFIG_CFQ_GROUP_IOSCHED */ | ||
673 | |||
674 | static inline void cfqg_get(struct cfq_group *cfqg) { } | ||
675 | static inline void cfqg_put(struct cfq_group *cfqg) { } | ||
676 | 379 | ||
380 | #else | ||
677 | #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ | 381 | #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ |
678 | blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) | 382 | blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) |
679 | #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0) | 383 | #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0) |
680 | 384 | #endif | |
681 | static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, | ||
682 | struct cfq_group *curr_cfqg, int rw) { } | ||
683 | static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, | ||
684 | unsigned long time, unsigned long unaccounted_time) { } | ||
685 | static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { } | ||
686 | static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { } | ||
687 | static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg, | ||
688 | uint64_t bytes, int rw) { } | ||
689 | static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, | ||
690 | uint64_t start_time, uint64_t io_start_time, int rw) { } | ||
691 | |||
692 | #endif /* CONFIG_CFQ_GROUP_IOSCHED */ | ||
693 | |||
694 | #define cfq_log(cfqd, fmt, args...) \ | 385 | #define cfq_log(cfqd, fmt, args...) \ |
695 | blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) | 386 | blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) |
696 | 387 | ||
@@ -771,38 +462,39 @@ static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, | |||
771 | } | 462 | } |
772 | 463 | ||
773 | static void cfq_dispatch_insert(struct request_queue *, struct request *); | 464 | static void cfq_dispatch_insert(struct request_queue *, struct request *); |
774 | static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync, | 465 | static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool, |
775 | struct cfq_io_cq *cic, struct bio *bio, | 466 | struct io_context *, gfp_t); |
776 | gfp_t gfp_mask); | 467 | static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *, |
468 | struct io_context *); | ||
777 | 469 | ||
778 | static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq) | 470 | static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic, |
471 | bool is_sync) | ||
779 | { | 472 | { |
780 | /* cic->icq is the first member, %NULL will convert to %NULL */ | 473 | return cic->cfqq[is_sync]; |
781 | return container_of(icq, struct cfq_io_cq, icq); | ||
782 | } | 474 | } |
783 | 475 | ||
784 | static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd, | 476 | static inline void cic_set_cfqq(struct cfq_io_context *cic, |
785 | struct io_context *ioc) | 477 | struct cfq_queue *cfqq, bool is_sync) |
786 | { | 478 | { |
787 | if (ioc) | 479 | cic->cfqq[is_sync] = cfqq; |
788 | return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue)); | ||
789 | return NULL; | ||
790 | } | 480 | } |
791 | 481 | ||
792 | static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync) | 482 | #define CIC_DEAD_KEY 1ul |
793 | { | 483 | #define CIC_DEAD_INDEX_SHIFT 1 |
794 | return cic->cfqq[is_sync]; | ||
795 | } | ||
796 | 484 | ||
797 | static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq, | 485 | static inline void *cfqd_dead_key(struct cfq_data *cfqd) |
798 | bool is_sync) | ||
799 | { | 486 | { |
800 | cic->cfqq[is_sync] = cfqq; | 487 | return (void *)(cfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY); |
801 | } | 488 | } |
802 | 489 | ||
803 | static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic) | 490 | static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic) |
804 | { | 491 | { |
805 | return cic->icq.q->elevator->elevator_data; | 492 | struct cfq_data *cfqd = cic->key; |
493 | |||
494 | if (unlikely((unsigned long) cfqd & CIC_DEAD_KEY)) | ||
495 | return NULL; | ||
496 | |||
497 | return cfqd; | ||
806 | } | 498 | } |
807 | 499 | ||
808 | /* | 500 | /* |
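The CIC_DEAD_KEY scheme restored on the right-hand side is a pointer-tagging trick: cic->key normally holds a cfq_data pointer, but once the queue is gone it is overwritten with the cic_index shifted up by one and the low bit set, so cic_to_cfqd() can tell a dead key from a live pointer by testing bit 0 (this relies on cfq_data pointers being at least 2-byte aligned, which holds for ordinary kmalloc'd structs). A self-contained sketch of the encode/decode, with the two constants copied from the diff and the helper names invented:

#include <assert.h>
#include <stdio.h>

#define CIC_DEAD_KEY          1ul
#define CIC_DEAD_INDEX_SHIFT  1

/* encode an index as a "dead" key: low bit set, index in the upper bits */
static void *dead_key(unsigned long cic_index)
{
    return (void *)(cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY);
}

static int key_is_dead(void *key)
{
    return ((unsigned long)key & CIC_DEAD_KEY) != 0;
}

static unsigned long dead_key_to_index(void *key)
{
    return (unsigned long)key >> CIC_DEAD_INDEX_SHIFT;
}

int main(void)
{
    void *key = dead_key(42);

    assert(key_is_dead(key));
    assert(dead_key_to_index(key) == 42);
    printf("dead key for index 42: %p\n", key);
    return 0;
}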
@@ -851,7 +543,7 @@ static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg) | |||
851 | { | 543 | { |
852 | u64 d = delta << CFQ_SERVICE_SHIFT; | 544 | u64 d = delta << CFQ_SERVICE_SHIFT; |
853 | 545 | ||
854 | d = d * CFQ_WEIGHT_DEFAULT; | 546 | d = d * BLKIO_WEIGHT_DEFAULT; |
855 | do_div(d, cfqg->weight); | 547 | do_div(d, cfqg->weight); |
856 | return d; | 548 | return d; |
857 | } | 549 | } |
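cfq_scale_slice charges a group vdisktime in inverse proportion to its weight: the used slice is shifted up by CFQ_SERVICE_SHIFT for precision, multiplied by the default weight and divided by the group's own weight, so a group with twice the default weight advances its vdisktime half as fast for the same service. A rough userspace check of the arithmetic; the shift of 12 and the default weight of 500 are what I believe this kernel uses, so treat them as assumptions:

#include <stdint.h>
#include <stdio.h>

/* assumed values; check the source tree you are reading */
#define CFQ_SERVICE_SHIFT  12
#define WEIGHT_DEFAULT     500

static uint64_t scale_slice(unsigned long delta, unsigned int weight)
{
    uint64_t d = (uint64_t)delta << CFQ_SERVICE_SHIFT;

    d = d * WEIGHT_DEFAULT;
    return d / weight;    /* stand-in for do_div() */
}

int main(void)
{
    /* same used slice, different weights: the heavier group is charged less vtime */
    printf("weight  500: %llu\n", (unsigned long long)scale_slice(100, 500));   /* 409600 */
    printf("weight 1000: %llu\n", (unsigned long long)scale_slice(100, 1000));  /* 204800 */
    return 0;
}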
@@ -911,7 +603,7 @@ cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg) | |||
911 | { | 603 | { |
912 | struct cfq_rb_root *st = &cfqd->grp_service_tree; | 604 | struct cfq_rb_root *st = &cfqd->grp_service_tree; |
913 | 605 | ||
914 | return cfqd->cfq_target_latency * cfqg->weight / st->total_weight; | 606 | return cfq_target_latency * cfqg->weight / st->total_weight; |
915 | } | 607 | } |
916 | 608 | ||
917 | static inline unsigned | 609 | static inline unsigned |
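cfq_group_slice above hands each group a share of the target latency window proportional to its weight on the group service tree. As a worked example, assuming the usual 300 ms cfq target latency, a group holding weight 500 out of a total weight of 1500 is given a 100 ms slice:

#include <stdio.h>

int main(void)
{
    unsigned int target_latency_ms = 300;   /* assumed cfq default */
    unsigned int group_weight      = 500;
    unsigned int total_weight      = 1500;  /* e.g. three default-weight groups */

    unsigned int slice_ms = target_latency_ms * group_weight / total_weight;

    printf("group slice: %u ms\n", slice_ms);  /* 100 ms */
    return 0;
}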
@@ -1178,9 +870,9 @@ static void | |||
1178 | cfq_update_group_weight(struct cfq_group *cfqg) | 870 | cfq_update_group_weight(struct cfq_group *cfqg) |
1179 | { | 871 | { |
1180 | BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); | 872 | BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); |
1181 | if (cfqg->new_weight) { | 873 | if (cfqg->needs_update) { |
1182 | cfqg->weight = cfqg->new_weight; | 874 | cfqg->weight = cfqg->new_weight; |
1183 | cfqg->new_weight = 0; | 875 | cfqg->needs_update = false; |
1184 | } | 876 | } |
1185 | } | 877 | } |
1186 | 878 | ||
@@ -1242,7 +934,7 @@ cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg) | |||
1242 | cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); | 934 | cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); |
1243 | cfq_group_service_tree_del(st, cfqg); | 935 | cfq_group_service_tree_del(st, cfqg); |
1244 | cfqg->saved_workload_slice = 0; | 936 | cfqg->saved_workload_slice = 0; |
1245 | cfqg_stats_update_dequeue(cfqg); | 937 | cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1); |
1246 | } | 938 | } |
1247 | 939 | ||
1248 | static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq, | 940 | static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq, |
@@ -1314,59 +1006,178 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, | |||
1314 | "sl_used=%u disp=%u charge=%u iops=%u sect=%lu", | 1006 | "sl_used=%u disp=%u charge=%u iops=%u sect=%lu", |
1315 | used_sl, cfqq->slice_dispatch, charge, | 1007 | used_sl, cfqq->slice_dispatch, charge, |
1316 | iops_mode(cfqd), cfqq->nr_sectors); | 1008 | iops_mode(cfqd), cfqq->nr_sectors); |
1317 | cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl); | 1009 | cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl, |
1318 | cfqg_stats_set_start_empty_time(cfqg); | 1010 | unaccounted_sl); |
1011 | cfq_blkiocg_set_start_empty_time(&cfqg->blkg); | ||
1319 | } | 1012 | } |
1320 | 1013 | ||
1321 | /** | 1014 | #ifdef CONFIG_CFQ_GROUP_IOSCHED |
1322 | * cfq_init_cfqg_base - initialize base part of a cfq_group | 1015 | static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) |
1323 | * @cfqg: cfq_group to initialize | 1016 | { |
1324 | * | 1017 | if (blkg) |
1325 | * Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED | 1018 | return container_of(blkg, struct cfq_group, blkg); |
1326 | * is enabled or not. | 1019 | return NULL; |
1020 | } | ||
1021 | |||
1022 | static void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, | ||
1023 | unsigned int weight) | ||
1024 | { | ||
1025 | struct cfq_group *cfqg = cfqg_of_blkg(blkg); | ||
1026 | cfqg->new_weight = weight; | ||
1027 | cfqg->needs_update = true; | ||
1028 | } | ||
1029 | |||
1030 | static void cfq_init_add_cfqg_lists(struct cfq_data *cfqd, | ||
1031 | struct cfq_group *cfqg, struct blkio_cgroup *blkcg) | ||
1032 | { | ||
1033 | struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; | ||
1034 | unsigned int major, minor; | ||
1035 | |||
1036 | /* | ||
1037 | * Add group onto cgroup list. It might happen that bdi->dev is | ||
1038 | * not initialized yet. Initialize this new group without major | ||
1039 | * and minor info and this info will be filled in once a new thread | ||
1040 | * comes for IO. | ||
1041 | */ | ||
1042 | if (bdi->dev) { | ||
1043 | sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); | ||
1044 | cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, | ||
1045 | (void *)cfqd, MKDEV(major, minor)); | ||
1046 | } else | ||
1047 | cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, | ||
1048 | (void *)cfqd, 0); | ||
1049 | |||
1050 | cfqd->nr_blkcg_linked_grps++; | ||
1051 | cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); | ||
1052 | |||
1053 | /* Add group on cfqd list */ | ||
1054 | hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); | ||
1055 | } | ||
1056 | |||
1057 | /* | ||
1058 | * Should be called from sleepable context. No request queue lock as per | ||
1059 | * cpu stats are allocated dynamically and alloc_percpu needs to be called | ||
1060 | * from sleepable context. | ||
1327 | */ | 1061 | */ |
1328 | static void cfq_init_cfqg_base(struct cfq_group *cfqg) | 1062 | static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd) |
1329 | { | 1063 | { |
1064 | struct cfq_group *cfqg = NULL; | ||
1065 | int i, j, ret; | ||
1330 | struct cfq_rb_root *st; | 1066 | struct cfq_rb_root *st; |
1331 | int i, j; | 1067 | |
1068 | cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); | ||
1069 | if (!cfqg) | ||
1070 | return NULL; | ||
1332 | 1071 | ||
1333 | for_each_cfqg_st(cfqg, i, j, st) | 1072 | for_each_cfqg_st(cfqg, i, j, st) |
1334 | *st = CFQ_RB_ROOT; | 1073 | *st = CFQ_RB_ROOT; |
1335 | RB_CLEAR_NODE(&cfqg->rb_node); | 1074 | RB_CLEAR_NODE(&cfqg->rb_node); |
1336 | 1075 | ||
1337 | cfqg->ttime.last_end_request = jiffies; | 1076 | cfqg->ttime.last_end_request = jiffies; |
1077 | |||
1078 | /* | ||
1079 | * Take the initial reference that will be released on destroy | ||
1080 | * This can be thought of as a joint reference by cgroup and | ||
1081 | * elevator which will be dropped by either elevator exit | ||
1082 | * or cgroup deletion path depending on who is exiting first. | ||
1083 | */ | ||
1084 | cfqg->ref = 1; | ||
1085 | |||
1086 | ret = blkio_alloc_blkg_stats(&cfqg->blkg); | ||
1087 | if (ret) { | ||
1088 | kfree(cfqg); | ||
1089 | return NULL; | ||
1090 | } | ||
1091 | |||
1092 | return cfqg; | ||
1338 | } | 1093 | } |
1339 | 1094 | ||
1340 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | 1095 | static struct cfq_group * |
1341 | static void cfq_pd_init(struct blkcg_gq *blkg) | 1096 | cfq_find_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg) |
1342 | { | 1097 | { |
1343 | struct cfq_group *cfqg = blkg_to_cfqg(blkg); | 1098 | struct cfq_group *cfqg = NULL; |
1099 | void *key = cfqd; | ||
1100 | struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; | ||
1101 | unsigned int major, minor; | ||
1344 | 1102 | ||
1345 | cfq_init_cfqg_base(cfqg); | 1103 | /* |
1346 | cfqg->weight = blkg->blkcg->cfq_weight; | 1104 | * This is the common case when there are no blkio cgroups. |
1105 | * Avoid lookup in this case | ||
1106 | */ | ||
1107 | if (blkcg == &blkio_root_cgroup) | ||
1108 | cfqg = &cfqd->root_group; | ||
1109 | else | ||
1110 | cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); | ||
1111 | |||
1112 | if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { | ||
1113 | sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); | ||
1114 | cfqg->blkg.dev = MKDEV(major, minor); | ||
1115 | } | ||
1116 | |||
1117 | return cfqg; | ||
1347 | } | 1118 | } |
1348 | 1119 | ||
1349 | /* | 1120 | /* |
1350 | * Search for the cfq group current task belongs to. request_queue lock must | 1121 | * Search for the cfq group current task belongs to. request_queue lock must |
1351 | * be held. | 1122 | * be held. |
1352 | */ | 1123 | */ |
1353 | static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, | 1124 | static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd) |
1354 | struct blkcg *blkcg) | ||
1355 | { | 1125 | { |
1126 | struct blkio_cgroup *blkcg; | ||
1127 | struct cfq_group *cfqg = NULL, *__cfqg = NULL; | ||
1356 | struct request_queue *q = cfqd->queue; | 1128 | struct request_queue *q = cfqd->queue; |
1357 | struct cfq_group *cfqg = NULL; | ||
1358 | 1129 | ||
1359 | /* avoid lookup for the common case where there's no blkcg */ | 1130 | rcu_read_lock(); |
1360 | if (blkcg == &blkcg_root) { | 1131 | blkcg = task_blkio_cgroup(current); |
1361 | cfqg = cfqd->root_group; | 1132 | cfqg = cfq_find_cfqg(cfqd, blkcg); |
1362 | } else { | 1133 | if (cfqg) { |
1363 | struct blkcg_gq *blkg; | 1134 | rcu_read_unlock(); |
1135 | return cfqg; | ||
1136 | } | ||
1137 | |||
1138 | /* | ||
1139 | * Need to allocate a group. Allocation of group also needs allocation | ||
1140 | * of per cpu stats which in-turn takes a mutex() and can block. Hence | ||
1141 | * we need to drop rcu lock and queue_lock before we call alloc. | ||
1142 | * | ||
1143 | * Not taking any queue reference here and assuming that queue is | ||
1144 | * around by the time we return. CFQ queue allocation code does | ||
1145 | * the same. It might be racy though. | ||
1146 | */ | ||
1147 | |||
1148 | rcu_read_unlock(); | ||
1149 | spin_unlock_irq(q->queue_lock); | ||
1150 | |||
1151 | cfqg = cfq_alloc_cfqg(cfqd); | ||
1364 | 1152 | ||
1365 | blkg = blkg_lookup_create(blkcg, q); | 1153 | spin_lock_irq(q->queue_lock); |
1366 | if (!IS_ERR(blkg)) | 1154 | |
1367 | cfqg = blkg_to_cfqg(blkg); | 1155 | rcu_read_lock(); |
1156 | blkcg = task_blkio_cgroup(current); | ||
1157 | |||
1158 | /* | ||
1159 | * If some other thread already allocated the group while we were | ||
1160 | * not holding queue lock, free up the group | ||
1161 | */ | ||
1162 | __cfqg = cfq_find_cfqg(cfqd, blkcg); | ||
1163 | |||
1164 | if (__cfqg) { | ||
1165 | kfree(cfqg); | ||
1166 | rcu_read_unlock(); | ||
1167 | return __cfqg; | ||
1368 | } | 1168 | } |
1369 | 1169 | ||
1170 | if (!cfqg) | ||
1171 | cfqg = &cfqd->root_group; | ||
1172 | |||
1173 | cfq_init_add_cfqg_lists(cfqd, cfqg, blkcg); | ||
1174 | rcu_read_unlock(); | ||
1175 | return cfqg; | ||
1176 | } | ||
1177 | |||
1178 | static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) | ||
1179 | { | ||
1180 | cfqg->ref++; | ||
1370 | return cfqg; | 1181 | return cfqg; |
1371 | } | 1182 | } |
1372 | 1183 | ||
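The cfq_get_cfqg() path being restored here is the classic unlock-allocate-relock-recheck shape: the per-cpu stats allocation can sleep, so the group cannot be allocated under queue_lock; after the lock (and the rcu read lock) are reacquired, the group is looked up again and the freshly allocated copy is thrown away if another thread won the race. A hedged userspace sketch of the same shape with a pthread mutex and malloc, all names invented for illustration:

#include <pthread.h>
#include <stdlib.h>

struct group {
    int id;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct group *cached;    /* stand-in for the per-cgroup lookup */

static struct group *lookup_group(void)
{
    return cached;              /* must be called with 'lock' held */
}

static struct group *get_group(void)
{
    struct group *grp, *other;

    pthread_mutex_lock(&lock);
    grp = lookup_group();
    if (grp) {
        pthread_mutex_unlock(&lock);
        return grp;
    }

    /* the allocation may block, so drop the lock first */
    pthread_mutex_unlock(&lock);
    grp = malloc(sizeof(*grp));

    pthread_mutex_lock(&lock);
    /* somebody may have installed a group while we slept */
    other = lookup_group();
    if (other) {
        free(grp);              /* lost the race: discard our copy */
        pthread_mutex_unlock(&lock);
        return other;
    }
    if (grp) {
        grp->id = 1;
        cached = grp;           /* publish under the lock */
    }
    pthread_mutex_unlock(&lock);
    return grp;
}

int main(void)
{
    return get_group() ? 0 : 1;
}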
@@ -1374,224 +1185,94 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) | |||
1374 | { | 1185 | { |
1375 | /* Currently, all async queues are mapped to root group */ | 1186 | /* Currently, all async queues are mapped to root group */ |
1376 | if (!cfq_cfqq_sync(cfqq)) | 1187 | if (!cfq_cfqq_sync(cfqq)) |
1377 | cfqg = cfqq->cfqd->root_group; | 1188 | cfqg = &cfqq->cfqd->root_group; |
1378 | 1189 | ||
1379 | cfqq->cfqg = cfqg; | 1190 | cfqq->cfqg = cfqg; |
1380 | /* cfqq reference on cfqg */ | 1191 | /* cfqq reference on cfqg */ |
1381 | cfqg_get(cfqg); | 1192 | cfqq->cfqg->ref++; |
1382 | } | 1193 | } |
1383 | 1194 | ||
1384 | static u64 cfqg_prfill_weight_device(struct seq_file *sf, | 1195 | static void cfq_put_cfqg(struct cfq_group *cfqg) |
1385 | struct blkg_policy_data *pd, int off) | ||
1386 | { | 1196 | { |
1387 | struct cfq_group *cfqg = pd_to_cfqg(pd); | 1197 | struct cfq_rb_root *st; |
1388 | 1198 | int i, j; | |
1389 | if (!cfqg->dev_weight) | ||
1390 | return 0; | ||
1391 | return __blkg_prfill_u64(sf, pd, cfqg->dev_weight); | ||
1392 | } | ||
1393 | |||
1394 | static int cfqg_print_weight_device(struct cgroup *cgrp, struct cftype *cft, | ||
1395 | struct seq_file *sf) | ||
1396 | { | ||
1397 | blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), | ||
1398 | cfqg_prfill_weight_device, &blkcg_policy_cfq, 0, | ||
1399 | false); | ||
1400 | return 0; | ||
1401 | } | ||
1402 | 1199 | ||
1403 | static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft, | 1200 | BUG_ON(cfqg->ref <= 0); |
1404 | struct seq_file *sf) | 1201 | cfqg->ref--; |
1405 | { | 1202 | if (cfqg->ref) |
1406 | seq_printf(sf, "%u\n", cgroup_to_blkcg(cgrp)->cfq_weight); | 1203 | return; |
1407 | return 0; | 1204 | for_each_cfqg_st(cfqg, i, j, st) |
1205 | BUG_ON(!RB_EMPTY_ROOT(&st->rb)); | ||
1206 | free_percpu(cfqg->blkg.stats_cpu); | ||
1207 | kfree(cfqg); | ||
1408 | } | 1208 | } |
1409 | 1209 | ||
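cfq_put_cfqg() in the right-hand column is manual reference counting: the group is created with one reference owned jointly by the cgroup and the elevator, each cfqq that links to it takes another, and the final put frees the per-cpu stats and the group itself. A minimal single-threaded sketch of the get/put discipline (the real code relies on queue_lock to serialize the counter updates):

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

struct group {
    int ref;
};

static struct group *group_alloc(void)
{
    struct group *grp = calloc(1, sizeof(*grp));

    if (grp)
        grp->ref = 1;       /* initial "joint" reference */
    return grp;
}

static void group_get(struct group *grp)
{
    grp->ref++;
}

static void group_put(struct group *grp)
{
    assert(grp->ref > 0);
    if (--grp->ref)
        return;
    printf("last reference dropped, freeing group\n");
    free(grp);
}

int main(void)
{
    struct group *grp = group_alloc();

    group_get(grp);         /* e.g. a cfqq linking itself to the group */
    group_put(grp);         /* the cfqq goes away */
    group_put(grp);         /* the initial reference: frees the group */
    return 0;
}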
1410 | static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, | 1210 | static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg) |
1411 | const char *buf) | ||
1412 | { | 1211 | { |
1413 | struct blkcg *blkcg = cgroup_to_blkcg(cgrp); | 1212 | /* Something wrong if we are trying to remove same group twice */ |
1414 | struct blkg_conf_ctx ctx; | 1213 | BUG_ON(hlist_unhashed(&cfqg->cfqd_node)); |
1415 | struct cfq_group *cfqg; | ||
1416 | int ret; | ||
1417 | 1214 | ||
1418 | ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx); | 1215 | hlist_del_init(&cfqg->cfqd_node); |
1419 | if (ret) | ||
1420 | return ret; | ||
1421 | 1216 | ||
1422 | ret = -EINVAL; | 1217 | BUG_ON(cfqd->nr_blkcg_linked_grps <= 0); |
1423 | cfqg = blkg_to_cfqg(ctx.blkg); | 1218 | cfqd->nr_blkcg_linked_grps--; |
1424 | if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) { | ||
1425 | cfqg->dev_weight = ctx.v; | ||
1426 | cfqg->new_weight = cfqg->dev_weight ?: blkcg->cfq_weight; | ||
1427 | ret = 0; | ||
1428 | } | ||
1429 | 1219 | ||
1430 | blkg_conf_finish(&ctx); | 1220 | /* |
1431 | return ret; | 1221 | * Put the reference taken at the time of creation so that when all |
1222 | * queues are gone, group can be destroyed. | ||
1223 | */ | ||
1224 | cfq_put_cfqg(cfqg); | ||
1432 | } | 1225 | } |
1433 | 1226 | ||
1434 | static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val) | 1227 | static void cfq_release_cfq_groups(struct cfq_data *cfqd) |
1435 | { | 1228 | { |
1436 | struct blkcg *blkcg = cgroup_to_blkcg(cgrp); | 1229 | struct hlist_node *pos, *n; |
1437 | struct blkcg_gq *blkg; | 1230 | struct cfq_group *cfqg; |
1438 | struct hlist_node *n; | ||
1439 | |||
1440 | if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX) | ||
1441 | return -EINVAL; | ||
1442 | |||
1443 | spin_lock_irq(&blkcg->lock); | ||
1444 | blkcg->cfq_weight = (unsigned int)val; | ||
1445 | |||
1446 | hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { | ||
1447 | struct cfq_group *cfqg = blkg_to_cfqg(blkg); | ||
1448 | 1231 | ||
1449 | if (cfqg && !cfqg->dev_weight) | 1232 | hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) { |
1450 | cfqg->new_weight = blkcg->cfq_weight; | 1233 | /* |
1234 | * If cgroup removal path got to blk_group first and removed | ||
1235 | * it from cgroup list, then it will take care of destroying | ||
1236 | * cfqg also. | ||
1237 | */ | ||
1238 | if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg)) | ||
1239 | cfq_destroy_cfqg(cfqd, cfqg); | ||
1451 | } | 1240 | } |
1452 | |||
1453 | spin_unlock_irq(&blkcg->lock); | ||
1454 | return 0; | ||
1455 | } | 1241 | } |
1456 | 1242 | ||
1457 | static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft, | 1243 | /* |
1458 | struct seq_file *sf) | 1244 | * Blk cgroup controller notification saying that blkio_group object is being |
1459 | { | 1245 | * delinked as associated cgroup object is going away. That also means that |
1460 | struct blkcg *blkcg = cgroup_to_blkcg(cgrp); | 1246 | * no new IO will come in this group. So get rid of this group as soon as |
1461 | 1247 | * any pending IO in the group is finished. | |
1462 | blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkcg_policy_cfq, | 1248 | * |
1463 | cft->private, false); | 1250 | * pointer. That means "key" is a valid cfq_data pointer as long as we are under rcu
1464 | return 0; | 1250 | * pointer. That means "key" is a valid cfq_data pointer as long as we are rcu |
1465 | } | 1251 | * read lock. |
1466 | 1252 | * | |
1467 | static int cfqg_print_rwstat(struct cgroup *cgrp, struct cftype *cft, | 1254 | * it should not be NULL as even if elevator was exiting, cgroup deletion
1468 | struct seq_file *sf) | 1254 | * it should not be NULL as even if elevator was exiting, cgroup deltion |
1469 | { | 1255 | * path got to it first. |
1470 | struct blkcg *blkcg = cgroup_to_blkcg(cgrp); | 1256 | */ |
1471 | 1257 | static void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg) | |
1472 | blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkcg_policy_cfq, | ||
1473 | cft->private, true); | ||
1474 | return 0; | ||
1475 | } | ||
1476 | |||
1477 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
1478 | static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, | ||
1479 | struct blkg_policy_data *pd, int off) | ||
1480 | { | 1258 | { |
1481 | struct cfq_group *cfqg = pd_to_cfqg(pd); | 1259 | unsigned long flags; |
1482 | u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples); | 1260 | struct cfq_data *cfqd = key; |
1483 | u64 v = 0; | ||
1484 | 1261 | ||
1485 | if (samples) { | 1262 | spin_lock_irqsave(cfqd->queue->queue_lock, flags); |
1486 | v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum); | 1263 | cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg)); |
1487 | do_div(v, samples); | 1264 | spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); |
1488 | } | ||
1489 | __blkg_prfill_u64(sf, pd, v); | ||
1490 | return 0; | ||
1491 | } | 1265 | } |
1492 | 1266 | ||
1493 | /* print avg_queue_size */ | 1267 | #else /* GROUP_IOSCHED */ |
1494 | static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft, | 1268 | static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd) |
1495 | struct seq_file *sf) | ||
1496 | { | 1269 | { |
1497 | struct blkcg *blkcg = cgroup_to_blkcg(cgrp); | 1270 | return &cfqd->root_group; |
1498 | |||
1499 | blkcg_print_blkgs(sf, blkcg, cfqg_prfill_avg_queue_size, | ||
1500 | &blkcg_policy_cfq, 0, false); | ||
1501 | return 0; | ||
1502 | } | 1271 | } |
1503 | #endif /* CONFIG_DEBUG_BLK_CGROUP */ | ||
1504 | 1272 | ||
1505 | static struct cftype cfq_blkcg_files[] = { | 1273 | static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) |
1506 | { | ||
1507 | .name = "weight_device", | ||
1508 | .read_seq_string = cfqg_print_weight_device, | ||
1509 | .write_string = cfqg_set_weight_device, | ||
1510 | .max_write_len = 256, | ||
1511 | }, | ||
1512 | { | ||
1513 | .name = "weight", | ||
1514 | .read_seq_string = cfq_print_weight, | ||
1515 | .write_u64 = cfq_set_weight, | ||
1516 | }, | ||
1517 | { | ||
1518 | .name = "time", | ||
1519 | .private = offsetof(struct cfq_group, stats.time), | ||
1520 | .read_seq_string = cfqg_print_stat, | ||
1521 | }, | ||
1522 | { | ||
1523 | .name = "sectors", | ||
1524 | .private = offsetof(struct cfq_group, stats.sectors), | ||
1525 | .read_seq_string = cfqg_print_stat, | ||
1526 | }, | ||
1527 | { | ||
1528 | .name = "io_service_bytes", | ||
1529 | .private = offsetof(struct cfq_group, stats.service_bytes), | ||
1530 | .read_seq_string = cfqg_print_rwstat, | ||
1531 | }, | ||
1532 | { | ||
1533 | .name = "io_serviced", | ||
1534 | .private = offsetof(struct cfq_group, stats.serviced), | ||
1535 | .read_seq_string = cfqg_print_rwstat, | ||
1536 | }, | ||
1537 | { | ||
1538 | .name = "io_service_time", | ||
1539 | .private = offsetof(struct cfq_group, stats.service_time), | ||
1540 | .read_seq_string = cfqg_print_rwstat, | ||
1541 | }, | ||
1542 | { | ||
1543 | .name = "io_wait_time", | ||
1544 | .private = offsetof(struct cfq_group, stats.wait_time), | ||
1545 | .read_seq_string = cfqg_print_rwstat, | ||
1546 | }, | ||
1547 | { | ||
1548 | .name = "io_merged", | ||
1549 | .private = offsetof(struct cfq_group, stats.merged), | ||
1550 | .read_seq_string = cfqg_print_rwstat, | ||
1551 | }, | ||
1552 | { | ||
1553 | .name = "io_queued", | ||
1554 | .private = offsetof(struct cfq_group, stats.queued), | ||
1555 | .read_seq_string = cfqg_print_rwstat, | ||
1556 | }, | ||
1557 | #ifdef CONFIG_DEBUG_BLK_CGROUP | ||
1558 | { | ||
1559 | .name = "avg_queue_size", | ||
1560 | .read_seq_string = cfqg_print_avg_queue_size, | ||
1561 | }, | ||
1562 | { | ||
1563 | .name = "group_wait_time", | ||
1564 | .private = offsetof(struct cfq_group, stats.group_wait_time), | ||
1565 | .read_seq_string = cfqg_print_stat, | ||
1566 | }, | ||
1567 | { | ||
1568 | .name = "idle_time", | ||
1569 | .private = offsetof(struct cfq_group, stats.idle_time), | ||
1570 | .read_seq_string = cfqg_print_stat, | ||
1571 | }, | ||
1572 | { | ||
1573 | .name = "empty_time", | ||
1574 | .private = offsetof(struct cfq_group, stats.empty_time), | ||
1575 | .read_seq_string = cfqg_print_stat, | ||
1576 | }, | ||
1577 | { | ||
1578 | .name = "dequeue", | ||
1579 | .private = offsetof(struct cfq_group, stats.dequeue), | ||
1580 | .read_seq_string = cfqg_print_stat, | ||
1581 | }, | ||
1582 | { | ||
1583 | .name = "unaccounted_time", | ||
1584 | .private = offsetof(struct cfq_group, stats.unaccounted_time), | ||
1585 | .read_seq_string = cfqg_print_stat, | ||
1586 | }, | ||
1587 | #endif /* CONFIG_DEBUG_BLK_CGROUP */ | ||
1588 | { } /* terminate */ | ||
1589 | }; | ||
1590 | #else /* GROUP_IOSCHED */ | ||
1591 | static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, | ||
1592 | struct blkcg *blkcg) | ||
1593 | { | 1274 | { |
1594 | return cfqd->root_group; | 1275 | return cfqg; |
1595 | } | 1276 | } |
1596 | 1277 | ||
1597 | static inline void | 1278 | static inline void |
@@ -1599,6 +1280,9 @@ cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { | |||
1599 | cfqq->cfqg = cfqg; | 1280 | cfqq->cfqg = cfqg; |
1600 | } | 1281 | } |
1601 | 1282 | ||
1283 | static void cfq_release_cfq_groups(struct cfq_data *cfqd) {} | ||
1284 | static inline void cfq_put_cfqg(struct cfq_group *cfqg) {} | ||
1285 | |||
1602 | #endif /* GROUP_IOSCHED */ | 1286 | #endif /* GROUP_IOSCHED */ |
1603 | 1287 | ||
1604 | /* | 1288 | /* |
@@ -1865,17 +1549,19 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) | |||
1865 | { | 1549 | { |
1866 | elv_rb_del(&cfqq->sort_list, rq); | 1550 | elv_rb_del(&cfqq->sort_list, rq); |
1867 | cfqq->queued[rq_is_sync(rq)]--; | 1551 | cfqq->queued[rq_is_sync(rq)]--; |
1868 | cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags); | 1552 | cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, |
1553 | rq_data_dir(rq), rq_is_sync(rq)); | ||
1869 | cfq_add_rq_rb(rq); | 1554 | cfq_add_rq_rb(rq); |
1870 | cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group, | 1555 | cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg, |
1871 | rq->cmd_flags); | 1556 | &cfqq->cfqd->serving_group->blkg, rq_data_dir(rq), |
1557 | rq_is_sync(rq)); | ||
1872 | } | 1558 | } |
1873 | 1559 | ||
1874 | static struct request * | 1560 | static struct request * |
1875 | cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio) | 1561 | cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio) |
1876 | { | 1562 | { |
1877 | struct task_struct *tsk = current; | 1563 | struct task_struct *tsk = current; |
1878 | struct cfq_io_cq *cic; | 1564 | struct cfq_io_context *cic; |
1879 | struct cfq_queue *cfqq; | 1565 | struct cfq_queue *cfqq; |
1880 | 1566 | ||
1881 | cic = cfq_cic_lookup(cfqd, tsk->io_context); | 1567 | cic = cfq_cic_lookup(cfqd, tsk->io_context); |
@@ -1924,7 +1610,8 @@ static void cfq_remove_request(struct request *rq) | |||
1924 | cfq_del_rq_rb(rq); | 1610 | cfq_del_rq_rb(rq); |
1925 | 1611 | ||
1926 | cfqq->cfqd->rq_queued--; | 1612 | cfqq->cfqd->rq_queued--; |
1927 | cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags); | 1613 | cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, |
1614 | rq_data_dir(rq), rq_is_sync(rq)); | ||
1928 | if (rq->cmd_flags & REQ_PRIO) { | 1615 | if (rq->cmd_flags & REQ_PRIO) { |
1929 | WARN_ON(!cfqq->prio_pending); | 1616 | WARN_ON(!cfqq->prio_pending); |
1930 | cfqq->prio_pending--; | 1617 | cfqq->prio_pending--; |
@@ -1959,7 +1646,8 @@ static void cfq_merged_request(struct request_queue *q, struct request *req, | |||
1959 | static void cfq_bio_merged(struct request_queue *q, struct request *req, | 1646 | static void cfq_bio_merged(struct request_queue *q, struct request *req, |
1960 | struct bio *bio) | 1647 | struct bio *bio) |
1961 | { | 1648 | { |
1962 | cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_rw); | 1649 | cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg, |
1650 | bio_data_dir(bio), cfq_bio_sync(bio)); | ||
1963 | } | 1651 | } |
1964 | 1652 | ||
1965 | static void | 1653 | static void |
@@ -1967,14 +1655,11 @@ cfq_merged_requests(struct request_queue *q, struct request *rq, | |||
1967 | struct request *next) | 1655 | struct request *next) |
1968 | { | 1656 | { |
1969 | struct cfq_queue *cfqq = RQ_CFQQ(rq); | 1657 | struct cfq_queue *cfqq = RQ_CFQQ(rq); |
1970 | struct cfq_data *cfqd = q->elevator->elevator_data; | ||
1971 | |||
1972 | /* | 1658 | /* |
1973 | * reposition in fifo if next is older than rq | 1659 | * reposition in fifo if next is older than rq |
1974 | */ | 1660 | */ |
1975 | if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && | 1661 | if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && |
1976 | time_before(rq_fifo_time(next), rq_fifo_time(rq)) && | 1662 | time_before(rq_fifo_time(next), rq_fifo_time(rq))) { |
1977 | cfqq == RQ_CFQQ(next)) { | ||
1978 | list_move(&rq->queuelist, &next->queuelist); | 1663 | list_move(&rq->queuelist, &next->queuelist); |
1979 | rq_set_fifo_time(rq, rq_fifo_time(next)); | 1664 | rq_set_fifo_time(rq, rq_fifo_time(next)); |
1980 | } | 1665 | } |
@@ -1982,24 +1667,15 @@ cfq_merged_requests(struct request_queue *q, struct request *rq, | |||
1982 | if (cfqq->next_rq == next) | 1667 | if (cfqq->next_rq == next) |
1983 | cfqq->next_rq = rq; | 1668 | cfqq->next_rq = rq; |
1984 | cfq_remove_request(next); | 1669 | cfq_remove_request(next); |
1985 | cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags); | 1670 | cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg, |
1986 | 1671 | rq_data_dir(next), rq_is_sync(next)); | |
1987 | cfqq = RQ_CFQQ(next); | ||
1988 | /* | ||
1989 | * all requests of this queue are merged to other queues, delete it | ||
1990 | * from the service tree. If it's the active_queue, | ||
1991 | * cfq_dispatch_requests() will choose to expire it or do idle | ||
1992 | */ | ||
1993 | if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) && | ||
1994 | cfqq != cfqd->active_queue) | ||
1995 | cfq_del_cfqq_rr(cfqd, cfqq); | ||
1996 | } | 1672 | } |
1997 | 1673 | ||
1998 | static int cfq_allow_merge(struct request_queue *q, struct request *rq, | 1674 | static int cfq_allow_merge(struct request_queue *q, struct request *rq, |
1999 | struct bio *bio) | 1675 | struct bio *bio) |
2000 | { | 1676 | { |
2001 | struct cfq_data *cfqd = q->elevator->elevator_data; | 1677 | struct cfq_data *cfqd = q->elevator->elevator_data; |
2002 | struct cfq_io_cq *cic; | 1678 | struct cfq_io_context *cic; |
2003 | struct cfq_queue *cfqq; | 1679 | struct cfq_queue *cfqq; |
2004 | 1680 | ||
2005 | /* | 1681 | /* |
@@ -2009,7 +1685,7 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq, | |||
2009 | return false; | 1685 | return false; |
2010 | 1686 | ||
2011 | /* | 1687 | /* |
2012 | * Lookup the cfqq that this bio will be queued with and allow | 1688 | * Lookup the cfqq that this bio will be queued with. Allow |
2013 | * merge only if rq is queued there. | 1689 | * merge only if rq is queued there. |
2014 | */ | 1690 | */ |
2015 | cic = cfq_cic_lookup(cfqd, current->io_context); | 1691 | cic = cfq_cic_lookup(cfqd, current->io_context); |
@@ -2023,7 +1699,7 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq, | |||
2023 | static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) | 1699 | static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) |
2024 | { | 1700 | { |
2025 | del_timer(&cfqd->idle_slice_timer); | 1701 | del_timer(&cfqd->idle_slice_timer); |
2026 | cfqg_stats_update_idle_time(cfqq->cfqg); | 1702 | cfq_blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg); |
2027 | } | 1703 | } |
2028 | 1704 | ||
2029 | static void __cfq_set_active_queue(struct cfq_data *cfqd, | 1705 | static void __cfq_set_active_queue(struct cfq_data *cfqd, |
@@ -2032,7 +1708,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd, | |||
2032 | if (cfqq) { | 1708 | if (cfqq) { |
2033 | cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d", | 1709 | cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d", |
2034 | cfqd->serving_prio, cfqd->serving_type); | 1710 | cfqd->serving_prio, cfqd->serving_type); |
2035 | cfqg_stats_update_avg_queue_size(cfqq->cfqg); | 1711 | cfq_blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg); |
2036 | cfqq->slice_start = 0; | 1712 | cfqq->slice_start = 0; |
2037 | cfqq->dispatch_start = jiffies; | 1713 | cfqq->dispatch_start = jiffies; |
2038 | cfqq->allocated_slice = 0; | 1714 | cfqq->allocated_slice = 0; |
@@ -2098,7 +1774,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
2098 | cfqd->active_queue = NULL; | 1774 | cfqd->active_queue = NULL; |
2099 | 1775 | ||
2100 | if (cfqd->active_cic) { | 1776 | if (cfqd->active_cic) { |
2101 | put_io_context(cfqd->active_cic->icq.ioc); | 1777 | put_io_context(cfqd->active_cic->ioc); |
2102 | cfqd->active_cic = NULL; | 1778 | cfqd->active_cic = NULL; |
2103 | } | 1779 | } |
2104 | } | 1780 | } |
@@ -2318,7 +1994,7 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) | |||
2318 | static void cfq_arm_slice_timer(struct cfq_data *cfqd) | 1994 | static void cfq_arm_slice_timer(struct cfq_data *cfqd) |
2319 | { | 1995 | { |
2320 | struct cfq_queue *cfqq = cfqd->active_queue; | 1996 | struct cfq_queue *cfqq = cfqd->active_queue; |
2321 | struct cfq_io_cq *cic; | 1997 | struct cfq_io_context *cic; |
2322 | unsigned long sl, group_idle = 0; | 1998 | unsigned long sl, group_idle = 0; |
2323 | 1999 | ||
2324 | /* | 2000 | /* |
@@ -2353,7 +2029,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) | |||
2353 | * task has exited, don't wait | 2029 | * task has exited, don't wait |
2354 | */ | 2030 | */ |
2355 | cic = cfqd->active_cic; | 2031 | cic = cfqd->active_cic; |
2356 | if (!cic || !atomic_read(&cic->icq.ioc->active_ref)) | 2032 | if (!cic || !atomic_read(&cic->ioc->nr_tasks)) |
2357 | return; | 2033 | return; |
2358 | 2034 | ||
2359 | /* | 2035 | /* |
@@ -2380,7 +2056,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) | |||
2380 | sl = cfqd->cfq_slice_idle; | 2056 | sl = cfqd->cfq_slice_idle; |
2381 | 2057 | ||
2382 | mod_timer(&cfqd->idle_slice_timer, jiffies + sl); | 2058 | mod_timer(&cfqd->idle_slice_timer, jiffies + sl); |
2383 | cfqg_stats_set_start_idle_time(cfqq->cfqg); | 2059 | cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg); |
2384 | cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl, | 2060 | cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl, |
2385 | group_idle ? 1 : 0); | 2061 | group_idle ? 1 : 0); |
2386 | } | 2062 | } |
@@ -2403,7 +2079,8 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) | |||
2403 | 2079 | ||
2404 | cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; | 2080 | cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; |
2405 | cfqq->nr_sectors += blk_rq_sectors(rq); | 2081 | cfqq->nr_sectors += blk_rq_sectors(rq); |
2406 | cfqg_stats_update_dispatch(cfqq->cfqg, blk_rq_bytes(rq), rq->cmd_flags); | 2082 | cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq), |
2083 | rq_data_dir(rq), rq_is_sync(rq)); | ||
2407 | } | 2084 | } |
2408 | 2085 | ||
2409 | /* | 2086 | /* |
@@ -2581,8 +2258,7 @@ new_workload: | |||
2581 | * to have higher weight. A more accurate thing would be to | 2258 | * to have higher weight. A more accurate thing would be to |
2582 | * calculate system wide async/sync ratio. | 2259 | * calculate system wide async/sync ratio. |
2583 | */ | 2260 | */ |
2584 | tmp = cfqd->cfq_target_latency * | 2261 | tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg); |
2585 | cfqg_busy_async_queues(cfqd, cfqg); | ||
2586 | tmp = tmp/cfqd->busy_queues; | 2262 | tmp = tmp/cfqd->busy_queues; |
2587 | slice = min_t(unsigned, slice, tmp); | 2263 | slice = min_t(unsigned, slice, tmp); |
2588 | 2264 | ||
@@ -2904,9 +2580,9 @@ static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq) | |||
2904 | cfq_dispatch_insert(cfqd->queue, rq); | 2580 | cfq_dispatch_insert(cfqd->queue, rq); |
2905 | 2581 | ||
2906 | if (!cfqd->active_cic) { | 2582 | if (!cfqd->active_cic) { |
2907 | struct cfq_io_cq *cic = RQ_CIC(rq); | 2583 | struct cfq_io_context *cic = RQ_CIC(rq); |
2908 | 2584 | ||
2909 | atomic_long_inc(&cic->icq.ioc->refcount); | 2585 | atomic_long_inc(&cic->ioc->refcount); |
2910 | cfqd->active_cic = cic; | 2586 | cfqd->active_cic = cic; |
2911 | } | 2587 | } |
2912 | 2588 | ||
@@ -2986,7 +2662,85 @@ static void cfq_put_queue(struct cfq_queue *cfqq) | |||
2986 | 2662 | ||
2987 | BUG_ON(cfq_cfqq_on_rr(cfqq)); | 2663 | BUG_ON(cfq_cfqq_on_rr(cfqq)); |
2988 | kmem_cache_free(cfq_pool, cfqq); | 2664 | kmem_cache_free(cfq_pool, cfqq); |
2989 | cfqg_put(cfqg); | 2665 | cfq_put_cfqg(cfqg); |
2666 | } | ||
2667 | |||
2668 | /* | ||
2669 | * Call func for each cic attached to this ioc. | ||
2670 | */ | ||
2671 | static void | ||
2672 | call_for_each_cic(struct io_context *ioc, | ||
2673 | void (*func)(struct io_context *, struct cfq_io_context *)) | ||
2674 | { | ||
2675 | struct cfq_io_context *cic; | ||
2676 | struct hlist_node *n; | ||
2677 | |||
2678 | rcu_read_lock(); | ||
2679 | |||
2680 | hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list) | ||
2681 | func(ioc, cic); | ||
2682 | |||
2683 | rcu_read_unlock(); | ||
2684 | } | ||
2685 | |||
2686 | static void cfq_cic_free_rcu(struct rcu_head *head) | ||
2687 | { | ||
2688 | struct cfq_io_context *cic; | ||
2689 | |||
2690 | cic = container_of(head, struct cfq_io_context, rcu_head); | ||
2691 | |||
2692 | kmem_cache_free(cfq_ioc_pool, cic); | ||
2693 | elv_ioc_count_dec(cfq_ioc_count); | ||
2694 | |||
2695 | if (ioc_gone) { | ||
2696 | /* | ||
2697 | * CFQ scheduler is exiting, grab exit lock and check | ||
2698 | * the pending io context count. If it hits zero, | ||
2699 | * complete ioc_gone and set it back to NULL | ||
2700 | */ | ||
2701 | spin_lock(&ioc_gone_lock); | ||
2702 | if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) { | ||
2703 | complete(ioc_gone); | ||
2704 | ioc_gone = NULL; | ||
2705 | } | ||
2706 | spin_unlock(&ioc_gone_lock); | ||
2707 | } | ||
2708 | } | ||
2709 | |||
2710 | static void cfq_cic_free(struct cfq_io_context *cic) | ||
2711 | { | ||
2712 | call_rcu(&cic->rcu_head, cfq_cic_free_rcu); | ||
2713 | } | ||
2714 | |||
2715 | static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic) | ||
2716 | { | ||
2717 | unsigned long flags; | ||
2718 | unsigned long dead_key = (unsigned long) cic->key; | ||
2719 | |||
2720 | BUG_ON(!(dead_key & CIC_DEAD_KEY)); | ||
2721 | |||
2722 | spin_lock_irqsave(&ioc->lock, flags); | ||
2723 | radix_tree_delete(&ioc->radix_root, dead_key >> CIC_DEAD_INDEX_SHIFT); | ||
2724 | hlist_del_rcu(&cic->cic_list); | ||
2725 | spin_unlock_irqrestore(&ioc->lock, flags); | ||
2726 | |||
2727 | cfq_cic_free(cic); | ||
2728 | } | ||
2729 | |||
2730 | /* | ||
2731 | * Must be called with rcu_read_lock() held or preemption otherwise disabled. | ||
2732 | * Only two callers of this - ->dtor() which is called with the rcu_read_lock(), | ||
2733 | * and ->trim() which is called with the task lock held | ||
2734 | */ | ||
2735 | static void cfq_free_io_context(struct io_context *ioc) | ||
2736 | { | ||
2737 | /* | ||
2738 | * ioc->refcount is zero here, or we are called from elv_unregister(), | ||
2739 | * so no more cic's are allowed to be linked into this ioc. So it | ||
2740 | * should be ok to iterate over the known list, we will see all cic's | ||
2741 | * since no new ones are added. | ||
2742 | */ | ||
2743 | call_for_each_cic(ioc, cic_free_func); | ||
2990 | } | 2744 | } |
2991 | 2745 | ||
2992 | static void cfq_put_cooperator(struct cfq_queue *cfqq) | 2746 | static void cfq_put_cooperator(struct cfq_queue *cfqq) |
@@ -3022,17 +2776,27 @@ static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) | |||
3022 | cfq_put_queue(cfqq); | 2776 | cfq_put_queue(cfqq); |
3023 | } | 2777 | } |
3024 | 2778 | ||
3025 | static void cfq_init_icq(struct io_cq *icq) | 2779 | static void __cfq_exit_single_io_context(struct cfq_data *cfqd, |
2780 | struct cfq_io_context *cic) | ||
3026 | { | 2781 | { |
3027 | struct cfq_io_cq *cic = icq_to_cic(icq); | 2782 | struct io_context *ioc = cic->ioc; |
3028 | 2783 | ||
3029 | cic->ttime.last_end_request = jiffies; | 2784 | list_del_init(&cic->queue_list); |
3030 | } | ||
3031 | 2785 | ||
3032 | static void cfq_exit_icq(struct io_cq *icq) | 2786 | /* |
3033 | { | 2787 | * Make sure dead mark is seen for dead queues |
3034 | struct cfq_io_cq *cic = icq_to_cic(icq); | 2788 | */ |
3035 | struct cfq_data *cfqd = cic_to_cfqd(cic); | 2789 | smp_wmb(); |
2790 | cic->key = cfqd_dead_key(cfqd); | ||
2791 | |||
2792 | rcu_read_lock(); | ||
2793 | if (rcu_dereference(ioc->ioc_data) == cic) { | ||
2794 | rcu_read_unlock(); | ||
2795 | spin_lock(&ioc->lock); | ||
2796 | rcu_assign_pointer(ioc->ioc_data, NULL); | ||
2797 | spin_unlock(&ioc->lock); | ||
2798 | } else | ||
2799 | rcu_read_unlock(); | ||
3036 | 2800 | ||
3037 | if (cic->cfqq[BLK_RW_ASYNC]) { | 2801 | if (cic->cfqq[BLK_RW_ASYNC]) { |
3038 | cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); | 2802 | cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); |
@@ -3045,7 +2809,58 @@ static void cfq_exit_icq(struct io_cq *icq) | |||
3045 | } | 2809 | } |
3046 | } | 2810 | } |
3047 | 2811 | ||
3048 | static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic) | 2812 | static void cfq_exit_single_io_context(struct io_context *ioc, |
2813 | struct cfq_io_context *cic) | ||
2814 | { | ||
2815 | struct cfq_data *cfqd = cic_to_cfqd(cic); | ||
2816 | |||
2817 | if (cfqd) { | ||
2818 | struct request_queue *q = cfqd->queue; | ||
2819 | unsigned long flags; | ||
2820 | |||
2821 | spin_lock_irqsave(q->queue_lock, flags); | ||
2822 | |||
2823 | /* | ||
2824 | * Ensure we get a fresh copy of the ->key to prevent | ||
2825 | * race between exiting task and queue | ||
2826 | */ | ||
2827 | smp_read_barrier_depends(); | ||
2828 | if (cic->key == cfqd) | ||
2829 | __cfq_exit_single_io_context(cfqd, cic); | ||
2830 | |||
2831 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
2832 | } | ||
2833 | } | ||
2834 | |||
2835 | /* | ||
2836 | * The process that ioc belongs to has exited, we need to clean up | ||
2837 | * and put the internal structures we have that belongs to that process. | ||
2838 | */ | ||
2839 | static void cfq_exit_io_context(struct io_context *ioc) | ||
2840 | { | ||
2841 | call_for_each_cic(ioc, cfq_exit_single_io_context); | ||
2842 | } | ||
2843 | |||
2844 | static struct cfq_io_context * | ||
2845 | cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) | ||
2846 | { | ||
2847 | struct cfq_io_context *cic; | ||
2848 | |||
2849 | cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO, | ||
2850 | cfqd->queue->node); | ||
2851 | if (cic) { | ||
2852 | cic->ttime.last_end_request = jiffies; | ||
2853 | INIT_LIST_HEAD(&cic->queue_list); | ||
2854 | INIT_HLIST_NODE(&cic->cic_list); | ||
2855 | cic->dtor = cfq_free_io_context; | ||
2856 | cic->exit = cfq_exit_io_context; | ||
2857 | elv_ioc_count_inc(cfq_ioc_count); | ||
2858 | } | ||
2859 | |||
2860 | return cic; | ||
2861 | } | ||
2862 | |||
2863 | static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) | ||
3049 | { | 2864 | { |
3050 | struct task_struct *tsk = current; | 2865 | struct task_struct *tsk = current; |
3051 | int ioprio_class; | 2866 | int ioprio_class; |
@@ -3053,7 +2868,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic) | |||
3053 | if (!cfq_cfqq_prio_changed(cfqq)) | 2868 | if (!cfq_cfqq_prio_changed(cfqq)) |
3054 | return; | 2869 | return; |
3055 | 2870 | ||
3056 | ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); | 2871 | ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio); |
3057 | switch (ioprio_class) { | 2872 | switch (ioprio_class) { |
3058 | default: | 2873 | default: |
3059 | printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); | 2874 | printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); |
@@ -3065,11 +2880,11 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic) | |||
3065 | cfqq->ioprio_class = task_nice_ioclass(tsk); | 2880 | cfqq->ioprio_class = task_nice_ioclass(tsk); |
3066 | break; | 2881 | break; |
3067 | case IOPRIO_CLASS_RT: | 2882 | case IOPRIO_CLASS_RT: |
3068 | cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio); | 2883 | cfqq->ioprio = task_ioprio(ioc); |
3069 | cfqq->ioprio_class = IOPRIO_CLASS_RT; | 2884 | cfqq->ioprio_class = IOPRIO_CLASS_RT; |
3070 | break; | 2885 | break; |
3071 | case IOPRIO_CLASS_BE: | 2886 | case IOPRIO_CLASS_BE: |
3072 | cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio); | 2887 | cfqq->ioprio = task_ioprio(ioc); |
3073 | cfqq->ioprio_class = IOPRIO_CLASS_BE; | 2888 | cfqq->ioprio_class = IOPRIO_CLASS_BE; |
3074 | break; | 2889 | break; |
3075 | case IOPRIO_CLASS_IDLE: | 2890 | case IOPRIO_CLASS_IDLE: |
@@ -3087,24 +2902,22 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic) | |||
3087 | cfq_clear_cfqq_prio_changed(cfqq); | 2902 | cfq_clear_cfqq_prio_changed(cfqq); |
3088 | } | 2903 | } |
3089 | 2904 | ||
3090 | static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio) | 2905 | static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic) |
3091 | { | 2906 | { |
3092 | int ioprio = cic->icq.ioc->ioprio; | ||
3093 | struct cfq_data *cfqd = cic_to_cfqd(cic); | 2907 | struct cfq_data *cfqd = cic_to_cfqd(cic); |
3094 | struct cfq_queue *cfqq; | 2908 | struct cfq_queue *cfqq; |
2909 | unsigned long flags; | ||
3095 | 2910 | ||
3096 | /* | 2911 | if (unlikely(!cfqd)) |
3097 | * Check whether ioprio has changed. The condition may trigger | ||
3098 | * spuriously on a newly created cic but there's no harm. | ||
3099 | */ | ||
3100 | if (unlikely(!cfqd) || likely(cic->ioprio == ioprio)) | ||
3101 | return; | 2912 | return; |
3102 | 2913 | ||
2914 | spin_lock_irqsave(cfqd->queue->queue_lock, flags); | ||
2915 | |||
3103 | cfqq = cic->cfqq[BLK_RW_ASYNC]; | 2916 | cfqq = cic->cfqq[BLK_RW_ASYNC]; |
3104 | if (cfqq) { | 2917 | if (cfqq) { |
3105 | struct cfq_queue *new_cfqq; | 2918 | struct cfq_queue *new_cfqq; |
3106 | new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio, | 2919 | new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->ioc, |
3107 | GFP_ATOMIC); | 2920 | GFP_ATOMIC); |
3108 | if (new_cfqq) { | 2921 | if (new_cfqq) { |
3109 | cic->cfqq[BLK_RW_ASYNC] = new_cfqq; | 2922 | cic->cfqq[BLK_RW_ASYNC] = new_cfqq; |
3110 | cfq_put_queue(cfqq); | 2923 | cfq_put_queue(cfqq); |
@@ -3115,7 +2928,13 @@ static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio) | |||
3115 | if (cfqq) | 2928 | if (cfqq) |
3116 | cfq_mark_cfqq_prio_changed(cfqq); | 2929 | cfq_mark_cfqq_prio_changed(cfqq); |
3117 | 2930 | ||
3118 | cic->ioprio = ioprio; | 2931 | spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); |
2932 | } | ||
2933 | |||
2934 | static void cfq_ioc_set_ioprio(struct io_context *ioc) | ||
2935 | { | ||
2936 | call_for_each_cic(ioc, changed_ioprio); | ||
2937 | ioc->ioprio_changed = 0; | ||
3119 | } | 2938 | } |
3120 | 2939 | ||
3121 | static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, | 2940 | static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, |
@@ -3139,24 +2958,20 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
3139 | } | 2958 | } |
3140 | 2959 | ||
3141 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | 2960 | #ifdef CONFIG_CFQ_GROUP_IOSCHED |
3142 | static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) | 2961 | static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic) |
3143 | { | 2962 | { |
2963 | struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1); | ||
3144 | struct cfq_data *cfqd = cic_to_cfqd(cic); | 2964 | struct cfq_data *cfqd = cic_to_cfqd(cic); |
3145 | struct cfq_queue *sync_cfqq; | 2965 | unsigned long flags; |
3146 | uint64_t id; | 2966 | struct request_queue *q; |
3147 | |||
3148 | rcu_read_lock(); | ||
3149 | id = bio_blkcg(bio)->id; | ||
3150 | rcu_read_unlock(); | ||
3151 | 2967 | ||
3152 | /* | 2968 | if (unlikely(!cfqd)) |
3153 | * Check whether blkcg has changed. The condition may trigger | ||
3154 | * spuriously on a newly created cic but there's no harm. | ||
3155 | */ | ||
3156 | if (unlikely(!cfqd) || likely(cic->blkcg_id == id)) | ||
3157 | return; | 2969 | return; |
3158 | 2970 | ||
3159 | sync_cfqq = cic_to_cfqq(cic, 1); | 2971 | q = cfqd->queue; |
2972 | |||
2973 | spin_lock_irqsave(q->queue_lock, flags); | ||
2974 | |||
3160 | if (sync_cfqq) { | 2975 | if (sync_cfqq) { |
3161 | /* | 2976 | /* |
3162 | * Drop reference to sync queue. A new sync queue will be | 2977 | * Drop reference to sync queue. A new sync queue will be |
@@ -3167,25 +2982,28 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) | |||
3167 | cfq_put_queue(sync_cfqq); | 2982 | cfq_put_queue(sync_cfqq); |
3168 | } | 2983 | } |
3169 | 2984 | ||
3170 | cic->blkcg_id = id; | 2985 | spin_unlock_irqrestore(q->queue_lock, flags); |
2986 | } | ||
2987 | |||
2988 | static void cfq_ioc_set_cgroup(struct io_context *ioc) | ||
2989 | { | ||
2990 | call_for_each_cic(ioc, changed_cgroup); | ||
2991 | ioc->cgroup_changed = 0; | ||
3171 | } | 2992 | } |
3172 | #else | ||
3173 | static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { } | ||
3174 | #endif /* CONFIG_CFQ_GROUP_IOSCHED */ | 2993 | #endif /* CONFIG_CFQ_GROUP_IOSCHED */ |
3175 | 2994 | ||
3176 | static struct cfq_queue * | 2995 | static struct cfq_queue * |
3177 | cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, | 2996 | cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, |
3178 | struct bio *bio, gfp_t gfp_mask) | 2997 | struct io_context *ioc, gfp_t gfp_mask) |
3179 | { | 2998 | { |
3180 | struct blkcg *blkcg; | ||
3181 | struct cfq_queue *cfqq, *new_cfqq = NULL; | 2999 | struct cfq_queue *cfqq, *new_cfqq = NULL; |
3000 | struct cfq_io_context *cic; | ||
3182 | struct cfq_group *cfqg; | 3001 | struct cfq_group *cfqg; |
3183 | 3002 | ||
3184 | retry: | 3003 | retry: |
3185 | rcu_read_lock(); | 3004 | cfqg = cfq_get_cfqg(cfqd); |
3186 | 3005 | cic = cfq_cic_lookup(cfqd, ioc); | |
3187 | blkcg = bio_blkcg(bio); | 3006 | /* cic always exists here */ |
3188 | cfqg = cfq_lookup_create_cfqg(cfqd, blkcg); | ||
3189 | cfqq = cic_to_cfqq(cic, is_sync); | 3007 | cfqq = cic_to_cfqq(cic, is_sync); |
3190 | 3008 | ||
3191 | /* | 3009 | /* |
@@ -3198,7 +3016,6 @@ retry: | |||
3198 | cfqq = new_cfqq; | 3016 | cfqq = new_cfqq; |
3199 | new_cfqq = NULL; | 3017 | new_cfqq = NULL; |
3200 | } else if (gfp_mask & __GFP_WAIT) { | 3018 | } else if (gfp_mask & __GFP_WAIT) { |
3201 | rcu_read_unlock(); | ||
3202 | spin_unlock_irq(cfqd->queue->queue_lock); | 3019 | spin_unlock_irq(cfqd->queue->queue_lock); |
3203 | new_cfqq = kmem_cache_alloc_node(cfq_pool, | 3020 | new_cfqq = kmem_cache_alloc_node(cfq_pool, |
3204 | gfp_mask | __GFP_ZERO, | 3021 | gfp_mask | __GFP_ZERO, |
@@ -3214,7 +3031,7 @@ retry: | |||
3214 | 3031 | ||
3215 | if (cfqq) { | 3032 | if (cfqq) { |
3216 | cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); | 3033 | cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); |
3217 | cfq_init_prio_data(cfqq, cic); | 3034 | cfq_init_prio_data(cfqq, ioc); |
3218 | cfq_link_cfqq_cfqg(cfqq, cfqg); | 3035 | cfq_link_cfqq_cfqg(cfqq, cfqg); |
3219 | cfq_log_cfqq(cfqd, cfqq, "alloced"); | 3036 | cfq_log_cfqq(cfqd, cfqq, "alloced"); |
3220 | } else | 3037 | } else |
@@ -3224,7 +3041,6 @@ retry: | |||
3224 | if (new_cfqq) | 3041 | if (new_cfqq) |
3225 | kmem_cache_free(cfq_pool, new_cfqq); | 3042 | kmem_cache_free(cfq_pool, new_cfqq); |
3226 | 3043 | ||
3227 | rcu_read_unlock(); | ||
3228 | return cfqq; | 3044 | return cfqq; |
3229 | } | 3045 | } |
3230 | 3046 | ||
@@ -3234,9 +3050,6 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) | |||
3234 | switch (ioprio_class) { | 3050 | switch (ioprio_class) { |
3235 | case IOPRIO_CLASS_RT: | 3051 | case IOPRIO_CLASS_RT: |
3236 | return &cfqd->async_cfqq[0][ioprio]; | 3052 | return &cfqd->async_cfqq[0][ioprio]; |
3237 | case IOPRIO_CLASS_NONE: | ||
3238 | ioprio = IOPRIO_NORM; | ||
3239 | /* fall through */ | ||
3240 | case IOPRIO_CLASS_BE: | 3053 | case IOPRIO_CLASS_BE: |
3241 | return &cfqd->async_cfqq[1][ioprio]; | 3054 | return &cfqd->async_cfqq[1][ioprio]; |
3242 | case IOPRIO_CLASS_IDLE: | 3055 | case IOPRIO_CLASS_IDLE: |
@@ -3247,11 +3060,11 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) | |||
3247 | } | 3060 | } |
3248 | 3061 | ||
3249 | static struct cfq_queue * | 3062 | static struct cfq_queue * |
3250 | cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, | 3063 | cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc, |
3251 | struct bio *bio, gfp_t gfp_mask) | 3064 | gfp_t gfp_mask) |
3252 | { | 3065 | { |
3253 | const int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); | 3066 | const int ioprio = task_ioprio(ioc); |
3254 | const int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); | 3067 | const int ioprio_class = task_ioprio_class(ioc); |
3255 | struct cfq_queue **async_cfqq = NULL; | 3068 | struct cfq_queue **async_cfqq = NULL; |
3256 | struct cfq_queue *cfqq = NULL; | 3069 | struct cfq_queue *cfqq = NULL; |
3257 | 3070 | ||
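For reference, cfq_async_queue_prio(), visible two hunks up, selects a shared async queue by (priority class, priority level), and cfq_get_queue() then pins whatever slot it returns. A standalone restatement of that table indexing; the arrays below are placeholders for cfqd->async_cfqq and cfqd->async_idle_cfqq, not the kernel fields:

    #include <stddef.h>

    #define IOPRIO_LEVELS 8

    enum prio_class { CLASS_RT, CLASS_BE, CLASS_IDLE };

    static void *async_q[2][IOPRIO_LEVELS];   /* [0] = RT row, [1] = BE row */
    static void *async_idle_q;                /* one shared idle queue */

    static void **async_queue_slot(enum prio_class cls, int prio)
    {
        switch (cls) {
        case CLASS_RT:
            return &async_q[0][prio];
        case CLASS_BE:
            return &async_q[1][prio];
        case CLASS_IDLE:
            return &async_idle_q;
        }
        return NULL;                          /* unreachable for valid input */
    }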
@@ -3261,7 +3074,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, | |||
3261 | } | 3074 | } |
3262 | 3075 | ||
3263 | if (!cfqq) | 3076 | if (!cfqq) |
3264 | cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask); | 3077 | cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask); |
3265 | 3078 | ||
3266 | /* | 3079 | /* |
3267 | * pin the queue now that it's allocated, scheduler exit will prune it | 3080 | * pin the queue now that it's allocated, scheduler exit will prune it |
@@ -3275,6 +3088,160 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, | |||
3275 | return cfqq; | 3088 | return cfqq; |
3276 | } | 3089 | } |
3277 | 3090 | ||
3091 | /* | ||
3092 | * We drop cfq io contexts lazily, so we may find a dead one. | ||
3093 | */ | ||
3094 | static void | ||
3095 | cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc, | ||
3096 | struct cfq_io_context *cic) | ||
3097 | { | ||
3098 | unsigned long flags; | ||
3099 | |||
3100 | WARN_ON(!list_empty(&cic->queue_list)); | ||
3101 | BUG_ON(cic->key != cfqd_dead_key(cfqd)); | ||
3102 | |||
3103 | spin_lock_irqsave(&ioc->lock, flags); | ||
3104 | |||
3105 | BUG_ON(rcu_dereference_check(ioc->ioc_data, | ||
3106 | lockdep_is_held(&ioc->lock)) == cic); | ||
3107 | |||
3108 | radix_tree_delete(&ioc->radix_root, cfqd->cic_index); | ||
3109 | hlist_del_rcu(&cic->cic_list); | ||
3110 | spin_unlock_irqrestore(&ioc->lock, flags); | ||
3111 | |||
3112 | cfq_cic_free(cic); | ||
3113 | } | ||
3114 | |||
3115 | static struct cfq_io_context * | ||
3116 | cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc) | ||
3117 | { | ||
3118 | struct cfq_io_context *cic; | ||
3119 | unsigned long flags; | ||
3120 | |||
3121 | if (unlikely(!ioc)) | ||
3122 | return NULL; | ||
3123 | |||
3124 | rcu_read_lock(); | ||
3125 | |||
3126 | /* | ||
3127 | * we maintain a last-hit cache, to avoid browsing over the tree | ||
3128 | */ | ||
3129 | cic = rcu_dereference(ioc->ioc_data); | ||
3130 | if (cic && cic->key == cfqd) { | ||
3131 | rcu_read_unlock(); | ||
3132 | return cic; | ||
3133 | } | ||
3134 | |||
3135 | do { | ||
3136 | cic = radix_tree_lookup(&ioc->radix_root, cfqd->cic_index); | ||
3137 | rcu_read_unlock(); | ||
3138 | if (!cic) | ||
3139 | break; | ||
3140 | if (unlikely(cic->key != cfqd)) { | ||
3141 | cfq_drop_dead_cic(cfqd, ioc, cic); | ||
3142 | rcu_read_lock(); | ||
3143 | continue; | ||
3144 | } | ||
3145 | |||
3146 | spin_lock_irqsave(&ioc->lock, flags); | ||
3147 | rcu_assign_pointer(ioc->ioc_data, cic); | ||
3148 | spin_unlock_irqrestore(&ioc->lock, flags); | ||
3149 | break; | ||
3150 | } while (1); | ||
3151 | |||
3152 | return cic; | ||
3153 | } | ||
3154 | |||
3155 | /* | ||
3156 | * Add cic into ioc, using cfqd as the search key. This enables us to lookup | ||
3157 | * the process specific cfq io context when entered from the block layer. | ||
3158 | * Also adds the cic to a per-cfqd list, used when this queue is removed. | ||
3159 | */ | ||
3160 | static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc, | ||
3161 | struct cfq_io_context *cic, gfp_t gfp_mask) | ||
3162 | { | ||
3163 | unsigned long flags; | ||
3164 | int ret; | ||
3165 | |||
3166 | ret = radix_tree_preload(gfp_mask); | ||
3167 | if (!ret) { | ||
3168 | cic->ioc = ioc; | ||
3169 | cic->key = cfqd; | ||
3170 | |||
3171 | spin_lock_irqsave(&ioc->lock, flags); | ||
3172 | ret = radix_tree_insert(&ioc->radix_root, | ||
3173 | cfqd->cic_index, cic); | ||
3174 | if (!ret) | ||
3175 | hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list); | ||
3176 | spin_unlock_irqrestore(&ioc->lock, flags); | ||
3177 | |||
3178 | radix_tree_preload_end(); | ||
3179 | |||
3180 | if (!ret) { | ||
3181 | spin_lock_irqsave(cfqd->queue->queue_lock, flags); | ||
3182 | list_add(&cic->queue_list, &cfqd->cic_list); | ||
3183 | spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); | ||
3184 | } | ||
3185 | } | ||
3186 | |||
3187 | if (ret && ret != -EEXIST) | ||
3188 | printk(KERN_ERR "cfq: cic link failed!\n"); | ||
3189 | |||
3190 | return ret; | ||
3191 | } | ||
3192 | |||
3193 | /* | ||
3194 | * Setup general io context and cfq io context. There can be several cfq | ||
3195 | * io contexts per general io context, if this process is doing io to more | ||
3196 | * than one device managed by cfq. | ||
3197 | */ | ||
3198 | static struct cfq_io_context * | ||
3199 | cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) | ||
3200 | { | ||
3201 | struct io_context *ioc = NULL; | ||
3202 | struct cfq_io_context *cic; | ||
3203 | int ret; | ||
3204 | |||
3205 | might_sleep_if(gfp_mask & __GFP_WAIT); | ||
3206 | |||
3207 | ioc = get_io_context(gfp_mask, cfqd->queue->node); | ||
3208 | if (!ioc) | ||
3209 | return NULL; | ||
3210 | |||
3211 | retry: | ||
3212 | cic = cfq_cic_lookup(cfqd, ioc); | ||
3213 | if (cic) | ||
3214 | goto out; | ||
3215 | |||
3216 | cic = cfq_alloc_io_context(cfqd, gfp_mask); | ||
3217 | if (cic == NULL) | ||
3218 | goto err; | ||
3219 | |||
3220 | ret = cfq_cic_link(cfqd, ioc, cic, gfp_mask); | ||
3221 | if (ret == -EEXIST) { | ||
3222 | /* someone has linked cic to ioc already */ | ||
3223 | cfq_cic_free(cic); | ||
3224 | goto retry; | ||
3225 | } else if (ret) | ||
3226 | goto err_free; | ||
3227 | |||
3228 | out: | ||
3229 | smp_read_barrier_depends(); | ||
3230 | if (unlikely(ioc->ioprio_changed)) | ||
3231 | cfq_ioc_set_ioprio(ioc); | ||
3232 | |||
3233 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
3234 | if (unlikely(ioc->cgroup_changed)) | ||
3235 | cfq_ioc_set_cgroup(ioc); | ||
3236 | #endif | ||
3237 | return cic; | ||
3238 | err_free: | ||
3239 | cfq_cic_free(cic); | ||
3240 | err: | ||
3241 | put_io_context(ioc); | ||
3242 | return NULL; | ||
3243 | } | ||
3244 | |||
3278 | static void | 3245 | static void |
3279 | __cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle) | 3246 | __cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle) |
3280 | { | 3247 | { |
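The cfq_cic_lookup() added in the hunk above fronts the per-ioc radix tree with a one-entry "last hit" cache (ioc->ioc_data) and lazily drops cics whose key no longer matches this queue. The sketch below restates only the cache-then-search shape in plain userspace C; the linked list, the mutex and all names are stand-ins for the RCU-protected radix tree, not the kernel API:

    #include <pthread.h>
    #include <stddef.h>

    struct ctx { void *key; struct ctx *next; };

    struct registry {
        pthread_mutex_t lock;
        struct ctx *cached;              /* last successful lookup */
        struct ctx *head;                /* linked list instead of a radix tree */
    };

    static struct ctx *registry_lookup(struct registry *r, void *key)
    {
        struct ctx *c;

        pthread_mutex_lock(&r->lock);
        c = r->cached;
        if (c && c->key == key)          /* fast path: last-hit cache */
            goto out;
        for (c = r->head; c; c = c->next)
            if (c->key == key)
                break;
        if (c)
            r->cached = c;               /* refresh the cache for next time */
    out:
        pthread_mutex_unlock(&r->lock);
        return c;
    }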
@@ -3288,7 +3255,7 @@ __cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle) | |||
3288 | 3255 | ||
3289 | static void | 3256 | static void |
3290 | cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq, | 3257 | cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq, |
3291 | struct cfq_io_cq *cic) | 3258 | struct cfq_io_context *cic) |
3292 | { | 3259 | { |
3293 | if (cfq_cfqq_sync(cfqq)) { | 3260 | if (cfq_cfqq_sync(cfqq)) { |
3294 | __cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle); | 3261 | __cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle); |
@@ -3326,7 +3293,7 @@ cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
3326 | */ | 3293 | */ |
3327 | static void | 3294 | static void |
3328 | cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, | 3295 | cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, |
3329 | struct cfq_io_cq *cic) | 3296 | struct cfq_io_context *cic) |
3330 | { | 3297 | { |
3331 | int old_idle, enable_idle; | 3298 | int old_idle, enable_idle; |
3332 | 3299 | ||
@@ -3343,9 +3310,8 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
3343 | 3310 | ||
3344 | if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE)) | 3311 | if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE)) |
3345 | enable_idle = 0; | 3312 | enable_idle = 0; |
3346 | else if (!atomic_read(&cic->icq.ioc->active_ref) || | 3313 | else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || |
3347 | !cfqd->cfq_slice_idle || | 3314 | (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) |
3348 | (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) | ||
3349 | enable_idle = 0; | 3315 | enable_idle = 0; |
3350 | else if (sample_valid(cic->ttime.ttime_samples)) { | 3316 | else if (sample_valid(cic->ttime.ttime_samples)) { |
3351 | if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle) | 3317 | if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle) |
@@ -3445,7 +3411,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, | |||
3445 | */ | 3411 | */ |
3446 | static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) | 3412 | static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) |
3447 | { | 3413 | { |
3448 | enum wl_type_t old_type = cfqq_type(cfqd->active_queue); | 3414 | struct cfq_queue *old_cfqq = cfqd->active_queue; |
3449 | 3415 | ||
3450 | cfq_log_cfqq(cfqd, cfqq, "preempt"); | 3416 | cfq_log_cfqq(cfqd, cfqq, "preempt"); |
3451 | cfq_slice_expired(cfqd, 1); | 3417 | cfq_slice_expired(cfqd, 1); |
@@ -3454,7 +3420,7 @@ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) | |||
3454 | * workload type is changed, don't save slice, otherwise preempt | 3420 | * workload type is changed, don't save slice, otherwise preempt |
3455 | * doesn't happen | 3421 | * doesn't happen |
3456 | */ | 3422 | */ |
3457 | if (old_type != cfqq_type(cfqq)) | 3423 | if (cfqq_type(old_cfqq) != cfqq_type(cfqq)) |
3458 | cfqq->cfqg->saved_workload_slice = 0; | 3424 | cfqq->cfqg->saved_workload_slice = 0; |
3459 | 3425 | ||
3460 | /* | 3426 | /* |
@@ -3477,7 +3443,7 @@ static void | |||
3477 | cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, | 3443 | cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, |
3478 | struct request *rq) | 3444 | struct request *rq) |
3479 | { | 3445 | { |
3480 | struct cfq_io_cq *cic = RQ_CIC(rq); | 3446 | struct cfq_io_context *cic = RQ_CIC(rq); |
3481 | 3447 | ||
3482 | cfqd->rq_queued++; | 3448 | cfqd->rq_queued++; |
3483 | if (rq->cmd_flags & REQ_PRIO) | 3449 | if (rq->cmd_flags & REQ_PRIO) |
@@ -3507,7 +3473,8 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, | |||
3507 | cfq_clear_cfqq_wait_request(cfqq); | 3473 | cfq_clear_cfqq_wait_request(cfqq); |
3508 | __blk_run_queue(cfqd->queue); | 3474 | __blk_run_queue(cfqd->queue); |
3509 | } else { | 3475 | } else { |
3510 | cfqg_stats_update_idle_time(cfqq->cfqg); | 3476 | cfq_blkiocg_update_idle_time_stats( |
3477 | &cfqq->cfqg->blkg); | ||
3511 | cfq_mark_cfqq_must_dispatch(cfqq); | 3478 | cfq_mark_cfqq_must_dispatch(cfqq); |
3512 | } | 3479 | } |
3513 | } | 3480 | } |
@@ -3529,13 +3496,14 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq) | |||
3529 | struct cfq_queue *cfqq = RQ_CFQQ(rq); | 3496 | struct cfq_queue *cfqq = RQ_CFQQ(rq); |
3530 | 3497 | ||
3531 | cfq_log_cfqq(cfqd, cfqq, "insert_request"); | 3498 | cfq_log_cfqq(cfqd, cfqq, "insert_request"); |
3532 | cfq_init_prio_data(cfqq, RQ_CIC(rq)); | 3499 | cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc); |
3533 | 3500 | ||
3534 | rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); | 3501 | rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); |
3535 | list_add_tail(&rq->queuelist, &cfqq->fifo); | 3502 | list_add_tail(&rq->queuelist, &cfqq->fifo); |
3536 | cfq_add_rq_rb(rq); | 3503 | cfq_add_rq_rb(rq); |
3537 | cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group, | 3504 | cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg, |
3538 | rq->cmd_flags); | 3505 | &cfqd->serving_group->blkg, rq_data_dir(rq), |
3506 | rq_is_sync(rq)); | ||
3539 | cfq_rq_enqueued(cfqd, cfqq, rq); | 3507 | cfq_rq_enqueued(cfqd, cfqq, rq); |
3540 | } | 3508 | } |
3541 | 3509 | ||
@@ -3578,7 +3546,7 @@ static void cfq_update_hw_tag(struct cfq_data *cfqd) | |||
3578 | 3546 | ||
3579 | static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq) | 3547 | static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq) |
3580 | { | 3548 | { |
3581 | struct cfq_io_cq *cic = cfqd->active_cic; | 3549 | struct cfq_io_context *cic = cfqd->active_cic; |
3582 | 3550 | ||
3583 | /* If the queue already has requests, don't wait */ | 3551 | /* If the queue already has requests, don't wait */ |
3584 | if (!RB_EMPTY_ROOT(&cfqq->sort_list)) | 3552 | if (!RB_EMPTY_ROOT(&cfqq->sort_list)) |
@@ -3631,8 +3599,9 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) | |||
3631 | cfqd->rq_in_driver--; | 3599 | cfqd->rq_in_driver--; |
3632 | cfqq->dispatched--; | 3600 | cfqq->dispatched--; |
3633 | (RQ_CFQG(rq))->dispatched--; | 3601 | (RQ_CFQG(rq))->dispatched--; |
3634 | cfqg_stats_update_completion(cfqq->cfqg, rq_start_time_ns(rq), | 3602 | cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg, |
3635 | rq_io_start_time_ns(rq), rq->cmd_flags); | 3603 | rq_start_time_ns(rq), rq_io_start_time_ns(rq), |
3604 | rq_data_dir(rq), rq_is_sync(rq)); | ||
3636 | 3605 | ||
3637 | cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; | 3606 | cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; |
3638 | 3607 | ||
@@ -3714,7 +3683,7 @@ static int cfq_may_queue(struct request_queue *q, int rw) | |||
3714 | { | 3683 | { |
3715 | struct cfq_data *cfqd = q->elevator->elevator_data; | 3684 | struct cfq_data *cfqd = q->elevator->elevator_data; |
3716 | struct task_struct *tsk = current; | 3685 | struct task_struct *tsk = current; |
3717 | struct cfq_io_cq *cic; | 3686 | struct cfq_io_context *cic; |
3718 | struct cfq_queue *cfqq; | 3687 | struct cfq_queue *cfqq; |
3719 | 3688 | ||
3720 | /* | 3689 | /* |
@@ -3729,7 +3698,7 @@ static int cfq_may_queue(struct request_queue *q, int rw) | |||
3729 | 3698 | ||
3730 | cfqq = cic_to_cfqq(cic, rw_is_sync(rw)); | 3699 | cfqq = cic_to_cfqq(cic, rw_is_sync(rw)); |
3731 | if (cfqq) { | 3700 | if (cfqq) { |
3732 | cfq_init_prio_data(cfqq, cic); | 3701 | cfq_init_prio_data(cfqq, cic->ioc); |
3733 | 3702 | ||
3734 | return __cfq_may_queue(cfqq); | 3703 | return __cfq_may_queue(cfqq); |
3735 | } | 3704 | } |
@@ -3750,17 +3719,21 @@ static void cfq_put_request(struct request *rq) | |||
3750 | BUG_ON(!cfqq->allocated[rw]); | 3719 | BUG_ON(!cfqq->allocated[rw]); |
3751 | cfqq->allocated[rw]--; | 3720 | cfqq->allocated[rw]--; |
3752 | 3721 | ||
3722 | put_io_context(RQ_CIC(rq)->ioc); | ||
3723 | |||
3724 | rq->elevator_private[0] = NULL; | ||
3725 | rq->elevator_private[1] = NULL; | ||
3726 | |||
3753 | /* Put down rq reference on cfqg */ | 3727 | /* Put down rq reference on cfqg */ |
3754 | cfqg_put(RQ_CFQG(rq)); | 3728 | cfq_put_cfqg(RQ_CFQG(rq)); |
3755 | rq->elv.priv[0] = NULL; | 3729 | rq->elevator_private[2] = NULL; |
3756 | rq->elv.priv[1] = NULL; | ||
3757 | 3730 | ||
3758 | cfq_put_queue(cfqq); | 3731 | cfq_put_queue(cfqq); |
3759 | } | 3732 | } |
3760 | } | 3733 | } |
3761 | 3734 | ||
3762 | static struct cfq_queue * | 3735 | static struct cfq_queue * |
3763 | cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic, | 3736 | cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic, |
3764 | struct cfq_queue *cfqq) | 3737 | struct cfq_queue *cfqq) |
3765 | { | 3738 | { |
3766 | cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq); | 3739 | cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq); |
@@ -3775,7 +3748,7 @@ cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic, | |||
3775 | * was the last process referring to said cfqq. | 3748 | * was the last process referring to said cfqq. |
3776 | */ | 3749 | */ |
3777 | static struct cfq_queue * | 3750 | static struct cfq_queue * |
3778 | split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq) | 3751 | split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq) |
3779 | { | 3752 | { |
3780 | if (cfqq_process_refs(cfqq) == 1) { | 3753 | if (cfqq_process_refs(cfqq) == 1) { |
3781 | cfqq->pid = current->pid; | 3754 | cfqq->pid = current->pid; |
@@ -3795,25 +3768,28 @@ split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq) | |||
3795 | * Allocate cfq data structures associated with this request. | 3768 | * Allocate cfq data structures associated with this request. |
3796 | */ | 3769 | */ |
3797 | static int | 3770 | static int |
3798 | cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, | 3771 | cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) |
3799 | gfp_t gfp_mask) | ||
3800 | { | 3772 | { |
3801 | struct cfq_data *cfqd = q->elevator->elevator_data; | 3773 | struct cfq_data *cfqd = q->elevator->elevator_data; |
3802 | struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq); | 3774 | struct cfq_io_context *cic; |
3803 | const int rw = rq_data_dir(rq); | 3775 | const int rw = rq_data_dir(rq); |
3804 | const bool is_sync = rq_is_sync(rq); | 3776 | const bool is_sync = rq_is_sync(rq); |
3805 | struct cfq_queue *cfqq; | 3777 | struct cfq_queue *cfqq; |
3778 | unsigned long flags; | ||
3806 | 3779 | ||
3807 | might_sleep_if(gfp_mask & __GFP_WAIT); | 3780 | might_sleep_if(gfp_mask & __GFP_WAIT); |
3808 | 3781 | ||
3809 | spin_lock_irq(q->queue_lock); | 3782 | cic = cfq_get_io_context(cfqd, gfp_mask); |
3783 | |||
3784 | spin_lock_irqsave(q->queue_lock, flags); | ||
3785 | |||
3786 | if (!cic) | ||
3787 | goto queue_fail; | ||
3810 | 3788 | ||
3811 | check_ioprio_changed(cic, bio); | ||
3812 | check_blkcg_changed(cic, bio); | ||
3813 | new_queue: | 3789 | new_queue: |
3814 | cfqq = cic_to_cfqq(cic, is_sync); | 3790 | cfqq = cic_to_cfqq(cic, is_sync); |
3815 | if (!cfqq || cfqq == &cfqd->oom_cfqq) { | 3791 | if (!cfqq || cfqq == &cfqd->oom_cfqq) { |
3816 | cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask); | 3792 | cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask); |
3817 | cic_set_cfqq(cic, cfqq, is_sync); | 3793 | cic_set_cfqq(cic, cfqq, is_sync); |
3818 | } else { | 3794 | } else { |
3819 | /* | 3795 | /* |
@@ -3839,11 +3815,17 @@ new_queue: | |||
3839 | cfqq->allocated[rw]++; | 3815 | cfqq->allocated[rw]++; |
3840 | 3816 | ||
3841 | cfqq->ref++; | 3817 | cfqq->ref++; |
3842 | cfqg_get(cfqq->cfqg); | 3818 | rq->elevator_private[0] = cic; |
3843 | rq->elv.priv[0] = cfqq; | 3819 | rq->elevator_private[1] = cfqq; |
3844 | rq->elv.priv[1] = cfqq->cfqg; | 3820 | rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg); |
3845 | spin_unlock_irq(q->queue_lock); | 3821 | spin_unlock_irqrestore(q->queue_lock, flags); |
3846 | return 0; | 3822 | return 0; |
3823 | |||
3824 | queue_fail: | ||
3825 | cfq_schedule_dispatch(cfqd); | ||
3826 | spin_unlock_irqrestore(q->queue_lock, flags); | ||
3827 | cfq_log(cfqd, "set_request fail"); | ||
3828 | return 1; | ||
3847 | } | 3829 | } |
3848 | 3830 | ||
3849 | static void cfq_kick_queue(struct work_struct *work) | 3831 | static void cfq_kick_queue(struct work_struct *work) |
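cfq_set_request() in the two hunks above follows a lookup-or-create shape: reuse the queue cached in the per-process context, allocate a new one when nothing is cached or only the shared oom fallback is, and attach the result to the request. A reduced sketch of that shape; the structs and names below are made up for illustration, not the cfq types:

    #include <stdlib.h>

    struct queue { int allocated; };

    struct proc_ctx {
        struct queue *cached[2];        /* [0] = async, [1] = sync */
    };

    static struct queue oom_queue;      /* shared fallback, never freed */

    static struct queue *get_queue(struct proc_ctx *ctx, int is_sync)
    {
        struct queue *q = ctx->cached[is_sync];

        if (!q || q == &oom_queue) {    /* nothing cached, or only the fallback */
            q = calloc(1, sizeof(*q));
            if (!q)
                q = &oom_queue;         /* degrade instead of failing the request */
            ctx->cached[is_sync] = q;
        }
        q->allocated++;
        return q;
    }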
@@ -3938,6 +3920,7 @@ static void cfq_exit_queue(struct elevator_queue *e) | |||
3938 | { | 3920 | { |
3939 | struct cfq_data *cfqd = e->elevator_data; | 3921 | struct cfq_data *cfqd = e->elevator_data; |
3940 | struct request_queue *q = cfqd->queue; | 3922 | struct request_queue *q = cfqd->queue; |
3923 | bool wait = false; | ||
3941 | 3924 | ||
3942 | cfq_shutdown_timer_wq(cfqd); | 3925 | cfq_shutdown_timer_wq(cfqd); |
3943 | 3926 | ||
@@ -3946,54 +3929,139 @@ static void cfq_exit_queue(struct elevator_queue *e) | |||
3946 | if (cfqd->active_queue) | 3929 | if (cfqd->active_queue) |
3947 | __cfq_slice_expired(cfqd, cfqd->active_queue, 0); | 3930 | __cfq_slice_expired(cfqd, cfqd->active_queue, 0); |
3948 | 3931 | ||
3932 | while (!list_empty(&cfqd->cic_list)) { | ||
3933 | struct cfq_io_context *cic = list_entry(cfqd->cic_list.next, | ||
3934 | struct cfq_io_context, | ||
3935 | queue_list); | ||
3936 | |||
3937 | __cfq_exit_single_io_context(cfqd, cic); | ||
3938 | } | ||
3939 | |||
3949 | cfq_put_async_queues(cfqd); | 3940 | cfq_put_async_queues(cfqd); |
3941 | cfq_release_cfq_groups(cfqd); | ||
3942 | |||
3943 | /* | ||
3944 | * If there are groups which we could not unlink from blkcg list, | ||
3945 | * wait for a rcu period for them to be freed. | ||
3946 | */ | ||
3947 | if (cfqd->nr_blkcg_linked_grps) | ||
3948 | wait = true; | ||
3950 | 3949 | ||
3951 | spin_unlock_irq(q->queue_lock); | 3950 | spin_unlock_irq(q->queue_lock); |
3952 | 3951 | ||
3953 | cfq_shutdown_timer_wq(cfqd); | 3952 | cfq_shutdown_timer_wq(cfqd); |
3954 | 3953 | ||
3954 | spin_lock(&cic_index_lock); | ||
3955 | ida_remove(&cic_index_ida, cfqd->cic_index); | ||
3956 | spin_unlock(&cic_index_lock); | ||
3957 | |||
3958 | /* | ||
3959 | * Wait for cfqg->blkg->key accessors to exit their grace periods. | ||
3960 | * Do this wait only if there are other unlinked groups out | ||
3961 | * there. This can happen if cgroup deletion path claimed the | ||
3962 | * responsibility of cleaning up a group before queue cleanup code | ||
3963 | * get to the group. | ||
3964 | * | ||
3965 | * Do not call synchronize_rcu() unconditionally as there are drivers | ||
3966 | * which create/delete request queue hundreds of times during scan/boot | ||
3967 | * and synchronize_rcu() can take significant time and slow down boot. | ||
3968 | */ | ||
3969 | if (wait) | ||
3970 | synchronize_rcu(); | ||
3971 | |||
3955 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | 3972 | #ifdef CONFIG_CFQ_GROUP_IOSCHED |
3956 | blkcg_deactivate_policy(q, &blkcg_policy_cfq); | 3973 | /* Free up per cpu stats for root group */ |
3957 | #else | 3974 | free_percpu(cfqd->root_group.blkg.stats_cpu); |
3958 | kfree(cfqd->root_group); | ||
3959 | #endif | 3975 | #endif |
3960 | kfree(cfqd); | 3976 | kfree(cfqd); |
3961 | } | 3977 | } |
3962 | 3978 | ||
3963 | static int cfq_init_queue(struct request_queue *q) | 3979 | static int cfq_alloc_cic_index(void) |
3980 | { | ||
3981 | int index, error; | ||
3982 | |||
3983 | do { | ||
3984 | if (!ida_pre_get(&cic_index_ida, GFP_KERNEL)) | ||
3985 | return -ENOMEM; | ||
3986 | |||
3987 | spin_lock(&cic_index_lock); | ||
3988 | error = ida_get_new(&cic_index_ida, &index); | ||
3989 | spin_unlock(&cic_index_lock); | ||
3990 | if (error && error != -EAGAIN) | ||
3991 | return error; | ||
3992 | } while (error); | ||
3993 | |||
3994 | return index; | ||
3995 | } | ||
3996 | |||
3997 | static void *cfq_init_queue(struct request_queue *q) | ||
3964 | { | 3998 | { |
3965 | struct cfq_data *cfqd; | 3999 | struct cfq_data *cfqd; |
3966 | struct blkcg_gq *blkg __maybe_unused; | 4000 | int i, j; |
3967 | int i, ret; | 4001 | struct cfq_group *cfqg; |
4002 | struct cfq_rb_root *st; | ||
4003 | |||
4004 | i = cfq_alloc_cic_index(); | ||
4005 | if (i < 0) | ||
4006 | return NULL; | ||
3968 | 4007 | ||
3969 | cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); | 4008 | cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); |
3970 | if (!cfqd) | 4009 | if (!cfqd) { |
3971 | return -ENOMEM; | 4010 | spin_lock(&cic_index_lock); |
4011 | ida_remove(&cic_index_ida, i); | ||
4012 | spin_unlock(&cic_index_lock); | ||
4013 | return NULL; | ||
4014 | } | ||
3972 | 4015 | ||
3973 | cfqd->queue = q; | 4016 | /* |
3974 | q->elevator->elevator_data = cfqd; | 4017 | * Don't need take queue_lock in the routine, since we are |
4018 | * initializing the ioscheduler, and nobody is using cfqd | ||
4019 | */ | ||
4020 | cfqd->cic_index = i; | ||
3975 | 4021 | ||
3976 | /* Init root service tree */ | 4022 | /* Init root service tree */ |
3977 | cfqd->grp_service_tree = CFQ_RB_ROOT; | 4023 | cfqd->grp_service_tree = CFQ_RB_ROOT; |
3978 | 4024 | ||
3979 | /* Init root group and prefer root group over other groups by default */ | 4025 | /* Init root group */ |
4026 | cfqg = &cfqd->root_group; | ||
4027 | for_each_cfqg_st(cfqg, i, j, st) | ||
4028 | *st = CFQ_RB_ROOT; | ||
4029 | RB_CLEAR_NODE(&cfqg->rb_node); | ||
4030 | |||
4031 | /* Give preference to root group over other groups */ | ||
4032 | cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT; | ||
4033 | |||
3980 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | 4034 | #ifdef CONFIG_CFQ_GROUP_IOSCHED |
3981 | ret = blkcg_activate_policy(q, &blkcg_policy_cfq); | 4035 | /* |
3982 | if (ret) | 4036 | * Set root group reference to 2. One reference will be dropped when |
3983 | goto out_free; | 4037 | * all groups on cfqd->cfqg_list are being deleted during queue exit. |
4038 | * Other reference will remain there as we don't want to delete this | ||
4039 | * group as it is statically allocated and gets destroyed when | ||
4040 | * throtl_data goes away. | ||
4041 | */ | ||
4042 | cfqg->ref = 2; | ||
3984 | 4043 | ||
3985 | cfqd->root_group = blkg_to_cfqg(q->root_blkg); | 4044 | if (blkio_alloc_blkg_stats(&cfqg->blkg)) { |
3986 | #else | 4045 | kfree(cfqg); |
3987 | ret = -ENOMEM; | ||
3988 | cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group), | ||
3989 | GFP_KERNEL, cfqd->queue->node); | ||
3990 | if (!cfqd->root_group) | ||
3991 | goto out_free; | ||
3992 | 4046 | ||
3993 | cfq_init_cfqg_base(cfqd->root_group); | 4047 | spin_lock(&cic_index_lock); |
3994 | #endif | 4048 | ida_remove(&cic_index_ida, cfqd->cic_index); |
3995 | cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT; | 4049 | spin_unlock(&cic_index_lock); |
4050 | |||
4051 | kfree(cfqd); | ||
4052 | return NULL; | ||
4053 | } | ||
4054 | |||
4055 | rcu_read_lock(); | ||
4056 | |||
4057 | cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, | ||
4058 | (void *)cfqd, 0); | ||
4059 | rcu_read_unlock(); | ||
4060 | cfqd->nr_blkcg_linked_grps++; | ||
3996 | 4061 | ||
4062 | /* Add group on cfqd->cfqg_list */ | ||
4063 | hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); | ||
4064 | #endif | ||
3997 | /* | 4065 | /* |
3998 | * Not strictly needed (since RB_ROOT just clears the node and we | 4066 | * Not strictly needed (since RB_ROOT just clears the node and we |
3999 | * zeroed cfqd on alloc), but better be safe in case someone decides | 4067 | * zeroed cfqd on alloc), but better be safe in case someone decides |
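cfq_alloc_cic_index(), added in the hunk above, wraps the ida allocator in the usual preload-then-retry loop: -EAGAIN means "preload and try again", anything else is a hard failure. A compact userspace analogue of that retry shape; the tiny bitmap allocator is only a stand-in for the ida and never actually returns -EAGAIN, but the loop shows where a transient failure would be retried:

    #include <errno.h>
    #include <pthread.h>

    #define MAX_IDS 32

    static pthread_mutex_t id_lock = PTHREAD_MUTEX_INITIALIZER;
    static unsigned long id_bitmap;            /* bit i set => id i in use */

    static int id_get_new(int *out)
    {
        for (int i = 0; i < MAX_IDS; i++) {
            if (!(id_bitmap & (1UL << i))) {
                id_bitmap |= 1UL << i;
                *out = i;
                return 0;
            }
        }
        return -ENOSPC;                        /* hard failure: table is full */
    }

    static int alloc_index(void)
    {
        int index, error;

        do {
            pthread_mutex_lock(&id_lock);
            error = id_get_new(&index);
            pthread_mutex_unlock(&id_lock);
            if (error && error != -EAGAIN)
                return error;                  /* anything but -EAGAIN is fatal */
        } while (error);                       /* -EAGAIN: retry the allocation */

        return index;
    }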
@@ -4005,17 +4073,15 @@ static int cfq_init_queue(struct request_queue *q) | |||
4005 | /* | 4073 | /* |
4006 | * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues. | 4074 | * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues. |
4007 | * Grab a permanent reference to it, so that the normal code flow | 4075 | * Grab a permanent reference to it, so that the normal code flow |
4008 | * will not attempt to free it. oom_cfqq is linked to root_group | 4076 | * will not attempt to free it. |
4009 | * but shouldn't hold a reference as it'll never be unlinked. Lose | ||
4010 | * the reference from linking right away. | ||
4011 | */ | 4077 | */ |
4012 | cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); | 4078 | cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); |
4013 | cfqd->oom_cfqq.ref++; | 4079 | cfqd->oom_cfqq.ref++; |
4080 | cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group); | ||
4014 | 4081 | ||
4015 | spin_lock_irq(q->queue_lock); | 4082 | INIT_LIST_HEAD(&cfqd->cic_list); |
4016 | cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group); | 4083 | |
4017 | cfqg_put(cfqd->root_group); | 4084 | cfqd->queue = q; |
4018 | spin_unlock_irq(q->queue_lock); | ||
4019 | 4085 | ||
4020 | init_timer(&cfqd->idle_slice_timer); | 4086 | init_timer(&cfqd->idle_slice_timer); |
4021 | cfqd->idle_slice_timer.function = cfq_idle_slice_timer; | 4087 | cfqd->idle_slice_timer.function = cfq_idle_slice_timer; |
@@ -4030,7 +4096,6 @@ static int cfq_init_queue(struct request_queue *q) | |||
4030 | cfqd->cfq_back_penalty = cfq_back_penalty; | 4096 | cfqd->cfq_back_penalty = cfq_back_penalty; |
4031 | cfqd->cfq_slice[0] = cfq_slice_async; | 4097 | cfqd->cfq_slice[0] = cfq_slice_async; |
4032 | cfqd->cfq_slice[1] = cfq_slice_sync; | 4098 | cfqd->cfq_slice[1] = cfq_slice_sync; |
4033 | cfqd->cfq_target_latency = cfq_target_latency; | ||
4034 | cfqd->cfq_slice_async_rq = cfq_slice_async_rq; | 4099 | cfqd->cfq_slice_async_rq = cfq_slice_async_rq; |
4035 | cfqd->cfq_slice_idle = cfq_slice_idle; | 4100 | cfqd->cfq_slice_idle = cfq_slice_idle; |
4036 | cfqd->cfq_group_idle = cfq_group_idle; | 4101 | cfqd->cfq_group_idle = cfq_group_idle; |
@@ -4041,11 +4106,35 @@ static int cfq_init_queue(struct request_queue *q) | |||
4041 | * second, in order to have larger depth for async operations. | 4106 | * second, in order to have larger depth for async operations. |
4042 | */ | 4107 | */ |
4043 | cfqd->last_delayed_sync = jiffies - HZ; | 4108 | cfqd->last_delayed_sync = jiffies - HZ; |
4044 | return 0; | 4109 | return cfqd; |
4110 | } | ||
4045 | 4111 | ||
4046 | out_free: | 4112 | static void cfq_slab_kill(void) |
4047 | kfree(cfqd); | 4113 | { |
4048 | return ret; | 4114 | /* |
4115 | * Caller already ensured that pending RCU callbacks are completed, | ||
4116 | * so we should have no busy allocations at this point. | ||
4117 | */ | ||
4118 | if (cfq_pool) | ||
4119 | kmem_cache_destroy(cfq_pool); | ||
4120 | if (cfq_ioc_pool) | ||
4121 | kmem_cache_destroy(cfq_ioc_pool); | ||
4122 | } | ||
4123 | |||
4124 | static int __init cfq_slab_setup(void) | ||
4125 | { | ||
4126 | cfq_pool = KMEM_CACHE(cfq_queue, 0); | ||
4127 | if (!cfq_pool) | ||
4128 | goto fail; | ||
4129 | |||
4130 | cfq_ioc_pool = KMEM_CACHE(cfq_io_context, 0); | ||
4131 | if (!cfq_ioc_pool) | ||
4132 | goto fail; | ||
4133 | |||
4134 | return 0; | ||
4135 | fail: | ||
4136 | cfq_slab_kill(); | ||
4137 | return -ENOMEM; | ||
4049 | } | 4138 | } |
4050 | 4139 | ||
4051 | /* | 4140 | /* |
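cfq_slab_setup()/cfq_slab_kill(), added in the hunk above, use the common paired-initialisation pattern: every allocation failure funnels into one cleanup routine that is safe to call on partially built state. A minimal userspace sketch of the same shape, with malloc'd buffers standing in for the kmem caches:

    #include <stdlib.h>

    static void *queue_pool;
    static void *ioc_pool;

    static void pools_kill(void)
    {
        free(queue_pool);   /* free(NULL) is a no-op, like the NULL checks above */
        free(ioc_pool);
        queue_pool = ioc_pool = NULL;
    }

    static int pools_setup(void)
    {
        queue_pool = malloc(256);
        if (!queue_pool)
            goto fail;
        ioc_pool = malloc(256);
        if (!ioc_pool)
            goto fail;
        return 0;
    fail:
        pools_kill();       /* one cleanup path shared by every failure point */
        return -1;
    }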
@@ -4086,7 +4175,6 @@ SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); | |||
4086 | SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); | 4175 | SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); |
4087 | SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); | 4176 | SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); |
4088 | SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); | 4177 | SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); |
4089 | SHOW_FUNCTION(cfq_target_latency_show, cfqd->cfq_target_latency, 1); | ||
4090 | #undef SHOW_FUNCTION | 4178 | #undef SHOW_FUNCTION |
4091 | 4179 | ||
4092 | #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ | 4180 | #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ |
@@ -4120,7 +4208,6 @@ STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); | |||
4120 | STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, | 4208 | STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, |
4121 | UINT_MAX, 0); | 4209 | UINT_MAX, 0); |
4122 | STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); | 4210 | STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); |
4123 | STORE_FUNCTION(cfq_target_latency_store, &cfqd->cfq_target_latency, 1, UINT_MAX, 1); | ||
4124 | #undef STORE_FUNCTION | 4211 | #undef STORE_FUNCTION |
4125 | 4212 | ||
4126 | #define CFQ_ATTR(name) \ | 4213 | #define CFQ_ATTR(name) \ |
@@ -4138,7 +4225,6 @@ static struct elv_fs_entry cfq_attrs[] = { | |||
4138 | CFQ_ATTR(slice_idle), | 4225 | CFQ_ATTR(slice_idle), |
4139 | CFQ_ATTR(group_idle), | 4226 | CFQ_ATTR(group_idle), |
4140 | CFQ_ATTR(low_latency), | 4227 | CFQ_ATTR(low_latency), |
4141 | CFQ_ATTR(target_latency), | ||
4142 | __ATTR_NULL | 4228 | __ATTR_NULL |
4143 | }; | 4229 | }; |
4144 | 4230 | ||
@@ -4156,35 +4242,32 @@ static struct elevator_type iosched_cfq = { | |||
4156 | .elevator_completed_req_fn = cfq_completed_request, | 4242 | .elevator_completed_req_fn = cfq_completed_request, |
4157 | .elevator_former_req_fn = elv_rb_former_request, | 4243 | .elevator_former_req_fn = elv_rb_former_request, |
4158 | .elevator_latter_req_fn = elv_rb_latter_request, | 4244 | .elevator_latter_req_fn = elv_rb_latter_request, |
4159 | .elevator_init_icq_fn = cfq_init_icq, | ||
4160 | .elevator_exit_icq_fn = cfq_exit_icq, | ||
4161 | .elevator_set_req_fn = cfq_set_request, | 4245 | .elevator_set_req_fn = cfq_set_request, |
4162 | .elevator_put_req_fn = cfq_put_request, | 4246 | .elevator_put_req_fn = cfq_put_request, |
4163 | .elevator_may_queue_fn = cfq_may_queue, | 4247 | .elevator_may_queue_fn = cfq_may_queue, |
4164 | .elevator_init_fn = cfq_init_queue, | 4248 | .elevator_init_fn = cfq_init_queue, |
4165 | .elevator_exit_fn = cfq_exit_queue, | 4249 | .elevator_exit_fn = cfq_exit_queue, |
4250 | .trim = cfq_free_io_context, | ||
4166 | }, | 4251 | }, |
4167 | .icq_size = sizeof(struct cfq_io_cq), | ||
4168 | .icq_align = __alignof__(struct cfq_io_cq), | ||
4169 | .elevator_attrs = cfq_attrs, | 4252 | .elevator_attrs = cfq_attrs, |
4170 | .elevator_name = "cfq", | 4253 | .elevator_name = "cfq", |
4171 | .elevator_owner = THIS_MODULE, | 4254 | .elevator_owner = THIS_MODULE, |
4172 | }; | 4255 | }; |
4173 | 4256 | ||
4174 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | 4257 | #ifdef CONFIG_CFQ_GROUP_IOSCHED |
4175 | static struct blkcg_policy blkcg_policy_cfq = { | 4258 | static struct blkio_policy_type blkio_policy_cfq = { |
4176 | .pd_size = sizeof(struct cfq_group), | 4259 | .ops = { |
4177 | .cftypes = cfq_blkcg_files, | 4260 | .blkio_unlink_group_fn = cfq_unlink_blkio_group, |
4178 | 4261 | .blkio_update_group_weight_fn = cfq_update_blkio_group_weight, | |
4179 | .pd_init_fn = cfq_pd_init, | 4262 | }, |
4180 | .pd_reset_stats_fn = cfq_pd_reset_stats, | 4263 | .plid = BLKIO_POLICY_PROP, |
4181 | }; | 4264 | }; |
4265 | #else | ||
4266 | static struct blkio_policy_type blkio_policy_cfq; | ||
4182 | #endif | 4267 | #endif |
4183 | 4268 | ||
4184 | static int __init cfq_init(void) | 4269 | static int __init cfq_init(void) |
4185 | { | 4270 | { |
4186 | int ret; | ||
4187 | |||
4188 | /* | 4271 | /* |
4189 | * could be 0 on HZ < 1000 setups | 4272 | * could be 0 on HZ < 1000 setups |
4190 | */ | 4273 | */ |
@@ -4196,41 +4279,35 @@ static int __init cfq_init(void) | |||
4196 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | 4279 | #ifdef CONFIG_CFQ_GROUP_IOSCHED |
4197 | if (!cfq_group_idle) | 4280 | if (!cfq_group_idle) |
4198 | cfq_group_idle = 1; | 4281 | cfq_group_idle = 1; |
4199 | |||
4200 | ret = blkcg_policy_register(&blkcg_policy_cfq); | ||
4201 | if (ret) | ||
4202 | return ret; | ||
4203 | #else | 4282 | #else |
4204 | cfq_group_idle = 0; | 4283 | cfq_group_idle = 0; |
4205 | #endif | 4284 | #endif |
4285 | if (cfq_slab_setup()) | ||
4286 | return -ENOMEM; | ||
4206 | 4287 | ||
4207 | ret = -ENOMEM; | 4288 | elv_register(&iosched_cfq); |
4208 | cfq_pool = KMEM_CACHE(cfq_queue, 0); | 4289 | blkio_policy_register(&blkio_policy_cfq); |
4209 | if (!cfq_pool) | ||
4210 | goto err_pol_unreg; | ||
4211 | |||
4212 | ret = elv_register(&iosched_cfq); | ||
4213 | if (ret) | ||
4214 | goto err_free_pool; | ||
4215 | 4290 | ||
4216 | return 0; | 4291 | return 0; |
4217 | |||
4218 | err_free_pool: | ||
4219 | kmem_cache_destroy(cfq_pool); | ||
4220 | err_pol_unreg: | ||
4221 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | ||
4222 | blkcg_policy_unregister(&blkcg_policy_cfq); | ||
4223 | #endif | ||
4224 | return ret; | ||
4225 | } | 4292 | } |
4226 | 4293 | ||
4227 | static void __exit cfq_exit(void) | 4294 | static void __exit cfq_exit(void) |
4228 | { | 4295 | { |
4229 | #ifdef CONFIG_CFQ_GROUP_IOSCHED | 4296 | DECLARE_COMPLETION_ONSTACK(all_gone); |
4230 | blkcg_policy_unregister(&blkcg_policy_cfq); | 4297 | blkio_policy_unregister(&blkio_policy_cfq); |
4231 | #endif | ||
4232 | elv_unregister(&iosched_cfq); | 4298 | elv_unregister(&iosched_cfq); |
4233 | kmem_cache_destroy(cfq_pool); | 4299 | ioc_gone = &all_gone; |
4300 | /* ioc_gone's update must be visible before reading ioc_count */ | ||
4301 | smp_wmb(); | ||
4302 | |||
4303 | /* | ||
4304 | * this also protects us from entering cfq_slab_kill() with | ||
4305 | * pending RCU callbacks | ||
4306 | */ | ||
4307 | if (elv_ioc_count_read(cfq_ioc_count)) | ||
4308 | wait_for_completion(&all_gone); | ||
4309 | ida_destroy(&cic_index_ida); | ||
4310 | cfq_slab_kill(); | ||
4234 | } | 4311 | } |
4235 | 4312 | ||
4236 | module_init(cfq_init); | 4313 | module_init(cfq_init); |
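cfq_exit(), on the right-hand side of the hunk above, publishes an on-stack completion through ioc_gone and then waits for the outstanding cfq_ioc_count to drain so that no pending RCU callback can still reach the slab caches being destroyed. The same wait-for-the-last-reference shape is sketched below with a condition variable in place of the completion/counter pair; all names are illustrative only:

    #include <pthread.h>

    static pthread_mutex_t ref_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t ref_gone = PTHREAD_COND_INITIALIZER;
    static int ref_count;
    static int exiting;

    static void put_ref(void)
    {
        pthread_mutex_lock(&ref_lock);
        if (--ref_count == 0 && exiting)
            pthread_cond_signal(&ref_gone);   /* last user wakes the exit path */
        pthread_mutex_unlock(&ref_lock);
    }

    static void module_exit_wait(void)
    {
        pthread_mutex_lock(&ref_lock);
        exiting = 1;                          /* plays the role of setting ioc_gone */
        while (ref_count)                     /* plays the role of wait_for_completion */
            pthread_cond_wait(&ref_gone, &ref_lock);
        pthread_mutex_unlock(&ref_lock);
    }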
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 7c668c8a6f9..7b725020823 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c | |||
@@ -719,9 +719,6 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) | |||
719 | case BLKSECTGET: | 719 | case BLKSECTGET: |
720 | return compat_put_ushort(arg, | 720 | return compat_put_ushort(arg, |
721 | queue_max_sectors(bdev_get_queue(bdev))); | 721 | queue_max_sectors(bdev_get_queue(bdev))); |
722 | case BLKROTATIONAL: | ||
723 | return compat_put_ushort(arg, | ||
724 | !blk_queue_nonrot(bdev_get_queue(bdev))); | ||
725 | case BLKRASET: /* compatible, but no compat_ptr (!) */ | 722 | case BLKRASET: /* compatible, but no compat_ptr (!) */ |
726 | case BLKFRASET: | 723 | case BLKFRASET: |
727 | if (!capable(CAP_SYS_ADMIN)) | 724 | if (!capable(CAP_SYS_ADMIN)) |
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index 90037b5eb17..c644137d9cd 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c | |||
@@ -230,7 +230,7 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) | |||
230 | /* | 230 | /* |
231 | * rq is expired! | 231 | * rq is expired! |
232 | */ | 232 | */ |
233 | if (time_after_eq(jiffies, rq_fifo_time(rq))) | 233 | if (time_after(jiffies, rq_fifo_time(rq))) |
234 | return 1; | 234 | return 1; |
235 | 235 | ||
236 | return 0; | 236 | return 0; |
@@ -337,13 +337,13 @@ static void deadline_exit_queue(struct elevator_queue *e) | |||
337 | /* | 337 | /* |
338 | * initialize elevator private data (deadline_data). | 338 | * initialize elevator private data (deadline_data). |
339 | */ | 339 | */ |
340 | static int deadline_init_queue(struct request_queue *q) | 340 | static void *deadline_init_queue(struct request_queue *q) |
341 | { | 341 | { |
342 | struct deadline_data *dd; | 342 | struct deadline_data *dd; |
343 | 343 | ||
344 | dd = kmalloc_node(sizeof(*dd), GFP_KERNEL | __GFP_ZERO, q->node); | 344 | dd = kmalloc_node(sizeof(*dd), GFP_KERNEL | __GFP_ZERO, q->node); |
345 | if (!dd) | 345 | if (!dd) |
346 | return -ENOMEM; | 346 | return NULL; |
347 | 347 | ||
348 | INIT_LIST_HEAD(&dd->fifo_list[READ]); | 348 | INIT_LIST_HEAD(&dd->fifo_list[READ]); |
349 | INIT_LIST_HEAD(&dd->fifo_list[WRITE]); | 349 | INIT_LIST_HEAD(&dd->fifo_list[WRITE]); |
@@ -354,9 +354,7 @@ static int deadline_init_queue(struct request_queue *q) | |||
354 | dd->writes_starved = writes_starved; | 354 | dd->writes_starved = writes_starved; |
355 | dd->front_merges = 1; | 355 | dd->front_merges = 1; |
356 | dd->fifo_batch = fifo_batch; | 356 | dd->fifo_batch = fifo_batch; |
357 | 357 | return dd; | |
358 | q->elevator->elevator_data = dd; | ||
359 | return 0; | ||
360 | } | 358 | } |
361 | 359 | ||
362 | /* | 360 | /* |
@@ -450,7 +448,9 @@ static struct elevator_type iosched_deadline = { | |||
450 | 448 | ||
451 | static int __init deadline_init(void) | 449 | static int __init deadline_init(void) |
452 | { | 450 | { |
453 | return elv_register(&iosched_deadline); | 451 | elv_register(&iosched_deadline); |
452 | |||
453 | return 0; | ||
454 | } | 454 | } |
455 | 455 | ||
456 | static void __exit deadline_exit(void) | 456 | static void __exit deadline_exit(void) |
diff --git a/block/elevator.c b/block/elevator.c index 9edba1b8323..a3b64bc71d8 100644 --- a/block/elevator.c +++ b/block/elevator.c | |||
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/slab.h> | 31 | #include <linux/slab.h> |
32 | #include <linux/init.h> | 32 | #include <linux/init.h> |
33 | #include <linux/compiler.h> | 33 | #include <linux/compiler.h> |
34 | #include <linux/delay.h> | ||
34 | #include <linux/blktrace_api.h> | 35 | #include <linux/blktrace_api.h> |
35 | #include <linux/hash.h> | 36 | #include <linux/hash.h> |
36 | #include <linux/uaccess.h> | 37 | #include <linux/uaccess.h> |
@@ -38,7 +39,6 @@ | |||
38 | #include <trace/events/block.h> | 39 | #include <trace/events/block.h> |
39 | 40 | ||
40 | #include "blk.h" | 41 | #include "blk.h" |
41 | #include "blk-cgroup.h" | ||
42 | 42 | ||
43 | static DEFINE_SPINLOCK(elv_list_lock); | 43 | static DEFINE_SPINLOCK(elv_list_lock); |
44 | static LIST_HEAD(elv_list); | 44 | static LIST_HEAD(elv_list); |
@@ -62,8 +62,8 @@ static int elv_iosched_allow_merge(struct request *rq, struct bio *bio) | |||
62 | struct request_queue *q = rq->q; | 62 | struct request_queue *q = rq->q; |
63 | struct elevator_queue *e = q->elevator; | 63 | struct elevator_queue *e = q->elevator; |
64 | 64 | ||
65 | if (e->type->ops.elevator_allow_merge_fn) | 65 | if (e->ops->elevator_allow_merge_fn) |
66 | return e->type->ops.elevator_allow_merge_fn(q, rq, bio); | 66 | return e->ops->elevator_allow_merge_fn(q, rq, bio); |
67 | 67 | ||
68 | return 1; | 68 | return 1; |
69 | } | 69 | } |
@@ -71,9 +71,39 @@ static int elv_iosched_allow_merge(struct request *rq, struct bio *bio) | |||
71 | /* | 71 | /* |
72 | * can we safely merge with this request? | 72 | * can we safely merge with this request? |
73 | */ | 73 | */ |
74 | bool elv_rq_merge_ok(struct request *rq, struct bio *bio) | 74 | int elv_rq_merge_ok(struct request *rq, struct bio *bio) |
75 | { | 75 | { |
76 | if (!blk_rq_merge_ok(rq, bio)) | 76 | if (!rq_mergeable(rq)) |
77 | return 0; | ||
78 | |||
79 | /* | ||
80 | * Don't merge file system requests and discard requests | ||
81 | */ | ||
82 | if ((bio->bi_rw & REQ_DISCARD) != (rq->bio->bi_rw & REQ_DISCARD)) | ||
83 | return 0; | ||
84 | |||
85 | /* | ||
86 | * Don't merge discard requests and secure discard requests | ||
87 | */ | ||
88 | if ((bio->bi_rw & REQ_SECURE) != (rq->bio->bi_rw & REQ_SECURE)) | ||
89 | return 0; | ||
90 | |||
91 | /* | ||
92 | * different data direction or already started, don't merge | ||
93 | */ | ||
94 | if (bio_data_dir(bio) != rq_data_dir(rq)) | ||
95 | return 0; | ||
96 | |||
97 | /* | ||
98 | * must be same device and not a special request | ||
99 | */ | ||
100 | if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special) | ||
101 | return 0; | ||
102 | |||
103 | /* | ||
104 | * only merge integrity protected bio into ditto rq | ||
105 | */ | ||
106 | if (bio_integrity(bio) != blk_integrity_rq(rq)) | ||
77 | return 0; | 107 | return 0; |
78 | 108 | ||
79 | if (!elv_iosched_allow_merge(rq, bio)) | 109 | if (!elv_iosched_allow_merge(rq, bio)) |
@@ -83,6 +113,23 @@ bool elv_rq_merge_ok(struct request *rq, struct bio *bio) | |||
83 | } | 113 | } |
84 | EXPORT_SYMBOL(elv_rq_merge_ok); | 114 | EXPORT_SYMBOL(elv_rq_merge_ok); |
85 | 115 | ||
116 | int elv_try_merge(struct request *__rq, struct bio *bio) | ||
117 | { | ||
118 | int ret = ELEVATOR_NO_MERGE; | ||
119 | |||
120 | /* | ||
121 | * we can merge and sequence is ok, check if it's possible | ||
122 | */ | ||
123 | if (elv_rq_merge_ok(__rq, bio)) { | ||
124 | if (blk_rq_pos(__rq) + blk_rq_sectors(__rq) == bio->bi_sector) | ||
125 | ret = ELEVATOR_BACK_MERGE; | ||
126 | else if (blk_rq_pos(__rq) - bio_sectors(bio) == bio->bi_sector) | ||
127 | ret = ELEVATOR_FRONT_MERGE; | ||
128 | } | ||
129 | |||
130 | return ret; | ||
131 | } | ||
132 | |||
86 | static struct elevator_type *elevator_find(const char *name) | 133 | static struct elevator_type *elevator_find(const char *name) |
87 | { | 134 | { |
88 | struct elevator_type *e; | 135 | struct elevator_type *e; |
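elv_try_merge(), added in the hunk above, decides between a back and a front merge with pure sector arithmetic: append the bio when it starts exactly where the request ends, prepend it when it ends exactly where the request starts. The same check restated as a standalone function; the enum and parameter names are illustrative, not block-layer types:

    #include <stdint.h>

    enum merge { NO_MERGE, BACK_MERGE, FRONT_MERGE };

    static enum merge try_merge(uint64_t rq_pos, uint32_t rq_sectors,
                                uint64_t bio_pos, uint32_t bio_sectors)
    {
        if (rq_pos + rq_sectors == bio_pos)
            return BACK_MERGE;    /* bio continues the request */
        if (rq_pos == bio_pos + bio_sectors)
            return FRONT_MERGE;   /* bio ends where the request begins */
        return NO_MERGE;
    }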
@@ -122,7 +169,20 @@ static struct elevator_type *elevator_get(const char *name) | |||
122 | return e; | 169 | return e; |
123 | } | 170 | } |
124 | 171 | ||
125 | static char chosen_elevator[ELV_NAME_MAX]; | 172 | static void *elevator_init_queue(struct request_queue *q, |
173 | struct elevator_queue *eq) | ||
174 | { | ||
175 | return eq->ops->elevator_init_fn(q); | ||
176 | } | ||
177 | |||
178 | static void elevator_attach(struct request_queue *q, struct elevator_queue *eq, | ||
179 | void *data) | ||
180 | { | ||
181 | q->elevator = eq; | ||
182 | eq->elevator_data = data; | ||
183 | } | ||
184 | |||
185 | static char chosen_elevator[16]; | ||
126 | 186 | ||
127 | static int __init elevator_setup(char *str) | 187 | static int __init elevator_setup(char *str) |
128 | { | 188 | { |
@@ -148,7 +208,8 @@ static struct elevator_queue *elevator_alloc(struct request_queue *q, | |||
148 | if (unlikely(!eq)) | 208 | if (unlikely(!eq)) |
149 | goto err; | 209 | goto err; |
150 | 210 | ||
151 | eq->type = e; | 211 | eq->ops = &e->ops; |
212 | eq->elevator_type = e; | ||
152 | kobject_init(&eq->kobj, &elv_ktype); | 213 | kobject_init(&eq->kobj, &elv_ktype); |
153 | mutex_init(&eq->sysfs_lock); | 214 | mutex_init(&eq->sysfs_lock); |
154 | 215 | ||
@@ -172,7 +233,7 @@ static void elevator_release(struct kobject *kobj) | |||
172 | struct elevator_queue *e; | 233 | struct elevator_queue *e; |
173 | 234 | ||
174 | e = container_of(kobj, struct elevator_queue, kobj); | 235 | e = container_of(kobj, struct elevator_queue, kobj); |
175 | elevator_put(e->type); | 236 | elevator_put(e->elevator_type); |
176 | kfree(e->hash); | 237 | kfree(e->hash); |
177 | kfree(e); | 238 | kfree(e); |
178 | } | 239 | } |
@@ -180,7 +241,8 @@ static void elevator_release(struct kobject *kobj) | |||
180 | int elevator_init(struct request_queue *q, char *name) | 241 | int elevator_init(struct request_queue *q, char *name) |
181 | { | 242 | { |
182 | struct elevator_type *e = NULL; | 243 | struct elevator_type *e = NULL; |
183 | int err; | 244 | struct elevator_queue *eq; |
245 | void *data; | ||
184 | 246 | ||
185 | if (unlikely(q->elevator)) | 247 | if (unlikely(q->elevator)) |
186 | return 0; | 248 | return 0; |
@@ -213,16 +275,17 @@ int elevator_init(struct request_queue *q, char *name) | |||
213 | } | 275 | } |
214 | } | 276 | } |
215 | 277 | ||
216 | q->elevator = elevator_alloc(q, e); | 278 | eq = elevator_alloc(q, e); |
217 | if (!q->elevator) | 279 | if (!eq) |
218 | return -ENOMEM; | 280 | return -ENOMEM; |
219 | 281 | ||
220 | err = e->ops.elevator_init_fn(q); | 282 | data = elevator_init_queue(q, eq); |
221 | if (err) { | 283 | if (!data) { |
222 | kobject_put(&q->elevator->kobj); | 284 | kobject_put(&eq->kobj); |
223 | return err; | 285 | return -ENOMEM; |
224 | } | 286 | } |
225 | 287 | ||
288 | elevator_attach(q, eq, data); | ||
226 | return 0; | 289 | return 0; |
227 | } | 290 | } |
228 | EXPORT_SYMBOL(elevator_init); | 291 | EXPORT_SYMBOL(elevator_init); |
@@ -230,8 +293,9 @@ EXPORT_SYMBOL(elevator_init); | |||
230 | void elevator_exit(struct elevator_queue *e) | 293 | void elevator_exit(struct elevator_queue *e) |
231 | { | 294 | { |
232 | mutex_lock(&e->sysfs_lock); | 295 | mutex_lock(&e->sysfs_lock); |
233 | if (e->type->ops.elevator_exit_fn) | 296 | if (e->ops->elevator_exit_fn) |
234 | e->type->ops.elevator_exit_fn(e); | 297 | e->ops->elevator_exit_fn(e); |
298 | e->ops = NULL; | ||
235 | mutex_unlock(&e->sysfs_lock); | 299 | mutex_unlock(&e->sysfs_lock); |
236 | 300 | ||
237 | kobject_put(&e->kobj); | 301 | kobject_put(&e->kobj); |
@@ -421,8 +485,8 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) | |||
421 | /* | 485 | /* |
422 | * First try one-hit cache. | 486 | * First try one-hit cache. |
423 | */ | 487 | */ |
424 | if (q->last_merge && elv_rq_merge_ok(q->last_merge, bio)) { | 488 | if (q->last_merge) { |
425 | ret = blk_try_merge(q->last_merge, bio); | 489 | ret = elv_try_merge(q->last_merge, bio); |
426 | if (ret != ELEVATOR_NO_MERGE) { | 490 | if (ret != ELEVATOR_NO_MERGE) { |
427 | *req = q->last_merge; | 491 | *req = q->last_merge; |
428 | return ret; | 492 | return ret; |
@@ -441,8 +505,8 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) | |||
441 | return ELEVATOR_BACK_MERGE; | 505 | return ELEVATOR_BACK_MERGE; |
442 | } | 506 | } |
443 | 507 | ||
444 | if (e->type->ops.elevator_merge_fn) | 508 | if (e->ops->elevator_merge_fn) |
445 | return e->type->ops.elevator_merge_fn(q, req, bio); | 509 | return e->ops->elevator_merge_fn(q, req, bio); |
446 | 510 | ||
447 | return ELEVATOR_NO_MERGE; | 511 | return ELEVATOR_NO_MERGE; |
448 | } | 512 | } |
@@ -458,7 +522,6 @@ static bool elv_attempt_insert_merge(struct request_queue *q, | |||
458 | struct request *rq) | 522 | struct request *rq) |
459 | { | 523 | { |
460 | struct request *__rq; | 524 | struct request *__rq; |
461 | bool ret; | ||
462 | 525 | ||
463 | if (blk_queue_nomerges(q)) | 526 | if (blk_queue_nomerges(q)) |
464 | return false; | 527 | return false; |
@@ -472,29 +535,22 @@ static bool elv_attempt_insert_merge(struct request_queue *q, | |||
472 | if (blk_queue_noxmerges(q)) | 535 | if (blk_queue_noxmerges(q)) |
473 | return false; | 536 | return false; |
474 | 537 | ||
475 | ret = false; | ||
476 | /* | 538 | /* |
477 | * See if our hash lookup can find a potential backmerge. | 539 | * See if our hash lookup can find a potential backmerge. |
478 | */ | 540 | */ |
479 | while (1) { | 541 | __rq = elv_rqhash_find(q, blk_rq_pos(rq)); |
480 | __rq = elv_rqhash_find(q, blk_rq_pos(rq)); | 542 | if (__rq && blk_attempt_req_merge(q, __rq, rq)) |
481 | if (!__rq || !blk_attempt_req_merge(q, __rq, rq)) | 543 | return true; |
482 | break; | ||
483 | |||
484 | /* The merged request could be merged with others, try again */ | ||
485 | ret = true; | ||
486 | rq = __rq; | ||
487 | } | ||
488 | 544 | ||
489 | return ret; | 545 | return false; |
490 | } | 546 | } |
491 | 547 | ||
492 | void elv_merged_request(struct request_queue *q, struct request *rq, int type) | 548 | void elv_merged_request(struct request_queue *q, struct request *rq, int type) |
493 | { | 549 | { |
494 | struct elevator_queue *e = q->elevator; | 550 | struct elevator_queue *e = q->elevator; |
495 | 551 | ||
496 | if (e->type->ops.elevator_merged_fn) | 552 | if (e->ops->elevator_merged_fn) |
497 | e->type->ops.elevator_merged_fn(q, rq, type); | 553 | e->ops->elevator_merged_fn(q, rq, type); |
498 | 554 | ||
499 | if (type == ELEVATOR_BACK_MERGE) | 555 | if (type == ELEVATOR_BACK_MERGE) |
500 | elv_rqhash_reposition(q, rq); | 556 | elv_rqhash_reposition(q, rq); |
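The left-hand column of the hunk above loops inside elv_attempt_insert_merge() because every successful back merge can expose yet another mergeable neighbour, while the right-hand column makes a single attempt. A toy illustration of the keep-merging-while-it-succeeds idea on a sorted extent list; the structures are stand-ins, and disposing of the absorbed node is left to whoever owns the allocations:

    #include <stdbool.h>

    struct extent { unsigned long start, len; struct extent *next; };

    /* merge e with the extent that starts right behind it, if any */
    static bool merge_one(struct extent *e)
    {
        struct extent *n = e->next;

        if (!n || e->start + e->len != n->start)
            return false;
        e->len += n->len;          /* absorb the neighbour */
        e->next = n->next;         /* unlink it; caller frees it if needed */
        return true;
    }

    static bool merge_chain(struct extent *e)
    {
        bool merged = false;

        while (merge_one(e))       /* each merge may enable the next one */
            merged = true;
        return merged;
    }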
@@ -508,8 +564,8 @@ void elv_merge_requests(struct request_queue *q, struct request *rq, | |||
508 | struct elevator_queue *e = q->elevator; | 564 | struct elevator_queue *e = q->elevator; |
509 | const int next_sorted = next->cmd_flags & REQ_SORTED; | 565 | const int next_sorted = next->cmd_flags & REQ_SORTED; |
510 | 566 | ||
511 | if (next_sorted && e->type->ops.elevator_merge_req_fn) | 567 | if (next_sorted && e->ops->elevator_merge_req_fn) |
512 | e->type->ops.elevator_merge_req_fn(q, rq, next); | 568 | e->ops->elevator_merge_req_fn(q, rq, next); |
513 | 569 | ||
514 | elv_rqhash_reposition(q, rq); | 570 | elv_rqhash_reposition(q, rq); |
515 | 571 | ||
@@ -526,8 +582,8 @@ void elv_bio_merged(struct request_queue *q, struct request *rq, | |||
526 | { | 582 | { |
527 | struct elevator_queue *e = q->elevator; | 583 | struct elevator_queue *e = q->elevator; |
528 | 584 | ||
529 | if (e->type->ops.elevator_bio_merged_fn) | 585 | if (e->ops->elevator_bio_merged_fn) |
530 | e->type->ops.elevator_bio_merged_fn(q, rq, bio); | 586 | e->ops->elevator_bio_merged_fn(q, rq, bio); |
531 | } | 587 | } |
532 | 588 | ||
533 | void elv_requeue_request(struct request_queue *q, struct request *rq) | 589 | void elv_requeue_request(struct request_queue *q, struct request *rq) |
@@ -550,18 +606,45 @@ void elv_requeue_request(struct request_queue *q, struct request *rq) | |||
550 | void elv_drain_elevator(struct request_queue *q) | 606 | void elv_drain_elevator(struct request_queue *q) |
551 | { | 607 | { |
552 | static int printed; | 608 | static int printed; |
553 | 609 | while (q->elevator->ops->elevator_dispatch_fn(q, 1)) | |
554 | lockdep_assert_held(q->queue_lock); | ||
555 | |||
556 | while (q->elevator->type->ops.elevator_dispatch_fn(q, 1)) | ||
557 | ; | 610 | ; |
558 | if (q->nr_sorted && printed++ < 10) { | 611 | if (q->nr_sorted == 0) |
612 | return; | ||
613 | if (printed++ < 10) { | ||
559 | printk(KERN_ERR "%s: forced dispatching is broken " | 614 | printk(KERN_ERR "%s: forced dispatching is broken " |
560 | "(nr_sorted=%u), please report this\n", | 615 | "(nr_sorted=%u), please report this\n", |
561 | q->elevator->type->elevator_name, q->nr_sorted); | 616 | q->elevator->elevator_type->elevator_name, q->nr_sorted); |
617 | } | ||
618 | } | ||
619 | |||
620 | /* | ||
621 | * Call with queue lock held, interrupts disabled | ||
622 | */ | ||
623 | void elv_quiesce_start(struct request_queue *q) | ||
624 | { | ||
625 | if (!q->elevator) | ||
626 | return; | ||
627 | |||
628 | queue_flag_set(QUEUE_FLAG_ELVSWITCH, q); | ||
629 | |||
630 | /* | ||
631 | * make sure we don't have any requests in flight | ||
632 | */ | ||
633 | elv_drain_elevator(q); | ||
634 | while (q->rq.elvpriv) { | ||
635 | __blk_run_queue(q); | ||
636 | spin_unlock_irq(q->queue_lock); | ||
637 | msleep(10); | ||
638 | spin_lock_irq(q->queue_lock); | ||
639 | elv_drain_elevator(q); | ||
562 | } | 640 | } |
563 | } | 641 | } |
564 | 642 | ||
643 | void elv_quiesce_end(struct request_queue *q) | ||
644 | { | ||
645 | queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); | ||
646 | } | ||
647 | |||
565 | void __elv_add_request(struct request_queue *q, struct request *rq, int where) | 648 | void __elv_add_request(struct request_queue *q, struct request *rq, int where) |
566 | { | 649 | { |
567 | trace_block_rq_insert(q, rq); | 650 | trace_block_rq_insert(q, rq); |
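[Editor's note] elv_quiesce_start() on the right drains by repeatedly dropping the queue lock, sleeping 10 ms, retaking the lock and re-checking q->rq.elvpriv. A rough user-space analogue of that drain pattern with pthreads (the 10 ms interval mirrors the hunk; everything else, including the names, is illustrative; build with -pthread):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int in_flight = 3;	/* requests still holding elevator data */
static int bypass;		/* stand-in for QUEUE_FLAG_ELVSWITCH */

static void *completion_thread(void *arg)
{
	int i;

	for (i = 0; i < 3; i++) {
		usleep(20000);		/* pretend the hardware finishes one */
		pthread_mutex_lock(&lock);
		in_flight--;
		pthread_mutex_unlock(&lock);
	}
	return NULL;
}

/* Analogue of elv_quiesce_start(): set bypass, then drain while sleeping. */
static void quiesce_start(void)
{
	pthread_mutex_lock(&lock);
	bypass = 1;
	while (in_flight) {
		pthread_mutex_unlock(&lock);	/* drop the "queue lock" ... */
		usleep(10000);			/* ... sleep 10 ms ... */
		pthread_mutex_lock(&lock);	/* ... and re-check */
	}
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, completion_thread, NULL);
	quiesce_start();
	printf("drained, bypass=%d, in_flight=%d\n", bypass, in_flight);
	pthread_join(t, NULL);
	return 0;
}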
@@ -570,7 +653,8 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where) | |||
570 | 653 | ||
571 | if (rq->cmd_flags & REQ_SOFTBARRIER) { | 654 | if (rq->cmd_flags & REQ_SOFTBARRIER) { |
572 | /* barriers are scheduling boundary, update end_sector */ | 655 | /* barriers are scheduling boundary, update end_sector */ |
573 | if (rq->cmd_type == REQ_TYPE_FS) { | 656 | if (rq->cmd_type == REQ_TYPE_FS || |
657 | (rq->cmd_flags & REQ_DISCARD)) { | ||
574 | q->end_sector = rq_end_sector(rq); | 658 | q->end_sector = rq_end_sector(rq); |
575 | q->boundary_rq = rq; | 659 | q->boundary_rq = rq; |
576 | } | 660 | } |
@@ -612,7 +696,8 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where) | |||
612 | if (elv_attempt_insert_merge(q, rq)) | 696 | if (elv_attempt_insert_merge(q, rq)) |
613 | break; | 697 | break; |
614 | case ELEVATOR_INSERT_SORT: | 698 | case ELEVATOR_INSERT_SORT: |
615 | BUG_ON(rq->cmd_type != REQ_TYPE_FS); | 699 | BUG_ON(rq->cmd_type != REQ_TYPE_FS && |
700 | !(rq->cmd_flags & REQ_DISCARD)); | ||
616 | rq->cmd_flags |= REQ_SORTED; | 701 | rq->cmd_flags |= REQ_SORTED; |
617 | q->nr_sorted++; | 702 | q->nr_sorted++; |
618 | if (rq_mergeable(rq)) { | 703 | if (rq_mergeable(rq)) { |
@@ -626,7 +711,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where) | |||
626 | * rq cannot be accessed after calling | 711 | * rq cannot be accessed after calling |
627 | * elevator_add_req_fn. | 712 | * elevator_add_req_fn. |
628 | */ | 713 | */ |
629 | q->elevator->type->ops.elevator_add_req_fn(q, rq); | 714 | q->elevator->ops->elevator_add_req_fn(q, rq); |
630 | break; | 715 | break; |
631 | 716 | ||
632 | case ELEVATOR_INSERT_FLUSH: | 717 | case ELEVATOR_INSERT_FLUSH: |
@@ -655,8 +740,8 @@ struct request *elv_latter_request(struct request_queue *q, struct request *rq) | |||
655 | { | 740 | { |
656 | struct elevator_queue *e = q->elevator; | 741 | struct elevator_queue *e = q->elevator; |
657 | 742 | ||
658 | if (e->type->ops.elevator_latter_req_fn) | 743 | if (e->ops->elevator_latter_req_fn) |
659 | return e->type->ops.elevator_latter_req_fn(q, rq); | 744 | return e->ops->elevator_latter_req_fn(q, rq); |
660 | return NULL; | 745 | return NULL; |
661 | } | 746 | } |
662 | 747 | ||
@@ -664,18 +749,19 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq) | |||
664 | { | 749 | { |
665 | struct elevator_queue *e = q->elevator; | 750 | struct elevator_queue *e = q->elevator; |
666 | 751 | ||
667 | if (e->type->ops.elevator_former_req_fn) | 752 | if (e->ops->elevator_former_req_fn) |
668 | return e->type->ops.elevator_former_req_fn(q, rq); | 753 | return e->ops->elevator_former_req_fn(q, rq); |
669 | return NULL; | 754 | return NULL; |
670 | } | 755 | } |
671 | 756 | ||
672 | int elv_set_request(struct request_queue *q, struct request *rq, | 757 | int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) |
673 | struct bio *bio, gfp_t gfp_mask) | ||
674 | { | 758 | { |
675 | struct elevator_queue *e = q->elevator; | 759 | struct elevator_queue *e = q->elevator; |
676 | 760 | ||
677 | if (e->type->ops.elevator_set_req_fn) | 761 | if (e->ops->elevator_set_req_fn) |
678 | return e->type->ops.elevator_set_req_fn(q, rq, bio, gfp_mask); | 762 | return e->ops->elevator_set_req_fn(q, rq, gfp_mask); |
763 | |||
764 | rq->elevator_private[0] = NULL; | ||
679 | return 0; | 765 | return 0; |
680 | } | 766 | } |
681 | 767 | ||
@@ -683,16 +769,16 @@ void elv_put_request(struct request_queue *q, struct request *rq) | |||
683 | { | 769 | { |
684 | struct elevator_queue *e = q->elevator; | 770 | struct elevator_queue *e = q->elevator; |
685 | 771 | ||
686 | if (e->type->ops.elevator_put_req_fn) | 772 | if (e->ops->elevator_put_req_fn) |
687 | e->type->ops.elevator_put_req_fn(rq); | 773 | e->ops->elevator_put_req_fn(rq); |
688 | } | 774 | } |
689 | 775 | ||
690 | int elv_may_queue(struct request_queue *q, int rw) | 776 | int elv_may_queue(struct request_queue *q, int rw) |
691 | { | 777 | { |
692 | struct elevator_queue *e = q->elevator; | 778 | struct elevator_queue *e = q->elevator; |
693 | 779 | ||
694 | if (e->type->ops.elevator_may_queue_fn) | 780 | if (e->ops->elevator_may_queue_fn) |
695 | return e->type->ops.elevator_may_queue_fn(q, rw); | 781 | return e->ops->elevator_may_queue_fn(q, rw); |
696 | 782 | ||
697 | return ELV_MQUEUE_MAY; | 783 | return ELV_MQUEUE_MAY; |
698 | } | 784 | } |
@@ -727,8 +813,8 @@ void elv_completed_request(struct request_queue *q, struct request *rq) | |||
727 | if (blk_account_rq(rq)) { | 813 | if (blk_account_rq(rq)) { |
728 | q->in_flight[rq_is_sync(rq)]--; | 814 | q->in_flight[rq_is_sync(rq)]--; |
729 | if ((rq->cmd_flags & REQ_SORTED) && | 815 | if ((rq->cmd_flags & REQ_SORTED) && |
730 | e->type->ops.elevator_completed_req_fn) | 816 | e->ops->elevator_completed_req_fn) |
731 | e->type->ops.elevator_completed_req_fn(q, rq); | 817 | e->ops->elevator_completed_req_fn(q, rq); |
732 | } | 818 | } |
733 | } | 819 | } |
734 | 820 | ||
@@ -746,7 +832,7 @@ elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page) | |||
746 | 832 | ||
747 | e = container_of(kobj, struct elevator_queue, kobj); | 833 | e = container_of(kobj, struct elevator_queue, kobj); |
748 | mutex_lock(&e->sysfs_lock); | 834 | mutex_lock(&e->sysfs_lock); |
749 | error = e->type ? entry->show(e, page) : -ENOENT; | 835 | error = e->ops ? entry->show(e, page) : -ENOENT; |
750 | mutex_unlock(&e->sysfs_lock); | 836 | mutex_unlock(&e->sysfs_lock); |
751 | return error; | 837 | return error; |
752 | } | 838 | } |
@@ -764,7 +850,7 @@ elv_attr_store(struct kobject *kobj, struct attribute *attr, | |||
764 | 850 | ||
765 | e = container_of(kobj, struct elevator_queue, kobj); | 851 | e = container_of(kobj, struct elevator_queue, kobj); |
766 | mutex_lock(&e->sysfs_lock); | 852 | mutex_lock(&e->sysfs_lock); |
767 | error = e->type ? entry->store(e, page, length) : -ENOENT; | 853 | error = e->ops ? entry->store(e, page, length) : -ENOENT; |
768 | mutex_unlock(&e->sysfs_lock); | 854 | mutex_unlock(&e->sysfs_lock); |
769 | return error; | 855 | return error; |
770 | } | 856 | } |
@@ -786,7 +872,7 @@ int elv_register_queue(struct request_queue *q) | |||
786 | 872 | ||
787 | error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched"); | 873 | error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched"); |
788 | if (!error) { | 874 | if (!error) { |
789 | struct elv_fs_entry *attr = e->type->elevator_attrs; | 875 | struct elv_fs_entry *attr = e->elevator_type->elevator_attrs; |
790 | if (attr) { | 876 | if (attr) { |
791 | while (attr->attr.name) { | 877 | while (attr->attr.name) { |
792 | if (sysfs_create_file(&e->kobj, &attr->attr)) | 878 | if (sysfs_create_file(&e->kobj, &attr->attr)) |
@@ -801,48 +887,29 @@ int elv_register_queue(struct request_queue *q) | |||
801 | } | 887 | } |
802 | EXPORT_SYMBOL(elv_register_queue); | 888 | EXPORT_SYMBOL(elv_register_queue); |
803 | 889 | ||
804 | void elv_unregister_queue(struct request_queue *q) | 890 | static void __elv_unregister_queue(struct elevator_queue *e) |
805 | { | 891 | { |
806 | if (q) { | 892 | kobject_uevent(&e->kobj, KOBJ_REMOVE); |
807 | struct elevator_queue *e = q->elevator; | 893 | kobject_del(&e->kobj); |
894 | e->registered = 0; | ||
895 | } | ||
808 | 896 | ||
809 | kobject_uevent(&e->kobj, KOBJ_REMOVE); | 897 | void elv_unregister_queue(struct request_queue *q) |
810 | kobject_del(&e->kobj); | 898 | { |
811 | e->registered = 0; | 899 | if (q) |
812 | } | 900 | __elv_unregister_queue(q->elevator); |
813 | } | 901 | } |
814 | EXPORT_SYMBOL(elv_unregister_queue); | 902 | EXPORT_SYMBOL(elv_unregister_queue); |
815 | 903 | ||
816 | int elv_register(struct elevator_type *e) | 904 | void elv_register(struct elevator_type *e) |
817 | { | 905 | { |
818 | char *def = ""; | 906 | char *def = ""; |
819 | 907 | ||
820 | /* create icq_cache if requested */ | ||
821 | if (e->icq_size) { | ||
822 | if (WARN_ON(e->icq_size < sizeof(struct io_cq)) || | ||
823 | WARN_ON(e->icq_align < __alignof__(struct io_cq))) | ||
824 | return -EINVAL; | ||
825 | |||
826 | snprintf(e->icq_cache_name, sizeof(e->icq_cache_name), | ||
827 | "%s_io_cq", e->elevator_name); | ||
828 | e->icq_cache = kmem_cache_create(e->icq_cache_name, e->icq_size, | ||
829 | e->icq_align, 0, NULL); | ||
830 | if (!e->icq_cache) | ||
831 | return -ENOMEM; | ||
832 | } | ||
833 | |||
834 | /* register, don't allow duplicate names */ | ||
835 | spin_lock(&elv_list_lock); | 908 | spin_lock(&elv_list_lock); |
836 | if (elevator_find(e->elevator_name)) { | 909 | BUG_ON(elevator_find(e->elevator_name)); |
837 | spin_unlock(&elv_list_lock); | ||
838 | if (e->icq_cache) | ||
839 | kmem_cache_destroy(e->icq_cache); | ||
840 | return -EBUSY; | ||
841 | } | ||
842 | list_add_tail(&e->list, &elv_list); | 910 | list_add_tail(&e->list, &elv_list); |
843 | spin_unlock(&elv_list_lock); | 911 | spin_unlock(&elv_list_lock); |
844 | 912 | ||
845 | /* print pretty message */ | ||
846 | if (!strcmp(e->elevator_name, chosen_elevator) || | 913 | if (!strcmp(e->elevator_name, chosen_elevator) || |
847 | (!*chosen_elevator && | 914 | (!*chosen_elevator && |
848 | !strcmp(e->elevator_name, CONFIG_DEFAULT_IOSCHED))) | 915 | !strcmp(e->elevator_name, CONFIG_DEFAULT_IOSCHED))) |
@@ -850,26 +917,30 @@ int elv_register(struct elevator_type *e) | |||
850 | 917 | ||
851 | printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name, | 918 | printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name, |
852 | def); | 919 | def); |
853 | return 0; | ||
854 | } | 920 | } |
855 | EXPORT_SYMBOL_GPL(elv_register); | 921 | EXPORT_SYMBOL_GPL(elv_register); |
856 | 922 | ||
857 | void elv_unregister(struct elevator_type *e) | 923 | void elv_unregister(struct elevator_type *e) |
858 | { | 924 | { |
859 | /* unregister */ | 925 | struct task_struct *g, *p; |
860 | spin_lock(&elv_list_lock); | ||
861 | list_del_init(&e->list); | ||
862 | spin_unlock(&elv_list_lock); | ||
863 | 926 | ||
864 | /* | 927 | /* |
865 | * Destroy icq_cache if it exists. icq's are RCU managed. Make | 928 | * Iterate every thread in the process to remove the io contexts. |
866 | * sure all RCU operations are complete before proceeding. | ||
867 | */ | 929 | */ |
868 | if (e->icq_cache) { | 930 | if (e->ops.trim) { |
869 | rcu_barrier(); | 931 | read_lock(&tasklist_lock); |
870 | kmem_cache_destroy(e->icq_cache); | 932 | do_each_thread(g, p) { |
871 | e->icq_cache = NULL; | 933 | task_lock(p); |
934 | if (p->io_context) | ||
935 | e->ops.trim(p->io_context); | ||
936 | task_unlock(p); | ||
937 | } while_each_thread(g, p); | ||
938 | read_unlock(&tasklist_lock); | ||
872 | } | 939 | } |
940 | |||
941 | spin_lock(&elv_list_lock); | ||
942 | list_del_init(&e->list); | ||
943 | spin_unlock(&elv_list_lock); | ||
873 | } | 944 | } |
874 | EXPORT_SYMBOL_GPL(elv_unregister); | 945 | EXPORT_SYMBOL_GPL(elv_unregister); |
875 | 946 | ||
@@ -881,60 +952,73 @@ EXPORT_SYMBOL_GPL(elv_unregister); | |||
881 | */ | 952 | */ |
882 | static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) | 953 | static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) |
883 | { | 954 | { |
884 | struct elevator_queue *old = q->elevator; | 955 | struct elevator_queue *old_elevator, *e; |
885 | bool registered = old->registered; | 956 | void *data; |
886 | int err; | 957 | int err; |
887 | 958 | ||
888 | /* | 959 | /* |
889 | * Turn on BYPASS and drain all requests w/ elevator private data. | 960 | * Allocate new elevator |
890 | * Block layer doesn't call into a quiesced elevator - all requests | ||
891 | * are directly put on the dispatch list without elevator data | ||
892 | * using INSERT_BACK. All requests have SOFTBARRIER set and no | ||
893 | * merge happens either. | ||
894 | */ | 961 | */ |
895 | blk_queue_bypass_start(q); | 962 | e = elevator_alloc(q, new_e); |
963 | if (!e) | ||
964 | return -ENOMEM; | ||
896 | 965 | ||
897 | /* unregister and clear all auxiliary data of the old elevator */ | 966 | data = elevator_init_queue(q, e); |
898 | if (registered) | 967 | if (!data) { |
899 | elv_unregister_queue(q); | 968 | kobject_put(&e->kobj); |
969 | return -ENOMEM; | ||
970 | } | ||
900 | 971 | ||
972 | /* | ||
973 | * Turn on BYPASS and drain all requests w/ elevator private data | ||
974 | */ | ||
901 | spin_lock_irq(q->queue_lock); | 975 | spin_lock_irq(q->queue_lock); |
902 | ioc_clear_queue(q); | 976 | elv_quiesce_start(q); |
903 | spin_unlock_irq(q->queue_lock); | ||
904 | 977 | ||
905 | /* allocate, init and register new elevator */ | 978 | /* |
906 | err = -ENOMEM; | 979 | * Remember old elevator. |
907 | q->elevator = elevator_alloc(q, new_e); | 980 | */ |
908 | if (!q->elevator) | 981 | old_elevator = q->elevator; |
909 | goto fail_init; | ||
910 | 982 | ||
911 | err = new_e->ops.elevator_init_fn(q); | 983 | /* |
912 | if (err) { | 984 | * attach and start new elevator |
913 | kobject_put(&q->elevator->kobj); | 985 | */ |
914 | goto fail_init; | 986 | elevator_attach(q, e, data); |
915 | } | 987 | |
988 | spin_unlock_irq(q->queue_lock); | ||
989 | |||
990 | if (old_elevator->registered) { | ||
991 | __elv_unregister_queue(old_elevator); | ||
916 | 992 | ||
917 | if (registered) { | ||
918 | err = elv_register_queue(q); | 993 | err = elv_register_queue(q); |
919 | if (err) | 994 | if (err) |
920 | goto fail_register; | 995 | goto fail_register; |
921 | } | 996 | } |
922 | 997 | ||
923 | /* done, kill the old one and finish */ | 998 | /* |
924 | elevator_exit(old); | 999 | * finally exit old elevator and turn off BYPASS. |
925 | blk_queue_bypass_end(q); | 1000 | */ |
1001 | elevator_exit(old_elevator); | ||
1002 | spin_lock_irq(q->queue_lock); | ||
1003 | elv_quiesce_end(q); | ||
1004 | spin_unlock_irq(q->queue_lock); | ||
926 | 1005 | ||
927 | blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); | 1006 | blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name); |
928 | 1007 | ||
929 | return 0; | 1008 | return 0; |
930 | 1009 | ||
931 | fail_register: | 1010 | fail_register: |
932 | elevator_exit(q->elevator); | 1011 | /* |
933 | fail_init: | 1012 | * switch failed, exit the new io scheduler and reattach the old |
934 | /* switch failed, restore and re-register old elevator */ | 1013 | * one again (along with re-adding the sysfs dir) |
935 | q->elevator = old; | 1014 | */ |
1015 | elevator_exit(e); | ||
1016 | q->elevator = old_elevator; | ||
936 | elv_register_queue(q); | 1017 | elv_register_queue(q); |
937 | blk_queue_bypass_end(q); | 1018 | |
1019 | spin_lock_irq(q->queue_lock); | ||
1020 | queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); | ||
1021 | spin_unlock_irq(q->queue_lock); | ||
938 | 1022 | ||
939 | return err; | 1023 | return err; |
940 | } | 1024 | } |
@@ -957,7 +1041,7 @@ int elevator_change(struct request_queue *q, const char *name) | |||
957 | return -EINVAL; | 1041 | return -EINVAL; |
958 | } | 1042 | } |
959 | 1043 | ||
960 | if (!strcmp(elevator_name, q->elevator->type->elevator_name)) { | 1044 | if (!strcmp(elevator_name, q->elevator->elevator_type->elevator_name)) { |
961 | elevator_put(e); | 1045 | elevator_put(e); |
962 | return 0; | 1046 | return 0; |
963 | } | 1047 | } |
@@ -992,7 +1076,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) | |||
992 | if (!q->elevator || !blk_queue_stackable(q)) | 1076 | if (!q->elevator || !blk_queue_stackable(q)) |
993 | return sprintf(name, "none\n"); | 1077 | return sprintf(name, "none\n"); |
994 | 1078 | ||
995 | elv = e->type; | 1079 | elv = e->elevator_type; |
996 | 1080 | ||
997 | spin_lock(&elv_list_lock); | 1081 | spin_lock(&elv_list_lock); |
998 | list_for_each_entry(__e, &elv_list, list) { | 1082 | list_for_each_entry(__e, &elv_list, list) { |
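[Editor's note] The elevator_change() and elv_iosched_show() paths in the hunks above back the per-queue scheduler attribute in sysfs. A small user-space check of that interface; the device name sda is a placeholder, and writing a new name requires root:

#include <stdio.h>

/* Placeholder device; adjust to a disk that exists on the system. */
#define SCHED_ATTR "/sys/block/sda/queue/scheduler"

int main(int argc, char **argv)
{
	char line[256];
	FILE *f = fopen(SCHED_ATTR, "r");

	if (!f) {
		perror(SCHED_ATTR);
		return 1;
	}
	/* elv_iosched_show() lists every registered elevator and brackets
	 * the active one, e.g. "noop deadline [cfq]". */
	if (fgets(line, sizeof(line), f))
		printf("available/current: %s", line);
	fclose(f);

	if (argc > 1) {
		/* Writing a name goes through elevator_change(). */
		f = fopen(SCHED_ATTR, "w");
		if (!f) {
			perror(SCHED_ATTR);
			return 1;
		}
		fprintf(f, "%s\n", argv[1]);
		if (fclose(f) != 0) {	/* EINVAL here means unknown elevator */
			perror("switch failed");
			return 1;
		}
		printf("requested switch to %s\n", argv[1]);
	}
	return 0;
}

Switching through this file is what exercises elevator_switch() above, including its quiesce/bypass handling and the fallback to the old elevator when registration of the new one fails.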
diff --git a/block/genhd.c b/block/genhd.c index 9a289d7c84b..d3834710b95 100644 --- a/block/genhd.c +++ b/block/genhd.c | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/slab.h> | 15 | #include <linux/slab.h> |
16 | #include <linux/kmod.h> | 16 | #include <linux/kmod.h> |
17 | #include <linux/kobj_map.h> | 17 | #include <linux/kobj_map.h> |
18 | #include <linux/buffer_head.h> | ||
18 | #include <linux/mutex.h> | 19 | #include <linux/mutex.h> |
19 | #include <linux/idr.h> | 20 | #include <linux/idr.h> |
20 | #include <linux/log2.h> | 21 | #include <linux/log2.h> |
@@ -35,7 +36,6 @@ static DEFINE_IDR(ext_devt_idr); | |||
35 | 36 | ||
36 | static struct device_type disk_type; | 37 | static struct device_type disk_type; |
37 | 38 | ||
38 | static void disk_alloc_events(struct gendisk *disk); | ||
39 | static void disk_add_events(struct gendisk *disk); | 39 | static void disk_add_events(struct gendisk *disk); |
40 | static void disk_del_events(struct gendisk *disk); | 40 | static void disk_del_events(struct gendisk *disk); |
41 | static void disk_release_events(struct gendisk *disk); | 41 | static void disk_release_events(struct gendisk *disk); |
@@ -154,7 +154,7 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) | |||
154 | part = rcu_dereference(ptbl->part[piter->idx]); | 154 | part = rcu_dereference(ptbl->part[piter->idx]); |
155 | if (!part) | 155 | if (!part) |
156 | continue; | 156 | continue; |
157 | if (!part_nr_sects_read(part) && | 157 | if (!part->nr_sects && |
158 | !(piter->flags & DISK_PITER_INCL_EMPTY) && | 158 | !(piter->flags & DISK_PITER_INCL_EMPTY) && |
159 | !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 && | 159 | !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 && |
160 | piter->idx == 0)) | 160 | piter->idx == 0)) |
@@ -191,7 +191,7 @@ EXPORT_SYMBOL_GPL(disk_part_iter_exit); | |||
191 | static inline int sector_in_part(struct hd_struct *part, sector_t sector) | 191 | static inline int sector_in_part(struct hd_struct *part, sector_t sector) |
192 | { | 192 | { |
193 | return part->start_sect <= sector && | 193 | return part->start_sect <= sector && |
194 | sector < part->start_sect + part_nr_sects_read(part); | 194 | sector < part->start_sect + part->nr_sects; |
195 | } | 195 | } |
196 | 196 | ||
197 | /** | 197 | /** |
@@ -507,7 +507,7 @@ static int exact_lock(dev_t devt, void *data) | |||
507 | return 0; | 507 | return 0; |
508 | } | 508 | } |
509 | 509 | ||
510 | static void register_disk(struct gendisk *disk) | 510 | void register_disk(struct gendisk *disk) |
511 | { | 511 | { |
512 | struct device *ddev = disk_to_dev(disk); | 512 | struct device *ddev = disk_to_dev(disk); |
513 | struct block_device *bdev; | 513 | struct block_device *bdev; |
@@ -536,7 +536,7 @@ static void register_disk(struct gendisk *disk) | |||
536 | disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); | 536 | disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); |
537 | 537 | ||
538 | /* No minors to use for partitions */ | 538 | /* No minors to use for partitions */ |
539 | if (!disk_part_scan_enabled(disk)) | 539 | if (!disk_partitionable(disk)) |
540 | goto exit; | 540 | goto exit; |
541 | 541 | ||
542 | /* No such device (e.g., media were just removed) */ | 542 | /* No such device (e.g., media were just removed) */ |
@@ -602,8 +602,6 @@ void add_disk(struct gendisk *disk) | |||
602 | disk->major = MAJOR(devt); | 602 | disk->major = MAJOR(devt); |
603 | disk->first_minor = MINOR(devt); | 603 | disk->first_minor = MINOR(devt); |
604 | 604 | ||
605 | disk_alloc_events(disk); | ||
606 | |||
607 | /* Register BDI before referencing it from bdev */ | 605 | /* Register BDI before referencing it from bdev */ |
608 | bdi = &disk->queue->backing_dev_info; | 606 | bdi = &disk->queue->backing_dev_info; |
609 | bdi_register_dev(bdi, disk_devt(disk)); | 607 | bdi_register_dev(bdi, disk_devt(disk)); |
@@ -617,7 +615,7 @@ void add_disk(struct gendisk *disk) | |||
617 | * Take an extra ref on queue which will be put on disk_release() | 615 | * Take an extra ref on queue which will be put on disk_release() |
618 | * so that it sticks around as long as @disk is there. | 616 | * so that it sticks around as long as @disk is there. |
619 | */ | 617 | */ |
620 | WARN_ON_ONCE(!blk_get_queue(disk->queue)); | 618 | WARN_ON_ONCE(blk_get_queue(disk->queue)); |
621 | 619 | ||
622 | retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, | 620 | retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, |
623 | "bdi"); | 621 | "bdi"); |
@@ -743,6 +741,7 @@ void __init printk_all_partitions(void) | |||
743 | struct hd_struct *part; | 741 | struct hd_struct *part; |
744 | char name_buf[BDEVNAME_SIZE]; | 742 | char name_buf[BDEVNAME_SIZE]; |
745 | char devt_buf[BDEVT_SIZE]; | 743 | char devt_buf[BDEVT_SIZE]; |
744 | u8 uuid[PARTITION_META_INFO_UUIDLTH * 2 + 1]; | ||
746 | 745 | ||
747 | /* | 746 | /* |
748 | * Don't show empty devices or things that have been | 747 | * Don't show empty devices or things that have been |
@@ -761,11 +760,14 @@ void __init printk_all_partitions(void) | |||
761 | while ((part = disk_part_iter_next(&piter))) { | 760 | while ((part = disk_part_iter_next(&piter))) { |
762 | bool is_part0 = part == &disk->part0; | 761 | bool is_part0 = part == &disk->part0; |
763 | 762 | ||
763 | uuid[0] = 0; | ||
764 | if (part->info) | ||
765 | part_unpack_uuid(part->info->uuid, uuid); | ||
766 | |||
764 | printk("%s%s %10llu %s %s", is_part0 ? "" : " ", | 767 | printk("%s%s %10llu %s %s", is_part0 ? "" : " ", |
765 | bdevt_str(part_devt(part), devt_buf), | 768 | bdevt_str(part_devt(part), devt_buf), |
766 | (unsigned long long)part_nr_sects_read(part) >> 1 | 769 | (unsigned long long)part->nr_sects >> 1, |
767 | , disk_name(disk, part->partno, name_buf), | 770 | disk_name(disk, part->partno, name_buf), uuid); |
768 | part->info ? part->info->uuid : ""); | ||
769 | if (is_part0) { | 771 | if (is_part0) { |
770 | if (disk->driverfs_dev != NULL && | 772 | if (disk->driverfs_dev != NULL && |
771 | disk->driverfs_dev->driver != NULL) | 773 | disk->driverfs_dev->driver != NULL) |
@@ -829,7 +831,7 @@ static void disk_seqf_stop(struct seq_file *seqf, void *v) | |||
829 | 831 | ||
830 | static void *show_partition_start(struct seq_file *seqf, loff_t *pos) | 832 | static void *show_partition_start(struct seq_file *seqf, loff_t *pos) |
831 | { | 833 | { |
832 | void *p; | 834 | static void *p; |
833 | 835 | ||
834 | p = disk_seqf_start(seqf, pos); | 836 | p = disk_seqf_start(seqf, pos); |
835 | if (!IS_ERR_OR_NULL(p) && !*pos) | 837 | if (!IS_ERR_OR_NULL(p) && !*pos) |
@@ -845,7 +847,7 @@ static int show_partition(struct seq_file *seqf, void *v) | |||
845 | char buf[BDEVNAME_SIZE]; | 847 | char buf[BDEVNAME_SIZE]; |
846 | 848 | ||
847 | /* Don't show non-partitionable removeable devices or empty devices */ | 849 | /* Don't show non-partitionable removeable devices or empty devices */ |
848 | if (!get_capacity(sgp) || (!disk_max_parts(sgp) && | 850 | if (!get_capacity(sgp) || (!disk_partitionable(sgp) && |
849 | (sgp->flags & GENHD_FL_REMOVABLE))) | 851 | (sgp->flags & GENHD_FL_REMOVABLE))) |
850 | return 0; | 852 | return 0; |
851 | if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) | 853 | if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) |
@@ -856,7 +858,7 @@ static int show_partition(struct seq_file *seqf, void *v) | |||
856 | while ((part = disk_part_iter_next(&piter))) | 858 | while ((part = disk_part_iter_next(&piter))) |
857 | seq_printf(seqf, "%4d %7d %10llu %s\n", | 859 | seq_printf(seqf, "%4d %7d %10llu %s\n", |
858 | MAJOR(part_devt(part)), MINOR(part_devt(part)), | 860 | MAJOR(part_devt(part)), MINOR(part_devt(part)), |
859 | (unsigned long long)part_nr_sects_read(part) >> 1, | 861 | (unsigned long long)part->nr_sects >> 1, |
860 | disk_name(sgp, part->partno, buf)); | 862 | disk_name(sgp, part->partno, buf)); |
861 | disk_part_iter_exit(&piter); | 863 | disk_part_iter_exit(&piter); |
862 | 864 | ||
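[Editor's note] show_partition() above emits one "%4d %7d %10llu %s" line per partition into /proc/partitions, with the size given in 1 KiB blocks (nr_sects >> 1). A user-space reader of that exact format:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/partitions", "r");
	char line[256], name[64];
	unsigned int major, minor;
	unsigned long long blocks;	/* nr_sects >> 1, i.e. 1 KiB units */

	if (!f) {
		perror("/proc/partitions");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		/* The header and blank lines fail the scan and are skipped. */
		if (sscanf(line, "%u %u %llu %63s",
			   &major, &minor, &blocks, name) == 4)
			printf("%s: dev %u:%u, %llu KiB\n",
			       name, major, minor, blocks);
	}
	fclose(f);
	return 0;
}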
@@ -1103,11 +1105,27 @@ static void disk_release(struct device *dev) | |||
1103 | blk_put_queue(disk->queue); | 1105 | blk_put_queue(disk->queue); |
1104 | kfree(disk); | 1106 | kfree(disk); |
1105 | } | 1107 | } |
1108 | |||
1109 | static int disk_uevent(struct device *dev, struct kobj_uevent_env *env) | ||
1110 | { | ||
1111 | struct gendisk *disk = dev_to_disk(dev); | ||
1112 | struct disk_part_iter piter; | ||
1113 | struct hd_struct *part; | ||
1114 | int cnt = 0; | ||
1115 | |||
1116 | disk_part_iter_init(&piter, disk, 0); | ||
1117 | while((part = disk_part_iter_next(&piter))) | ||
1118 | cnt++; | ||
1119 | disk_part_iter_exit(&piter); | ||
1120 | add_uevent_var(env, "NPARTS=%u", cnt); | ||
1121 | return 0; | ||
1122 | } | ||
1123 | |||
1106 | struct class block_class = { | 1124 | struct class block_class = { |
1107 | .name = "block", | 1125 | .name = "block", |
1108 | }; | 1126 | }; |
1109 | 1127 | ||
1110 | static char *block_devnode(struct device *dev, umode_t *mode) | 1128 | static char *block_devnode(struct device *dev, mode_t *mode) |
1111 | { | 1129 | { |
1112 | struct gendisk *disk = dev_to_disk(dev); | 1130 | struct gendisk *disk = dev_to_disk(dev); |
1113 | 1131 | ||
@@ -1121,6 +1139,7 @@ static struct device_type disk_type = { | |||
1121 | .groups = disk_attr_groups, | 1139 | .groups = disk_attr_groups, |
1122 | .release = disk_release, | 1140 | .release = disk_release, |
1123 | .devnode = block_devnode, | 1141 | .devnode = block_devnode, |
1142 | .uevent = disk_uevent, | ||
1124 | }; | 1143 | }; |
1125 | 1144 | ||
1126 | #ifdef CONFIG_PROC_FS | 1145 | #ifdef CONFIG_PROC_FS |
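[Editor's note] The disk_uevent() callback added on the right counts a disk's partitions and puts an NPARTS= variable into the uevent environment. On a kernel carrying this patch, that environment can be inspected by reading the device's uevent attribute (sda is a placeholder; the NPARTS line is specific to this patched tree, not mainline):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/block/sda/uevent", "r");
	char line[128];

	if (!f) {
		perror("/sys/block/sda/uevent");
		return 1;
	}
	/* Dumps MAJOR=, MINOR=, DEVNAME=, DEVTYPE= and, with disk_uevent()
	 * above wired into disk_type, NPARTS=<partition count>. */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}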
@@ -1239,7 +1258,7 @@ EXPORT_SYMBOL(blk_lookup_devt); | |||
1239 | 1258 | ||
1240 | struct gendisk *alloc_disk(int minors) | 1259 | struct gendisk *alloc_disk(int minors) |
1241 | { | 1260 | { |
1242 | return alloc_disk_node(minors, NUMA_NO_NODE); | 1261 | return alloc_disk_node(minors, -1); |
1243 | } | 1262 | } |
1244 | EXPORT_SYMBOL(alloc_disk); | 1263 | EXPORT_SYMBOL(alloc_disk); |
1245 | 1264 | ||
@@ -1262,16 +1281,6 @@ struct gendisk *alloc_disk_node(int minors, int node_id) | |||
1262 | } | 1281 | } |
1263 | disk->part_tbl->part[0] = &disk->part0; | 1282 | disk->part_tbl->part[0] = &disk->part0; |
1264 | 1283 | ||
1265 | /* | ||
1266 | * set_capacity() and get_capacity() currently don't use | ||
1267 | * seqcounter to read/update the part0->nr_sects. Still init | ||
1268 | * the counter as we can read the sectors in IO submission | ||
1269 | * patch using seqence counters. | ||
1270 | * | ||
1271 | * TODO: Ideally set_capacity() and get_capacity() should be | ||
1272 | * converted to make use of bd_mutex and sequence counters. | ||
1273 | */ | ||
1274 | seqcount_init(&disk->part0.nr_sects_seq); | ||
1275 | hd_ref_init(&disk->part0); | 1284 | hd_ref_init(&disk->part0); |
1276 | 1285 | ||
1277 | disk->minors = minors; | 1286 | disk->minors = minors; |
@@ -1484,9 +1493,9 @@ static void __disk_unblock_events(struct gendisk *disk, bool check_now) | |||
1484 | intv = disk_events_poll_jiffies(disk); | 1493 | intv = disk_events_poll_jiffies(disk); |
1485 | set_timer_slack(&ev->dwork.timer, intv / 4); | 1494 | set_timer_slack(&ev->dwork.timer, intv / 4); |
1486 | if (check_now) | 1495 | if (check_now) |
1487 | queue_delayed_work(system_freezable_wq, &ev->dwork, 0); | 1496 | queue_delayed_work(system_nrt_wq, &ev->dwork, 0); |
1488 | else if (intv) | 1497 | else if (intv) |
1489 | queue_delayed_work(system_freezable_wq, &ev->dwork, intv); | 1498 | queue_delayed_work(system_nrt_wq, &ev->dwork, intv); |
1490 | out_unlock: | 1499 | out_unlock: |
1491 | spin_unlock_irqrestore(&ev->lock, flags); | 1500 | spin_unlock_irqrestore(&ev->lock, flags); |
1492 | } | 1501 | } |
@@ -1528,8 +1537,10 @@ void disk_flush_events(struct gendisk *disk, unsigned int mask) | |||
1528 | 1537 | ||
1529 | spin_lock_irq(&ev->lock); | 1538 | spin_lock_irq(&ev->lock); |
1530 | ev->clearing |= mask; | 1539 | ev->clearing |= mask; |
1531 | if (!ev->block) | 1540 | if (!ev->block) { |
1532 | mod_delayed_work(system_freezable_wq, &ev->dwork, 0); | 1541 | cancel_delayed_work(&ev->dwork); |
1542 | queue_delayed_work(system_nrt_wq, &ev->dwork, 0); | ||
1543 | } | ||
1533 | spin_unlock_irq(&ev->lock); | 1544 | spin_unlock_irq(&ev->lock); |
1534 | } | 1545 | } |
1535 | 1546 | ||
@@ -1565,7 +1576,7 @@ unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) | |||
1565 | 1576 | ||
1566 | /* uncondtionally schedule event check and wait for it to finish */ | 1577 | /* uncondtionally schedule event check and wait for it to finish */ |
1567 | disk_block_events(disk); | 1578 | disk_block_events(disk); |
1568 | queue_delayed_work(system_freezable_wq, &ev->dwork, 0); | 1579 | queue_delayed_work(system_nrt_wq, &ev->dwork, 0); |
1569 | flush_delayed_work(&ev->dwork); | 1580 | flush_delayed_work(&ev->dwork); |
1570 | __disk_unblock_events(disk, false); | 1581 | __disk_unblock_events(disk, false); |
1571 | 1582 | ||
@@ -1602,7 +1613,7 @@ static void disk_events_workfn(struct work_struct *work) | |||
1602 | 1613 | ||
1603 | intv = disk_events_poll_jiffies(disk); | 1614 | intv = disk_events_poll_jiffies(disk); |
1604 | if (!ev->block && intv) | 1615 | if (!ev->block && intv) |
1605 | queue_delayed_work(system_freezable_wq, &ev->dwork, intv); | 1616 | queue_delayed_work(system_nrt_wq, &ev->dwork, intv); |
1606 | 1617 | ||
1607 | spin_unlock_irq(&ev->lock); | 1618 | spin_unlock_irq(&ev->lock); |
1608 | 1619 | ||
@@ -1740,9 +1751,9 @@ module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops, | |||
1740 | &disk_events_dfl_poll_msecs, 0644); | 1751 | &disk_events_dfl_poll_msecs, 0644); |
1741 | 1752 | ||
1742 | /* | 1753 | /* |
1743 | * disk_{alloc|add|del|release}_events - initialize and destroy disk_events. | 1754 | * disk_{add|del|release}_events - initialize and destroy disk_events. |
1744 | */ | 1755 | */ |
1745 | static void disk_alloc_events(struct gendisk *disk) | 1756 | static void disk_add_events(struct gendisk *disk) |
1746 | { | 1757 | { |
1747 | struct disk_events *ev; | 1758 | struct disk_events *ev; |
1748 | 1759 | ||
@@ -1755,6 +1766,16 @@ static void disk_alloc_events(struct gendisk *disk) | |||
1755 | return; | 1766 | return; |
1756 | } | 1767 | } |
1757 | 1768 | ||
1769 | if (sysfs_create_files(&disk_to_dev(disk)->kobj, | ||
1770 | disk_events_attrs) < 0) { | ||
1771 | pr_warn("%s: failed to create sysfs files for events\n", | ||
1772 | disk->disk_name); | ||
1773 | kfree(ev); | ||
1774 | return; | ||
1775 | } | ||
1776 | |||
1777 | disk->ev = ev; | ||
1778 | |||
1758 | INIT_LIST_HEAD(&ev->node); | 1779 | INIT_LIST_HEAD(&ev->node); |
1759 | ev->disk = disk; | 1780 | ev->disk = disk; |
1760 | spin_lock_init(&ev->lock); | 1781 | spin_lock_init(&ev->lock); |
@@ -1763,21 +1784,8 @@ static void disk_alloc_events(struct gendisk *disk) | |||
1763 | ev->poll_msecs = -1; | 1784 | ev->poll_msecs = -1; |
1764 | INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn); | 1785 | INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn); |
1765 | 1786 | ||
1766 | disk->ev = ev; | ||
1767 | } | ||
1768 | |||
1769 | static void disk_add_events(struct gendisk *disk) | ||
1770 | { | ||
1771 | if (!disk->ev) | ||
1772 | return; | ||
1773 | |||
1774 | /* FIXME: error handling */ | ||
1775 | if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_events_attrs) < 0) | ||
1776 | pr_warn("%s: failed to create sysfs files for events\n", | ||
1777 | disk->disk_name); | ||
1778 | |||
1779 | mutex_lock(&disk_events_mutex); | 1787 | mutex_lock(&disk_events_mutex); |
1780 | list_add_tail(&disk->ev->node, &disk_events); | 1788 | list_add_tail(&ev->node, &disk_events); |
1781 | mutex_unlock(&disk_events_mutex); | 1789 | mutex_unlock(&disk_events_mutex); |
1782 | 1790 | ||
1783 | /* | 1791 | /* |
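[Editor's note] disk_add_events() above also creates the disk-events sysfs files referenced via disk_events_attrs. Their names are not visible in this excerpt; assuming the usual mainline attributes (events, events_async, events_poll_msecs), they can be read from user space, for example on a removable drive such as sr0 (placeholder):

#include <stdio.h>

/* Placeholder device with media-change events, e.g. a CD-ROM drive. */
#define EVDIR "/sys/block/sr0/"

static void dump(const char *attr)
{
	char path[128], line[128];
	FILE *f;

	snprintf(path, sizeof(path), EVDIR "%s", attr);
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return;
	}
	if (fgets(line, sizeof(line), f))
		printf("%s: %s", attr, line);
	fclose(f);
}

int main(void)
{
	dump("events");			/* supported events, e.g. media_change */
	dump("events_async");		/* events reported without polling */
	dump("events_poll_msecs");	/* per-disk poll interval, -1 = default */
	return 0;
}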
diff --git a/block/ioctl.c b/block/ioctl.c index a31d91d9bc5..1124cd29726 100644 --- a/block/ioctl.c +++ b/block/ioctl.c | |||
@@ -1,11 +1,10 @@ | |||
1 | #include <linux/capability.h> | 1 | #include <linux/capability.h> |
2 | #include <linux/blkdev.h> | 2 | #include <linux/blkdev.h> |
3 | #include <linux/export.h> | ||
4 | #include <linux/gfp.h> | 3 | #include <linux/gfp.h> |
5 | #include <linux/blkpg.h> | 4 | #include <linux/blkpg.h> |
6 | #include <linux/hdreg.h> | 5 | #include <linux/hdreg.h> |
7 | #include <linux/backing-dev.h> | 6 | #include <linux/backing-dev.h> |
8 | #include <linux/fs.h> | 7 | #include <linux/buffer_head.h> |
9 | #include <linux/blktrace_api.h> | 8 | #include <linux/blktrace_api.h> |
10 | #include <asm/uaccess.h> | 9 | #include <asm/uaccess.h> |
11 | 10 | ||
@@ -13,7 +12,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user | |||
13 | { | 12 | { |
14 | struct block_device *bdevp; | 13 | struct block_device *bdevp; |
15 | struct gendisk *disk; | 14 | struct gendisk *disk; |
16 | struct hd_struct *part, *lpart; | 15 | struct hd_struct *part; |
17 | struct blkpg_ioctl_arg a; | 16 | struct blkpg_ioctl_arg a; |
18 | struct blkpg_partition p; | 17 | struct blkpg_partition p; |
19 | struct disk_part_iter piter; | 18 | struct disk_part_iter piter; |
@@ -36,12 +35,12 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user | |||
36 | case BLKPG_ADD_PARTITION: | 35 | case BLKPG_ADD_PARTITION: |
37 | start = p.start >> 9; | 36 | start = p.start >> 9; |
38 | length = p.length >> 9; | 37 | length = p.length >> 9; |
39 | /* check for fit in a hd_struct */ | 38 | /* check for fit in a hd_struct */ |
40 | if (sizeof(sector_t) == sizeof(long) && | 39 | if (sizeof(sector_t) == sizeof(long) && |
41 | sizeof(long long) > sizeof(long)) { | 40 | sizeof(long long) > sizeof(long)) { |
42 | long pstart = start, plength = length; | 41 | long pstart = start, plength = length; |
43 | if (pstart != start || plength != length | 42 | if (pstart != start || plength != length |
44 | || pstart < 0 || plength < 0 || partno > 65535) | 43 | || pstart < 0 || plength < 0) |
45 | return -EINVAL; | 44 | return -EINVAL; |
46 | } | 45 | } |
47 | 46 | ||
@@ -92,59 +91,6 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user | |||
92 | bdput(bdevp); | 91 | bdput(bdevp); |
93 | 92 | ||
94 | return 0; | 93 | return 0; |
95 | case BLKPG_RESIZE_PARTITION: | ||
96 | start = p.start >> 9; | ||
97 | /* new length of partition in bytes */ | ||
98 | length = p.length >> 9; | ||
99 | /* check for fit in a hd_struct */ | ||
100 | if (sizeof(sector_t) == sizeof(long) && | ||
101 | sizeof(long long) > sizeof(long)) { | ||
102 | long pstart = start, plength = length; | ||
103 | if (pstart != start || plength != length | ||
104 | || pstart < 0 || plength < 0) | ||
105 | return -EINVAL; | ||
106 | } | ||
107 | part = disk_get_part(disk, partno); | ||
108 | if (!part) | ||
109 | return -ENXIO; | ||
110 | bdevp = bdget(part_devt(part)); | ||
111 | if (!bdevp) { | ||
112 | disk_put_part(part); | ||
113 | return -ENOMEM; | ||
114 | } | ||
115 | mutex_lock(&bdevp->bd_mutex); | ||
116 | mutex_lock_nested(&bdev->bd_mutex, 1); | ||
117 | if (start != part->start_sect) { | ||
118 | mutex_unlock(&bdevp->bd_mutex); | ||
119 | mutex_unlock(&bdev->bd_mutex); | ||
120 | bdput(bdevp); | ||
121 | disk_put_part(part); | ||
122 | return -EINVAL; | ||
123 | } | ||
124 | /* overlap? */ | ||
125 | disk_part_iter_init(&piter, disk, | ||
126 | DISK_PITER_INCL_EMPTY); | ||
127 | while ((lpart = disk_part_iter_next(&piter))) { | ||
128 | if (lpart->partno != partno && | ||
129 | !(start + length <= lpart->start_sect || | ||
130 | start >= lpart->start_sect + lpart->nr_sects) | ||
131 | ) { | ||
132 | disk_part_iter_exit(&piter); | ||
133 | mutex_unlock(&bdevp->bd_mutex); | ||
134 | mutex_unlock(&bdev->bd_mutex); | ||
135 | bdput(bdevp); | ||
136 | disk_put_part(part); | ||
137 | return -EBUSY; | ||
138 | } | ||
139 | } | ||
140 | disk_part_iter_exit(&piter); | ||
141 | part_nr_sects_write(part, (sector_t)length); | ||
142 | i_size_write(bdevp->bd_inode, p.length); | ||
143 | mutex_unlock(&bdevp->bd_mutex); | ||
144 | mutex_unlock(&bdev->bd_mutex); | ||
145 | bdput(bdevp); | ||
146 | disk_put_part(part); | ||
147 | return 0; | ||
148 | default: | 94 | default: |
149 | return -EINVAL; | 95 | return -EINVAL; |
150 | } | 96 | } |
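[Editor's note] The BLKPG_ADD_PARTITION case above (and the BLKPG_RESIZE_PARTITION case present only on the left) is driven from user space via the BLKPG ioctl on the whole-disk node, with start and length given in bytes and shifted to sectors by the kernel (p.start >> 9 above). A minimal sketch; /dev/sdX and the partition geometry are placeholders, and actually running this modifies the kernel's partition table:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/blkpg.h>
#include <linux/fs.h>

int main(void)
{
	int fd = open("/dev/sdX", O_RDWR);	/* placeholder whole-disk node */
	struct blkpg_partition part;
	struct blkpg_ioctl_arg arg;

	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&part, 0, sizeof(part));
	part.pno = 5;			/* partition number to create */
	part.start = 1 * 1024 * 1024;	/* byte offset, see p.start >> 9 above */
	part.length = 64 * 1024 * 1024;	/* byte length, see p.length >> 9 above */

	memset(&arg, 0, sizeof(arg));
	arg.op = BLKPG_ADD_PARTITION;
	arg.datalen = sizeof(part);
	arg.data = &part;

	if (ioctl(fd, BLKPG, &arg) < 0)
		perror("BLKPG_ADD_PARTITION");
	close(fd);
	return 0;
}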
@@ -155,7 +101,7 @@ static int blkdev_reread_part(struct block_device *bdev) | |||
155 | struct gendisk *disk = bdev->bd_disk; | 101 | struct gendisk *disk = bdev->bd_disk; |
156 | int res; | 102 | int res; |
157 | 103 | ||
158 | if (!disk_part_scan_enabled(disk) || bdev != bdev->bd_contains) | 104 | if (!disk_partitionable(disk) || bdev != bdev->bd_contains) |
159 | return -EINVAL; | 105 | return -EINVAL; |
160 | if (!capable(CAP_SYS_ADMIN)) | 106 | if (!capable(CAP_SYS_ADMIN)) |
161 | return -EACCES; | 107 | return -EACCES; |
@@ -185,22 +131,6 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, | |||
185 | return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags); | 131 | return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags); |
186 | } | 132 | } |
187 | 133 | ||
188 | static int blk_ioctl_zeroout(struct block_device *bdev, uint64_t start, | ||
189 | uint64_t len) | ||
190 | { | ||
191 | if (start & 511) | ||
192 | return -EINVAL; | ||
193 | if (len & 511) | ||
194 | return -EINVAL; | ||
195 | start >>= 9; | ||
196 | len >>= 9; | ||
197 | |||
198 | if (start + len > (i_size_read(bdev->bd_inode) >> 9)) | ||
199 | return -EINVAL; | ||
200 | |||
201 | return blkdev_issue_zeroout(bdev, start, len, GFP_KERNEL); | ||
202 | } | ||
203 | |||
204 | static int put_ushort(unsigned long arg, unsigned short val) | 134 | static int put_ushort(unsigned long arg, unsigned short val) |
205 | { | 135 | { |
206 | return put_user(val, (unsigned short __user *)arg); | 136 | return put_user(val, (unsigned short __user *)arg); |
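[Editor's note] Both BLKDISCARD (kept) and BLKZEROOUT (the handler removed on the right, blk_ioctl_zeroout) take the same uint64_t range[2] of byte offset and byte length, 512-byte aligned, as the case statements further down show. A user-space sketch against a placeholder device; it is destructive if run against real data, and the BLKZEROOUT branch only works on kernels whose headers and ioctl handler provide it:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>		/* BLKDISCARD, BLKZEROOUT */

int main(void)
{
	int fd = open("/dev/sdX", O_RDWR);	/* placeholder device */
	uint64_t range[2] = { 0, 1 << 20 };	/* byte offset, byte length */

	if (fd < 0) {
		perror("open");
		return 1;
	}

	if (ioctl(fd, BLKDISCARD, &range) < 0)
		perror("BLKDISCARD");		/* handled by blk_ioctl_discard() */

#ifdef BLKZEROOUT
	if (ioctl(fd, BLKZEROOUT, &range) < 0)
		perror("BLKZEROOUT");
#endif
	close(fd);
	return 0;
}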
@@ -249,26 +179,6 @@ int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode, | |||
249 | EXPORT_SYMBOL_GPL(__blkdev_driver_ioctl); | 179 | EXPORT_SYMBOL_GPL(__blkdev_driver_ioctl); |
250 | 180 | ||
251 | /* | 181 | /* |
252 | * Is it an unrecognized ioctl? The correct returns are either | ||
253 | * ENOTTY (final) or ENOIOCTLCMD ("I don't know this one, try a | ||
254 | * fallback"). ENOIOCTLCMD gets turned into ENOTTY by the ioctl | ||
255 | * code before returning. | ||
256 | * | ||
257 | * Confused drivers sometimes return EINVAL, which is wrong. It | ||
258 | * means "I understood the ioctl command, but the parameters to | ||
259 | * it were wrong". | ||
260 | * | ||
261 | * We should aim to just fix the broken drivers, the EINVAL case | ||
262 | * should go away. | ||
263 | */ | ||
264 | static inline int is_unrecognized_ioctl(int ret) | ||
265 | { | ||
266 | return ret == -EINVAL || | ||
267 | ret == -ENOTTY || | ||
268 | ret == -ENOIOCTLCMD; | ||
269 | } | ||
270 | |||
271 | /* | ||
272 | * always keep this in sync with compat_blkdev_ioctl() | 182 | * always keep this in sync with compat_blkdev_ioctl() |
273 | */ | 183 | */ |
274 | int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, | 184 | int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, |
@@ -285,7 +195,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, | |||
285 | return -EACCES; | 195 | return -EACCES; |
286 | 196 | ||
287 | ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); | 197 | ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); |
288 | if (!is_unrecognized_ioctl(ret)) | 198 | /* -EINVAL to handle old uncorrected drivers */ |
199 | if (ret != -EINVAL && ret != -ENOTTY) | ||
289 | return ret; | 200 | return ret; |
290 | 201 | ||
291 | fsync_bdev(bdev); | 202 | fsync_bdev(bdev); |
@@ -294,7 +205,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, | |||
294 | 205 | ||
295 | case BLKROSET: | 206 | case BLKROSET: |
296 | ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); | 207 | ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); |
297 | if (!is_unrecognized_ioctl(ret)) | 208 | /* -EINVAL to handle old uncorrected drivers */ |
209 | if (ret != -EINVAL && ret != -ENOTTY) | ||
298 | return ret; | 210 | return ret; |
299 | if (!capable(CAP_SYS_ADMIN)) | 211 | if (!capable(CAP_SYS_ADMIN)) |
300 | return -EACCES; | 212 | return -EACCES; |
@@ -316,17 +228,6 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, | |||
316 | return blk_ioctl_discard(bdev, range[0], range[1], | 228 | return blk_ioctl_discard(bdev, range[0], range[1], |
317 | cmd == BLKSECDISCARD); | 229 | cmd == BLKSECDISCARD); |
318 | } | 230 | } |
319 | case BLKZEROOUT: { | ||
320 | uint64_t range[2]; | ||
321 | |||
322 | if (!(mode & FMODE_WRITE)) | ||
323 | return -EBADF; | ||
324 | |||
325 | if (copy_from_user(range, (void __user *)arg, sizeof(range))) | ||
326 | return -EFAULT; | ||
327 | |||
328 | return blk_ioctl_zeroout(bdev, range[0], range[1]); | ||
329 | } | ||
330 | 231 | ||
331 | case HDIO_GETGEO: { | 232 | case HDIO_GETGEO: { |
332 | struct hd_geometry geo; | 233 | struct hd_geometry geo; |
@@ -376,8 +277,6 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, | |||
376 | return put_uint(arg, bdev_discard_zeroes_data(bdev)); | 277 | return put_uint(arg, bdev_discard_zeroes_data(bdev)); |
377 | case BLKSECTGET: | 278 | case BLKSECTGET: |
378 | return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev))); | 279 | return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev))); |
379 | case BLKROTATIONAL: | ||
380 | return put_ushort(arg, !blk_queue_nonrot(bdev_get_queue(bdev))); | ||
381 | case BLKRASET: | 280 | case BLKRASET: |
382 | case BLKFRASET: | 281 | case BLKFRASET: |
383 | if(!capable(CAP_SYS_ADMIN)) | 282 | if(!capable(CAP_SYS_ADMIN)) |
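[Editor's note] The remaining cases in blkdev_ioctl() map simple queue properties onto ioctls; BLKSECTGET and BLKROTATIONAL appear in the hunk just above. A read-only probe of a few of them (the device path is a placeholder):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>

int main(void)
{
	int fd = open("/dev/sdX", O_RDONLY);	/* placeholder device */
	unsigned short max_sectors = 0, rotational = 0;
	uint64_t bytes = 0;

	if (fd < 0) {
		perror("open");
		return 1;
	}

	if (ioctl(fd, BLKSECTGET, &max_sectors) == 0)
		printf("max sectors per request: %hu\n", max_sectors);
	if (ioctl(fd, BLKGETSIZE64, &bytes) == 0)
		printf("capacity: %llu bytes\n", (unsigned long long)bytes);
	/* BLKROTATIONAL is the case present only on the left-hand side above;
	 * on kernels without the handler this simply fails with ENOTTY. */
	if (ioctl(fd, BLKROTATIONAL, &rotational) == 0)
		printf("rotational: %hu\n", rotational);

	close(fd);
	return 0;
}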
diff --git a/block/noop-iosched.c b/block/noop-iosched.c index 5d1bf70e33d..06389e9ef96 100644 --- a/block/noop-iosched.c +++ b/block/noop-iosched.c | |||
@@ -59,17 +59,15 @@ noop_latter_request(struct request_queue *q, struct request *rq) | |||
59 | return list_entry(rq->queuelist.next, struct request, queuelist); | 59 | return list_entry(rq->queuelist.next, struct request, queuelist); |
60 | } | 60 | } |
61 | 61 | ||
62 | static int noop_init_queue(struct request_queue *q) | 62 | static void *noop_init_queue(struct request_queue *q) |
63 | { | 63 | { |
64 | struct noop_data *nd; | 64 | struct noop_data *nd; |
65 | 65 | ||
66 | nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node); | 66 | nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node); |
67 | if (!nd) | 67 | if (!nd) |
68 | return -ENOMEM; | 68 | return NULL; |
69 | |||
70 | INIT_LIST_HEAD(&nd->queue); | 69 | INIT_LIST_HEAD(&nd->queue); |
71 | q->elevator->elevator_data = nd; | 70 | return nd; |
72 | return 0; | ||
73 | } | 71 | } |
74 | 72 | ||
75 | static void noop_exit_queue(struct elevator_queue *e) | 73 | static void noop_exit_queue(struct elevator_queue *e) |
@@ -96,7 +94,9 @@ static struct elevator_type elevator_noop = { | |||
96 | 94 | ||
97 | static int __init noop_init(void) | 95 | static int __init noop_init(void) |
98 | { | 96 | { |
99 | return elv_register(&elevator_noop); | 97 | elv_register(&elevator_noop); |
98 | |||
99 | return 0; | ||
100 | } | 100 | } |
101 | 101 | ||
102 | static void __exit noop_exit(void) | 102 | static void __exit noop_exit(void) |
diff --git a/block/partition-generic.c b/block/partition-generic.c deleted file mode 100644 index f1d14519cc0..00000000000 --- a/block/partition-generic.c +++ /dev/null | |||
@@ -1,571 +0,0 @@ | |||
1 | /* | ||
2 | * Code extracted from drivers/block/genhd.c | ||
3 | * Copyright (C) 1991-1998 Linus Torvalds | ||
4 | * Re-organised Feb 1998 Russell King | ||
5 | * | ||
6 | * We now have independent partition support from the | ||
7 | * block drivers, which allows all the partition code to | ||
8 | * be grouped in one location, and it to be mostly self | ||
9 | * contained. | ||
10 | */ | ||
11 | |||
12 | #include <linux/init.h> | ||
13 | #include <linux/module.h> | ||
14 | #include <linux/fs.h> | ||
15 | #include <linux/slab.h> | ||
16 | #include <linux/kmod.h> | ||
17 | #include <linux/ctype.h> | ||
18 | #include <linux/genhd.h> | ||
19 | #include <linux/blktrace_api.h> | ||
20 | |||
21 | #include "partitions/check.h" | ||
22 | |||
23 | #ifdef CONFIG_BLK_DEV_MD | ||
24 | extern void md_autodetect_dev(dev_t dev); | ||
25 | #endif | ||
26 | |||
27 | /* | ||
28 | * disk_name() is used by partition check code and the genhd driver. | ||
29 | * It formats the devicename of the indicated disk into | ||
30 | * the supplied buffer (of size at least 32), and returns | ||
31 | * a pointer to that same buffer (for convenience). | ||
32 | */ | ||
33 | |||
34 | char *disk_name(struct gendisk *hd, int partno, char *buf) | ||
35 | { | ||
36 | if (!partno) | ||
37 | snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name); | ||
38 | else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) | ||
39 | snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno); | ||
40 | else | ||
41 | snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno); | ||
42 | |||
43 | return buf; | ||
44 | } | ||
45 | |||
46 | const char *bdevname(struct block_device *bdev, char *buf) | ||
47 | { | ||
48 | return disk_name(bdev->bd_disk, bdev->bd_part->partno, buf); | ||
49 | } | ||
50 | |||
51 | EXPORT_SYMBOL(bdevname); | ||
52 | |||
53 | /* | ||
54 | * There's very little reason to use this, you should really | ||
55 | * have a struct block_device just about everywhere and use | ||
56 | * bdevname() instead. | ||
57 | */ | ||
58 | const char *__bdevname(dev_t dev, char *buffer) | ||
59 | { | ||
60 | scnprintf(buffer, BDEVNAME_SIZE, "unknown-block(%u,%u)", | ||
61 | MAJOR(dev), MINOR(dev)); | ||
62 | return buffer; | ||
63 | } | ||
64 | |||
65 | EXPORT_SYMBOL(__bdevname); | ||
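[Editor's note] disk_name() above encodes the familiar device naming rule: a "p" separator is inserted only when the disk name already ends in a digit (sda1 versus mmcblk0p2). A tiny user-space replica of just that rule:

#include <ctype.h>
#include <stdio.h>
#include <string.h>

#define BDEVNAME_SIZE 32

/* User-space replica of the naming rule in disk_name() above. */
static char *disk_name(const char *disk, int partno, char *buf)
{
	if (!partno)
		snprintf(buf, BDEVNAME_SIZE, "%s", disk);
	else if (isdigit((unsigned char)disk[strlen(disk) - 1]))
		snprintf(buf, BDEVNAME_SIZE, "%sp%d", disk, partno);
	else
		snprintf(buf, BDEVNAME_SIZE, "%s%d", disk, partno);
	return buf;
}

int main(void)
{
	char buf[BDEVNAME_SIZE];

	printf("%s\n", disk_name("sda", 1, buf));	/* sda1      */
	printf("%s\n", disk_name("mmcblk0", 2, buf));	/* mmcblk0p2 */
	printf("%s\n", disk_name("sdb", 0, buf));	/* sdb       */
	return 0;
}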
66 | |||
67 | static ssize_t part_partition_show(struct device *dev, | ||
68 | struct device_attribute *attr, char *buf) | ||
69 | { | ||
70 | struct hd_struct *p = dev_to_part(dev); | ||
71 | |||
72 | return sprintf(buf, "%d\n", p->partno); | ||
73 | } | ||
74 | |||
75 | static ssize_t part_start_show(struct device *dev, | ||
76 | struct device_attribute *attr, char *buf) | ||
77 | { | ||
78 | struct hd_struct *p = dev_to_part(dev); | ||
79 | |||
80 | return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect); | ||
81 | } | ||
82 | |||
83 | ssize_t part_size_show(struct device *dev, | ||
84 | struct device_attribute *attr, char *buf) | ||
85 | { | ||
86 | struct hd_struct *p = dev_to_part(dev); | ||
87 | return sprintf(buf, "%llu\n",(unsigned long long)part_nr_sects_read(p)); | ||
88 | } | ||
89 | |||
90 | static ssize_t part_ro_show(struct device *dev, | ||
91 | struct device_attribute *attr, char *buf) | ||
92 | { | ||
93 | struct hd_struct *p = dev_to_part(dev); | ||
94 | return sprintf(buf, "%d\n", p->policy ? 1 : 0); | ||
95 | } | ||
96 | |||
97 | static ssize_t part_alignment_offset_show(struct device *dev, | ||
98 | struct device_attribute *attr, char *buf) | ||
99 | { | ||
100 | struct hd_struct *p = dev_to_part(dev); | ||
101 | return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset); | ||
102 | } | ||
103 | |||
104 | static ssize_t part_discard_alignment_show(struct device *dev, | ||
105 | struct device_attribute *attr, char *buf) | ||
106 | { | ||
107 | struct hd_struct *p = dev_to_part(dev); | ||
108 | return sprintf(buf, "%u\n", p->discard_alignment); | ||
109 | } | ||
110 | |||
111 | ssize_t part_stat_show(struct device *dev, | ||
112 | struct device_attribute *attr, char *buf) | ||
113 | { | ||
114 | struct hd_struct *p = dev_to_part(dev); | ||
115 | int cpu; | ||
116 | |||
117 | cpu = part_stat_lock(); | ||
118 | part_round_stats(cpu, p); | ||
119 | part_stat_unlock(); | ||
120 | return sprintf(buf, | ||
121 | "%8lu %8lu %8llu %8u " | ||
122 | "%8lu %8lu %8llu %8u " | ||
123 | "%8u %8u %8u" | ||
124 | "\n", | ||
125 | part_stat_read(p, ios[READ]), | ||
126 | part_stat_read(p, merges[READ]), | ||
127 | (unsigned long long)part_stat_read(p, sectors[READ]), | ||
128 | jiffies_to_msecs(part_stat_read(p, ticks[READ])), | ||
129 | part_stat_read(p, ios[WRITE]), | ||
130 | part_stat_read(p, merges[WRITE]), | ||
131 | (unsigned long long)part_stat_read(p, sectors[WRITE]), | ||
132 | jiffies_to_msecs(part_stat_read(p, ticks[WRITE])), | ||
133 | part_in_flight(p), | ||
134 | jiffies_to_msecs(part_stat_read(p, io_ticks)), | ||
135 | jiffies_to_msecs(part_stat_read(p, time_in_queue))); | ||
136 | } | ||
137 | |||
138 | ssize_t part_inflight_show(struct device *dev, | ||
139 | struct device_attribute *attr, char *buf) | ||
140 | { | ||
141 | struct hd_struct *p = dev_to_part(dev); | ||
142 | |||
143 | return sprintf(buf, "%8u %8u\n", atomic_read(&p->in_flight[0]), | ||
144 | atomic_read(&p->in_flight[1])); | ||
145 | } | ||
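[Editor's note] part_stat_show() and part_inflight_show() above back the stat and inflight attributes under /sys/block/<disk>/<part>/. A reader for the eleven-field stat format printed by the sprintf() above; the sda/sda1 path is a placeholder:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/block/sda/sda1/stat", "r");
	unsigned long rd_ios, rd_merges, wr_ios, wr_merges;
	unsigned long long rd_sectors, wr_sectors;
	unsigned int rd_ticks, wr_ticks, in_flight, io_ticks, time_in_queue;

	if (!f) {
		perror("stat");
		return 1;
	}
	/* Field order follows the sprintf() in part_stat_show() above:
	 * read ios/merges/sectors/ms, write ios/merges/sectors/ms,
	 * requests in flight, ms doing I/O, weighted ms in queue. */
	if (fscanf(f, "%lu %lu %llu %u %lu %lu %llu %u %u %u %u",
		   &rd_ios, &rd_merges, &rd_sectors, &rd_ticks,
		   &wr_ios, &wr_merges, &wr_sectors, &wr_ticks,
		   &in_flight, &io_ticks, &time_in_queue) == 11)
		printf("reads=%lu (%llu sectors), writes=%lu (%llu sectors), "
		       "in flight=%u\n",
		       rd_ios, rd_sectors, wr_ios, wr_sectors, in_flight);
	fclose(f);
	return 0;
}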
146 | |||
147 | #ifdef CONFIG_FAIL_MAKE_REQUEST | ||
148 | ssize_t part_fail_show(struct device *dev, | ||
149 | struct device_attribute *attr, char *buf) | ||
150 | { | ||
151 | struct hd_struct *p = dev_to_part(dev); | ||
152 | |||
153 | return sprintf(buf, "%d\n", p->make_it_fail); | ||
154 | } | ||
155 | |||
156 | ssize_t part_fail_store(struct device *dev, | ||
157 | struct device_attribute *attr, | ||
158 | const char *buf, size_t count) | ||
159 | { | ||
160 | struct hd_struct *p = dev_to_part(dev); | ||
161 | int i; | ||
162 | |||
163 | if (count > 0 && sscanf(buf, "%d", &i) > 0) | ||
164 | p->make_it_fail = (i == 0) ? 0 : 1; | ||
165 | |||
166 | return count; | ||
167 | } | ||
168 | #endif | ||
169 | |||
170 | static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); | ||
171 | static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); | ||
172 | static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); | ||
173 | static DEVICE_ATTR(ro, S_IRUGO, part_ro_show, NULL); | ||
174 | static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); | ||
175 | static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show, | ||
176 | NULL); | ||
177 | static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); | ||
178 | static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); | ||
179 | #ifdef CONFIG_FAIL_MAKE_REQUEST | ||
180 | static struct device_attribute dev_attr_fail = | ||
181 | __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); | ||
182 | #endif | ||
183 | |||
184 | static struct attribute *part_attrs[] = { | ||
185 | &dev_attr_partition.attr, | ||
186 | &dev_attr_start.attr, | ||
187 | &dev_attr_size.attr, | ||
188 | &dev_attr_ro.attr, | ||
189 | &dev_attr_alignment_offset.attr, | ||
190 | &dev_attr_discard_alignment.attr, | ||
191 | &dev_attr_stat.attr, | ||
192 | &dev_attr_inflight.attr, | ||
193 | #ifdef CONFIG_FAIL_MAKE_REQUEST | ||
194 | &dev_attr_fail.attr, | ||
195 | #endif | ||
196 | NULL | ||
197 | }; | ||
198 | |||
199 | static struct attribute_group part_attr_group = { | ||
200 | .attrs = part_attrs, | ||
201 | }; | ||
202 | |||
203 | static const struct attribute_group *part_attr_groups[] = { | ||
204 | &part_attr_group, | ||
205 | #ifdef CONFIG_BLK_DEV_IO_TRACE | ||
206 | &blk_trace_attr_group, | ||
207 | #endif | ||
208 | NULL | ||
209 | }; | ||
210 | |||
211 | static void part_release(struct device *dev) | ||
212 | { | ||
213 | struct hd_struct *p = dev_to_part(dev); | ||
214 | free_part_stats(p); | ||
215 | free_part_info(p); | ||
216 | kfree(p); | ||
217 | } | ||
218 | |||
219 | struct device_type part_type = { | ||
220 | .name = "partition", | ||
221 | .groups = part_attr_groups, | ||
222 | .release = part_release, | ||
223 | }; | ||
224 | |||
225 | static void delete_partition_rcu_cb(struct rcu_head *head) | ||
226 | { | ||
227 | struct hd_struct *part = container_of(head, struct hd_struct, rcu_head); | ||
228 | |||
229 | part->start_sect = 0; | ||
230 | part->nr_sects = 0; | ||
231 | part_stat_set_all(part, 0); | ||
232 | put_device(part_to_dev(part)); | ||
233 | } | ||
234 | |||
235 | void __delete_partition(struct hd_struct *part) | ||
236 | { | ||
237 | call_rcu(&part->rcu_head, delete_partition_rcu_cb); | ||
238 | } | ||
239 | |||
240 | void delete_partition(struct gendisk *disk, int partno) | ||
241 | { | ||
242 | struct disk_part_tbl *ptbl = disk->part_tbl; | ||
243 | struct hd_struct *part; | ||
244 | |||
245 | if (partno >= ptbl->len) | ||
246 | return; | ||
247 | |||
248 | part = ptbl->part[partno]; | ||
249 | if (!part) | ||
250 | return; | ||
251 | |||
252 | blk_free_devt(part_devt(part)); | ||
253 | rcu_assign_pointer(ptbl->part[partno], NULL); | ||
254 | rcu_assign_pointer(ptbl->last_lookup, NULL); | ||
255 | kobject_put(part->holder_dir); | ||
256 | device_del(part_to_dev(part)); | ||
257 | |||
258 | hd_struct_put(part); | ||
259 | } | ||
260 | |||
261 | static ssize_t whole_disk_show(struct device *dev, | ||
262 | struct device_attribute *attr, char *buf) | ||
263 | { | ||
264 | return 0; | ||
265 | } | ||
266 | static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH, | ||
267 | whole_disk_show, NULL); | ||
268 | |||
269 | struct hd_struct *add_partition(struct gendisk *disk, int partno, | ||
270 | sector_t start, sector_t len, int flags, | ||
271 | struct partition_meta_info *info) | ||
272 | { | ||
273 | struct hd_struct *p; | ||
274 | dev_t devt = MKDEV(0, 0); | ||
275 | struct device *ddev = disk_to_dev(disk); | ||
276 | struct device *pdev; | ||
277 | struct disk_part_tbl *ptbl; | ||
278 | const char *dname; | ||
279 | int err; | ||
280 | |||
281 | err = disk_expand_part_tbl(disk, partno); | ||
282 | if (err) | ||
283 | return ERR_PTR(err); | ||
284 | ptbl = disk->part_tbl; | ||
285 | |||
286 | if (ptbl->part[partno]) | ||
287 | return ERR_PTR(-EBUSY); | ||
288 | |||
289 | p = kzalloc(sizeof(*p), GFP_KERNEL); | ||
290 | if (!p) | ||
291 | return ERR_PTR(-EBUSY); | ||
292 | |||
293 | if (!init_part_stats(p)) { | ||
294 | err = -ENOMEM; | ||
295 | goto out_free; | ||
296 | } | ||
297 | |||
298 | seqcount_init(&p->nr_sects_seq); | ||
299 | pdev = part_to_dev(p); | ||
300 | |||
301 | p->start_sect = start; | ||
302 | p->alignment_offset = | ||
303 | queue_limit_alignment_offset(&disk->queue->limits, start); | ||
304 | p->discard_alignment = | ||
305 | queue_limit_discard_alignment(&disk->queue->limits, start); | ||
306 | p->nr_sects = len; | ||
307 | p->partno = partno; | ||
308 | p->policy = get_disk_ro(disk); | ||
309 | |||
310 | if (info) { | ||
311 | struct partition_meta_info *pinfo = alloc_part_info(disk); | ||
312 | if (!pinfo) | ||
313 | goto out_free_stats; | ||
314 | memcpy(pinfo, info, sizeof(*info)); | ||
315 | p->info = pinfo; | ||
316 | } | ||
317 | |||
318 | dname = dev_name(ddev); | ||
319 | if (isdigit(dname[strlen(dname) - 1])) | ||
320 | dev_set_name(pdev, "%sp%d", dname, partno); | ||
321 | else | ||
322 | dev_set_name(pdev, "%s%d", dname, partno); | ||
323 | |||
324 | device_initialize(pdev); | ||
325 | pdev->class = &block_class; | ||
326 | pdev->type = &part_type; | ||
327 | pdev->parent = ddev; | ||
328 | |||
329 | err = blk_alloc_devt(p, &devt); | ||
330 | if (err) | ||
331 | goto out_free_info; | ||
332 | pdev->devt = devt; | ||
333 | |||
334 | /* delay uevent until 'holders' subdir is created */ | ||
335 | dev_set_uevent_suppress(pdev, 1); | ||
336 | err = device_add(pdev); | ||
337 | if (err) | ||
338 | goto out_put; | ||
339 | |||
340 | err = -ENOMEM; | ||
341 | p->holder_dir = kobject_create_and_add("holders", &pdev->kobj); | ||
342 | if (!p->holder_dir) | ||
343 | goto out_del; | ||
344 | |||
345 | dev_set_uevent_suppress(pdev, 0); | ||
346 | if (flags & ADDPART_FLAG_WHOLEDISK) { | ||
347 | err = device_create_file(pdev, &dev_attr_whole_disk); | ||
348 | if (err) | ||
349 | goto out_del; | ||
350 | } | ||
351 | |||
352 | /* everything is up and running, commence */ | ||
353 | rcu_assign_pointer(ptbl->part[partno], p); | ||
354 | |||
355 | /* suppress uevent if the disk suppresses it */ | ||
356 | if (!dev_get_uevent_suppress(ddev)) | ||
357 | kobject_uevent(&pdev->kobj, KOBJ_ADD); | ||
358 | |||
359 | hd_ref_init(p); | ||
360 | return p; | ||
361 | |||
362 | out_free_info: | ||
363 | free_part_info(p); | ||
364 | out_free_stats: | ||
365 | free_part_stats(p); | ||
366 | out_free: | ||
367 | kfree(p); | ||
368 | return ERR_PTR(err); | ||
369 | out_del: | ||
370 | kobject_put(p->holder_dir); | ||
371 | device_del(pdev); | ||
372 | out_put: | ||
373 | put_device(pdev); | ||
374 | blk_free_devt(devt); | ||
375 | return ERR_PTR(err); | ||
376 | } | ||
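The dev_set_name() branch above is the origin of the familiar block-device naming rule: when the disk name already ends in a digit, a 'p' is inserted before the partition number. A minimal user-space sketch of that rule (the part_name() helper and the sample disk names are illustrative, not taken from the kernel tree):

    #include <ctype.h>
    #include <stdio.h>
    #include <string.h>

    /* Mirror of the naming logic in add_partition(): disks whose name ends
     * in a digit (e.g. mmcblk0) get a 'p' before the partition number,
     * plain names (e.g. sda) do not. */
    static void part_name(const char *disk, int partno, char *buf, size_t len)
    {
            if (isdigit((unsigned char)disk[strlen(disk) - 1]))
                    snprintf(buf, len, "%sp%d", disk, partno);
            else
                    snprintf(buf, len, "%s%d", disk, partno);
    }

    int main(void)
    {
            char buf[32];

            part_name("sda", 1, buf, sizeof(buf));      /* -> sda1 */
            printf("%s\n", buf);
            part_name("mmcblk0", 1, buf, sizeof(buf));  /* -> mmcblk0p1 */
            printf("%s\n", buf);
            return 0;
    }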
377 | |||
378 | static bool disk_unlock_native_capacity(struct gendisk *disk) | ||
379 | { | ||
380 | const struct block_device_operations *bdops = disk->fops; | ||
381 | |||
382 | if (bdops->unlock_native_capacity && | ||
383 | !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) { | ||
384 | printk(KERN_CONT "enabling native capacity\n"); | ||
385 | bdops->unlock_native_capacity(disk); | ||
386 | disk->flags |= GENHD_FL_NATIVE_CAPACITY; | ||
387 | return true; | ||
388 | } else { | ||
389 | printk(KERN_CONT "truncated\n"); | ||
390 | return false; | ||
391 | } | ||
392 | } | ||
393 | |||
394 | static int drop_partitions(struct gendisk *disk, struct block_device *bdev) | ||
395 | { | ||
396 | struct disk_part_iter piter; | ||
397 | struct hd_struct *part; | ||
398 | int res; | ||
399 | |||
400 | if (bdev->bd_part_count) | ||
401 | return -EBUSY; | ||
402 | res = invalidate_partition(disk, 0); | ||
403 | if (res) | ||
404 | return res; | ||
405 | |||
406 | disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); | ||
407 | while ((part = disk_part_iter_next(&piter))) | ||
408 | delete_partition(disk, part->partno); | ||
409 | disk_part_iter_exit(&piter); | ||
410 | |||
411 | return 0; | ||
412 | } | ||
413 | |||
414 | int rescan_partitions(struct gendisk *disk, struct block_device *bdev) | ||
415 | { | ||
416 | struct parsed_partitions *state = NULL; | ||
417 | struct hd_struct *part; | ||
418 | int p, highest, res; | ||
419 | rescan: | ||
420 | if (state && !IS_ERR(state)) { | ||
421 | kfree(state); | ||
422 | state = NULL; | ||
423 | } | ||
424 | |||
425 | res = drop_partitions(disk, bdev); | ||
426 | if (res) | ||
427 | return res; | ||
428 | |||
429 | if (disk->fops->revalidate_disk) | ||
430 | disk->fops->revalidate_disk(disk); | ||
431 | check_disk_size_change(disk, bdev); | ||
432 | bdev->bd_invalidated = 0; | ||
433 | if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) | ||
434 | return 0; | ||
435 | if (IS_ERR(state)) { | ||
436 | /* | ||
437 | * I/O error reading the partition table. If any | ||
438 | * partition code tried to read beyond EOD, retry | ||
439 | * after unlocking native capacity. | ||
440 | */ | ||
441 | if (PTR_ERR(state) == -ENOSPC) { | ||
442 | printk(KERN_WARNING "%s: partition table beyond EOD, ", | ||
443 | disk->disk_name); | ||
444 | if (disk_unlock_native_capacity(disk)) | ||
445 | goto rescan; | ||
446 | } | ||
447 | return -EIO; | ||
448 | } | ||
449 | /* | ||
450 | * If any partition code tried to read beyond EOD, try | ||
451 | * unlocking native capacity even if partition table is | ||
452 | * successfully read as we could be missing some partitions. | ||
453 | */ | ||
454 | if (state->access_beyond_eod) { | ||
455 | printk(KERN_WARNING | ||
456 | "%s: partition table partially beyond EOD, ", | ||
457 | disk->disk_name); | ||
458 | if (disk_unlock_native_capacity(disk)) | ||
459 | goto rescan; | ||
460 | } | ||
461 | |||
462 | /* tell userspace that the media / partition table may have changed */ | ||
463 | kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); | ||
464 | |||
465 | /* Detect the highest partition number and preallocate | ||
466 | * disk->part_tbl. This is an optimization and not strictly | ||
467 | * necessary. | ||
468 | */ | ||
469 | for (p = 1, highest = 0; p < state->limit; p++) | ||
470 | if (state->parts[p].size) | ||
471 | highest = p; | ||
472 | |||
473 | disk_expand_part_tbl(disk, highest); | ||
474 | |||
475 | /* add partitions */ | ||
476 | for (p = 1; p < state->limit; p++) { | ||
477 | sector_t size, from; | ||
478 | struct partition_meta_info *info = NULL; | ||
479 | |||
480 | size = state->parts[p].size; | ||
481 | if (!size) | ||
482 | continue; | ||
483 | |||
484 | from = state->parts[p].from; | ||
485 | if (from >= get_capacity(disk)) { | ||
486 | printk(KERN_WARNING | ||
487 | "%s: p%d start %llu is beyond EOD, ", | ||
488 | disk->disk_name, p, (unsigned long long) from); | ||
489 | if (disk_unlock_native_capacity(disk)) | ||
490 | goto rescan; | ||
491 | continue; | ||
492 | } | ||
493 | |||
494 | if (from + size > get_capacity(disk)) { | ||
495 | printk(KERN_WARNING | ||
496 | "%s: p%d size %llu extends beyond EOD, ", | ||
497 | disk->disk_name, p, (unsigned long long) size); | ||
498 | |||
499 | if (disk_unlock_native_capacity(disk)) { | ||
500 | /* free state and restart */ | ||
501 | goto rescan; | ||
502 | } else { | ||
503 | /* | ||
504 | * we can not ignore partitions of broken tables | ||
505 | * created by for example camera firmware, but | ||
506 | * we limit them to the end of the disk to avoid | ||
507 | * creating invalid block devices | ||
508 | */ | ||
509 | size = get_capacity(disk) - from; | ||
510 | } | ||
511 | } | ||
512 | |||
513 | if (state->parts[p].has_info) | ||
514 | info = &state->parts[p].info; | ||
515 | part = add_partition(disk, p, from, size, | ||
516 | state->parts[p].flags, | ||
517 | &state->parts[p].info); | ||
518 | if (IS_ERR(part)) { | ||
519 | printk(KERN_ERR " %s: p%d could not be added: %ld\n", | ||
520 | disk->disk_name, p, -PTR_ERR(part)); | ||
521 | continue; | ||
522 | } | ||
523 | #ifdef CONFIG_BLK_DEV_MD | ||
524 | if (state->parts[p].flags & ADDPART_FLAG_RAID) | ||
525 | md_autodetect_dev(part_to_dev(part)->devt); | ||
526 | #endif | ||
527 | } | ||
528 | kfree(state); | ||
529 | return 0; | ||
530 | } | ||
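When a partition runs past end-of-disk and disk_unlock_native_capacity() cannot grow the device, the loop above clamps the partition to the remaining capacity instead of dropping it. A small stand-alone sketch of just that arithmetic, with illustrative sector counts:

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t sector_t;

    /* Clamp a partition that extends past end-of-disk, the way
     * rescan_partitions() does when native capacity cannot be unlocked.
     * Returns the usable size, or 0 if the partition starts beyond EOD. */
    static sector_t clamp_to_capacity(sector_t capacity, sector_t from, sector_t size)
    {
            if (from >= capacity)
                    return 0;                /* start beyond EOD: skip entirely */
            if (from + size > capacity)
                    return capacity - from;  /* truncate to the end of the disk */
            return size;
    }

    int main(void)
    {
            /* 1000-sector disk, partition claims sectors 900..1099 -> 100 usable */
            printf("%llu\n", (unsigned long long)clamp_to_capacity(1000, 900, 200));
            return 0;
    }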
531 | |||
532 | int invalidate_partitions(struct gendisk *disk, struct block_device *bdev) | ||
533 | { | ||
534 | int res; | ||
535 | |||
536 | if (!bdev->bd_invalidated) | ||
537 | return 0; | ||
538 | |||
539 | res = drop_partitions(disk, bdev); | ||
540 | if (res) | ||
541 | return res; | ||
542 | |||
543 | set_capacity(disk, 0); | ||
544 | check_disk_size_change(disk, bdev); | ||
545 | bdev->bd_invalidated = 0; | ||
546 | /* tell userspace that the media / partition table may have changed */ | ||
547 | kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); | ||
548 | |||
549 | return 0; | ||
550 | } | ||
551 | |||
552 | unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p) | ||
553 | { | ||
554 | struct address_space *mapping = bdev->bd_inode->i_mapping; | ||
555 | struct page *page; | ||
556 | |||
557 | page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)), | ||
558 | NULL); | ||
559 | if (!IS_ERR(page)) { | ||
560 | if (PageError(page)) | ||
561 | goto fail; | ||
562 | p->v = page; | ||
563 | return (unsigned char *)page_address(page) + ((n & ((1 << (PAGE_CACHE_SHIFT - 9)) - 1)) << 9); | ||
564 | fail: | ||
565 | page_cache_release(page); | ||
566 | } | ||
567 | p->v = NULL; | ||
568 | return NULL; | ||
569 | } | ||
570 | |||
571 | EXPORT_SYMBOL(read_dev_sector); | ||
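read_dev_sector() locates a 512-byte sector inside the page cache: the page index is n >> (PAGE_CACHE_SHIFT - 9) and the byte offset within the page is the remaining low bits of n shifted left by 9. A hedged sketch of that arithmetic, assuming 4 KiB pages (so eight sectors per page):

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SHIFT 12                        /* assumed 4 KiB pages */
    #define SECTORS_PER_PAGE_SHIFT (PAGE_SHIFT - 9)

    int main(void)
    {
            uint64_t n = 13;  /* sector index on the device */

            uint64_t page_index = n >> SECTORS_PER_PAGE_SHIFT;
            uint64_t byte_off   = (n & ((1u << SECTORS_PER_PAGE_SHIFT) - 1)) << 9;

            /* sector 13 -> page 1, byte offset 2560 within that page */
            printf("page %llu, offset %llu\n",
                   (unsigned long long)page_index, (unsigned long long)byte_off);
            return 0;
    }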
diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig deleted file mode 100644 index 75a54e1adbb..00000000000 --- a/block/partitions/Kconfig +++ /dev/null | |||
@@ -1,251 +0,0 @@ | |||
1 | # | ||
2 | # Partition configuration | ||
3 | # | ||
4 | config PARTITION_ADVANCED | ||
5 | bool "Advanced partition selection" | ||
6 | help | ||
7 | Say Y here if you would like to use hard disks under Linux which | ||
8 | were partitioned under an operating system running on a different | ||
9 | architecture than your Linux system. | ||
10 | |||
11 | Note that the answer to this question won't directly affect the | ||
12 | kernel: saying N will just cause the configurator to skip all | ||
13 | the questions about foreign partitioning schemes. | ||
14 | |||
15 | If unsure, say N. | ||
16 | |||
17 | config ACORN_PARTITION | ||
18 | bool "Acorn partition support" if PARTITION_ADVANCED | ||
19 | default y if ARCH_ACORN | ||
20 | help | ||
21 | Support hard disks partitioned under Acorn operating systems. | ||
22 | |||
23 | config ACORN_PARTITION_CUMANA | ||
24 | bool "Cumana partition support" if PARTITION_ADVANCED | ||
25 | default y if ARCH_ACORN | ||
26 | depends on ACORN_PARTITION | ||
27 | help | ||
28 | Say Y here if you would like to use hard disks under Linux which | ||
29 | were partitioned using the Cumana interface on Acorn machines. | ||
30 | |||
31 | config ACORN_PARTITION_EESOX | ||
32 | bool "EESOX partition support" if PARTITION_ADVANCED | ||
33 | default y if ARCH_ACORN | ||
34 | depends on ACORN_PARTITION | ||
35 | |||
36 | config ACORN_PARTITION_ICS | ||
37 | bool "ICS partition support" if PARTITION_ADVANCED | ||
38 | default y if ARCH_ACORN | ||
39 | depends on ACORN_PARTITION | ||
40 | help | ||
41 | Say Y here if you would like to use hard disks under Linux which | ||
42 | were partitioned using the ICS interface on Acorn machines. | ||
43 | |||
44 | config ACORN_PARTITION_ADFS | ||
45 | bool "Native filecore partition support" if PARTITION_ADVANCED | ||
46 | default y if ARCH_ACORN | ||
47 | depends on ACORN_PARTITION | ||
48 | help | ||
49 | The Acorn Disc Filing System is the standard file system of the | ||
50 | RiscOS operating system which runs on Acorn's ARM-based Risc PC | ||
51 | systems and the Acorn Archimedes range of machines. If you say | ||
52 | `Y' here, Linux will support disk partitions created under ADFS. | ||
53 | |||
54 | config ACORN_PARTITION_POWERTEC | ||
55 | bool "PowerTec partition support" if PARTITION_ADVANCED | ||
56 | default y if ARCH_ACORN | ||
57 | depends on ACORN_PARTITION | ||
58 | help | ||
59 | Support reading partition tables created on Acorn machines using | ||
60 | the PowerTec SCSI drive. | ||
61 | |||
62 | config ACORN_PARTITION_RISCIX | ||
63 | bool "RISCiX partition support" if PARTITION_ADVANCED | ||
64 | default y if ARCH_ACORN | ||
65 | depends on ACORN_PARTITION | ||
66 | help | ||
67 | Once upon a time, there was a native Unix port for the Acorn series | ||
68 | of machines called RISCiX. If you say 'Y' here, Linux will be able | ||
69 | to read disks partitioned under RISCiX. | ||
70 | |||
71 | config OSF_PARTITION | ||
72 | bool "Alpha OSF partition support" if PARTITION_ADVANCED | ||
73 | default y if ALPHA | ||
74 | help | ||
75 | Say Y here if you would like to use hard disks under Linux which | ||
76 | were partitioned on an Alpha machine. | ||
77 | |||
78 | config AMIGA_PARTITION | ||
79 | bool "Amiga partition table support" if PARTITION_ADVANCED | ||
80 | default y if (AMIGA || AFFS_FS=y) | ||
81 | help | ||
82 | Say Y here if you would like to use hard disks under Linux which | ||
83 | were partitioned under AmigaOS. | ||
84 | |||
85 | config ATARI_PARTITION | ||
86 | bool "Atari partition table support" if PARTITION_ADVANCED | ||
87 | default y if ATARI | ||
88 | help | ||
89 | Say Y here if you would like to use hard disks under Linux which | ||
90 | were partitioned under the Atari OS. | ||
91 | |||
92 | config IBM_PARTITION | ||
93 | bool "IBM disk label and partition support" | ||
94 | depends on PARTITION_ADVANCED && S390 | ||
95 | help | ||
96 | Say Y here if you would like to be able to read the hard disk | ||
97 | partition table format used by IBM DASD disks operating under CMS. | ||
98 | Otherwise, say N. | ||
99 | |||
100 | config MAC_PARTITION | ||
101 | bool "Macintosh partition map support" if PARTITION_ADVANCED | ||
102 | default y if (MAC || PPC_PMAC) | ||
103 | help | ||
104 | Say Y here if you would like to use hard disks under Linux which | ||
105 | were partitioned on a Macintosh. | ||
106 | |||
107 | config MSDOS_PARTITION | ||
108 | bool "PC BIOS (MSDOS partition tables) support" if PARTITION_ADVANCED | ||
109 | default y | ||
110 | help | ||
111 | Say Y here. | ||
112 | |||
113 | config BSD_DISKLABEL | ||
114 | bool "BSD disklabel (FreeBSD partition tables) support" | ||
115 | depends on PARTITION_ADVANCED && MSDOS_PARTITION | ||
116 | help | ||
117 | FreeBSD uses its own hard disk partition scheme on your PC. It | ||
118 | requires only one entry in the primary partition table of your disk | ||
119 | and manages it similarly to DOS extended partitions, putting in its | ||
120 | first sector a new partition table in BSD disklabel format. Saying Y | ||
121 | here allows you to read these disklabels and further mount FreeBSD | ||
122 | partitions from within Linux if you have also said Y to "UFS | ||
123 | file system support", above. If you don't know what all this is | ||
124 | about, say N. | ||
125 | |||
126 | config MINIX_SUBPARTITION | ||
127 | bool "Minix subpartition support" | ||
128 | depends on PARTITION_ADVANCED && MSDOS_PARTITION | ||
129 | help | ||
130 | Minix 2.0.0/2.0.2 subpartition table support for Linux. | ||
131 | Say Y here if you want to mount and use Minix 2.0.0/2.0.2 | ||
132 | subpartitions. | ||
133 | |||
134 | config SOLARIS_X86_PARTITION | ||
135 | bool "Solaris (x86) partition table support" | ||
136 | depends on PARTITION_ADVANCED && MSDOS_PARTITION | ||
137 | help | ||
138 | Like most systems, Solaris x86 uses its own hard disk partition | ||
139 | table format, incompatible with all others. Saying Y here allows you | ||
140 | to read these partition tables and further mount Solaris x86 | ||
141 | partitions from within Linux if you have also said Y to "UFS | ||
142 | file system support", above. | ||
143 | |||
144 | config UNIXWARE_DISKLABEL | ||
145 | bool "Unixware slices support" | ||
146 | depends on PARTITION_ADVANCED && MSDOS_PARTITION | ||
147 | ---help--- | ||
148 | Like some systems, UnixWare uses its own slice table inside a | ||
149 | partition (VTOC - Virtual Table of Contents). Its format is | ||
150 | incompatible with all other OSes. Saying Y here allows you to read | ||
151 | VTOC and further mount UnixWare partitions read-only from within | ||
152 | Linux if you have also said Y to "UFS file system support" or | ||
153 | "System V and Coherent file system support", above. | ||
154 | |||
155 | This is mainly used to carry data from a UnixWare box to your | ||
156 | Linux box via a removable medium like magneto-optical, ZIP or | ||
157 | removable IDE drives. Note, however, that a good portable way to | ||
158 | transport files and directories between unixes (and even other | ||
159 | operating systems) is given by the tar program ("man tar" or | ||
160 | preferably "info tar"). | ||
161 | |||
162 | If you don't know what all this is about, say N. | ||
163 | |||
164 | config LDM_PARTITION | ||
165 | bool "Windows Logical Disk Manager (Dynamic Disk) support" | ||
166 | depends on PARTITION_ADVANCED | ||
167 | ---help--- | ||
168 | Say Y here if you would like to use hard disks under Linux which | ||
169 | were partitioned using Windows 2000's/XP's or Vista's Logical Disk | ||
170 | Manager. They are also known as "Dynamic Disks". | ||
171 | |||
172 | Note this driver only supports Dynamic Disks with a protective MBR | ||
173 | label, i.e. DOS partition table. It does not support GPT labelled | ||
174 | Dynamic Disks yet as can be created with Vista. | ||
175 | |||
176 | Windows 2000 introduced the concept of Dynamic Disks to get around | ||
177 | the limitations of the PC's partitioning scheme. The Logical Disk | ||
178 | Manager allows the user to repartition a disk and create spanned, | ||
179 | mirrored, striped or RAID volumes, all without the need for | ||
180 | rebooting. | ||
181 | |||
182 | Normal partitions are now called Basic Disks under Windows 2000, XP, | ||
183 | and Vista. | ||
184 | |||
185 | For a fuller description read <file:Documentation/ldm.txt>. | ||
186 | |||
187 | If unsure, say N. | ||
188 | |||
189 | config LDM_DEBUG | ||
190 | bool "Windows LDM extra logging" | ||
191 | depends on LDM_PARTITION | ||
192 | help | ||
193 | Say Y here if you would like LDM to log verbosely. This could be | ||
194 | helpful if the driver doesn't work as expected and you'd like to | ||
195 | report a bug. | ||
196 | |||
197 | If unsure, say N. | ||
198 | |||
199 | config SGI_PARTITION | ||
200 | bool "SGI partition support" if PARTITION_ADVANCED | ||
201 | default y if DEFAULT_SGI_PARTITION | ||
202 | help | ||
203 | Say Y here if you would like to be able to read the hard disk | ||
204 | partition table format used by SGI machines. | ||
205 | |||
206 | config ULTRIX_PARTITION | ||
207 | bool "Ultrix partition table support" if PARTITION_ADVANCED | ||
208 | default y if MACH_DECSTATION | ||
209 | help | ||
210 | Say Y here if you would like to be able to read the hard disk | ||
211 | partition table format used by DEC (now Compaq) Ultrix machines. | ||
212 | Otherwise, say N. | ||
213 | |||
214 | config SUN_PARTITION | ||
215 | bool "Sun partition tables support" if PARTITION_ADVANCED | ||
216 | default y if (SPARC || SUN3 || SUN3X) | ||
217 | ---help--- | ||
218 | Like most systems, SunOS uses its own hard disk partition table | ||
219 | format, incompatible with all others. Saying Y here allows you to | ||
220 | read these partition tables and further mount SunOS partitions from | ||
221 | within Linux if you have also said Y to "UFS file system support", | ||
222 | above. This is mainly used to carry data from a SPARC under SunOS to | ||
223 | your Linux box via a removable medium like magneto-optical or ZIP | ||
224 | drives; note however that a good portable way to transport files and | ||
225 | directories between unixes (and even other operating systems) is | ||
226 | given by the tar program ("man tar" or preferably "info tar"). If | ||
227 | you don't know what all this is about, say N. | ||
228 | |||
229 | config KARMA_PARTITION | ||
230 | bool "Karma Partition support" | ||
231 | depends on PARTITION_ADVANCED | ||
232 | help | ||
233 | Say Y here if you would like to mount the Rio Karma MP3 player, as it | ||
234 | uses a proprietary partition table. | ||
235 | |||
236 | config EFI_PARTITION | ||
237 | bool "EFI GUID Partition support" if PARTITION_ADVANCED | ||
238 | default y | ||
239 | select CRC32 | ||
240 | help | ||
241 | Say Y here if you would like to use hard disks under Linux which | ||
242 | were partitioned using EFI GPT. | ||
243 | |||
244 | config SYSV68_PARTITION | ||
245 | bool "SYSV68 partition table support" if PARTITION_ADVANCED | ||
246 | default y if VME | ||
247 | help | ||
248 | Say Y here if you would like to be able to read the hard disk | ||
249 | partition table format used by Motorola Delta machines (using | ||
250 | sysv68). | ||
251 | Otherwise, say N. | ||
diff --git a/block/partitions/Makefile b/block/partitions/Makefile deleted file mode 100644 index 03af8eac51d..00000000000 --- a/block/partitions/Makefile +++ /dev/null | |||
@@ -1,20 +0,0 @@ | |||
1 | # | ||
2 | # Makefile for the linux kernel. | ||
3 | # | ||
4 | |||
5 | obj-$(CONFIG_BLOCK) := check.o | ||
6 | |||
7 | obj-$(CONFIG_ACORN_PARTITION) += acorn.o | ||
8 | obj-$(CONFIG_AMIGA_PARTITION) += amiga.o | ||
9 | obj-$(CONFIG_ATARI_PARTITION) += atari.o | ||
10 | obj-$(CONFIG_MAC_PARTITION) += mac.o | ||
11 | obj-$(CONFIG_LDM_PARTITION) += ldm.o | ||
12 | obj-$(CONFIG_MSDOS_PARTITION) += msdos.o | ||
13 | obj-$(CONFIG_OSF_PARTITION) += osf.o | ||
14 | obj-$(CONFIG_SGI_PARTITION) += sgi.o | ||
15 | obj-$(CONFIG_SUN_PARTITION) += sun.o | ||
16 | obj-$(CONFIG_ULTRIX_PARTITION) += ultrix.o | ||
17 | obj-$(CONFIG_IBM_PARTITION) += ibm.o | ||
18 | obj-$(CONFIG_EFI_PARTITION) += efi.o | ||
19 | obj-$(CONFIG_KARMA_PARTITION) += karma.o | ||
20 | obj-$(CONFIG_SYSV68_PARTITION) += sysv68.o | ||
diff --git a/block/partitions/acorn.c b/block/partitions/acorn.c deleted file mode 100644 index fbeb697374d..00000000000 --- a/block/partitions/acorn.c +++ /dev/null | |||
@@ -1,556 +0,0 @@ | |||
1 | /* | ||
2 | * linux/fs/partitions/acorn.c | ||
3 | * | ||
4 | * Copyright (c) 1996-2000 Russell King. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | * | ||
10 | * Scan ADFS partitions on hard disk drives. Unfortunately, there | ||
11 | * isn't a standard for partitioning drives on Acorn machines, so | ||
12 | * every single manufacturer of SCSI and IDE cards created their own | ||
13 | * method. | ||
14 | */ | ||
15 | #include <linux/buffer_head.h> | ||
16 | #include <linux/adfs_fs.h> | ||
17 | |||
18 | #include "check.h" | ||
19 | #include "acorn.h" | ||
20 | |||
21 | /* | ||
22 | * Partition types. (Oh for reusability) | ||
23 | */ | ||
24 | #define PARTITION_RISCIX_MFM 1 | ||
25 | #define PARTITION_RISCIX_SCSI 2 | ||
26 | #define PARTITION_LINUX 9 | ||
27 | |||
28 | #if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ | ||
29 | defined(CONFIG_ACORN_PARTITION_ADFS) | ||
30 | static struct adfs_discrecord * | ||
31 | adfs_partition(struct parsed_partitions *state, char *name, char *data, | ||
32 | unsigned long first_sector, int slot) | ||
33 | { | ||
34 | struct adfs_discrecord *dr; | ||
35 | unsigned int nr_sects; | ||
36 | |||
37 | if (adfs_checkbblk(data)) | ||
38 | return NULL; | ||
39 | |||
40 | dr = (struct adfs_discrecord *)(data + 0x1c0); | ||
41 | |||
42 | if (dr->disc_size == 0 && dr->disc_size_high == 0) | ||
43 | return NULL; | ||
44 | |||
45 | nr_sects = (le32_to_cpu(dr->disc_size_high) << 23) | | ||
46 | (le32_to_cpu(dr->disc_size) >> 9); | ||
47 | |||
48 | if (name) { | ||
49 | strlcat(state->pp_buf, " [", PAGE_SIZE); | ||
50 | strlcat(state->pp_buf, name, PAGE_SIZE); | ||
51 | strlcat(state->pp_buf, "]", PAGE_SIZE); | ||
52 | } | ||
53 | put_partition(state, slot, first_sector, nr_sects); | ||
54 | return dr; | ||
55 | } | ||
56 | #endif | ||
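adfs_partition() turns the disc record's split byte count (disc_size plus disc_size_high for the upper 32 bits) into 512-byte sectors: the high word is shifted left by 23 (i.e. 32 - 9) and the low word right by 9. A small worked sketch of that conversion; the function name and the 4 GiB example are illustrative:

    #include <stdint.h>
    #include <stdio.h>

    /* Combine the two ADFS disc-record size fields (a 64-bit byte count
     * split into low/high 32-bit words) into a 512-byte sector count,
     * the same way adfs_partition() does: (high << 23) | (low >> 9). */
    static uint32_t adfs_size_to_sectors(uint32_t disc_size, uint32_t disc_size_high)
    {
            return (disc_size_high << 23) | (disc_size >> 9);
    }

    int main(void)
    {
            /* A 4 GiB disc: byte count 0x100000000 -> high = 1, low = 0,
             * giving 0x800000 (8,388,608) sectors of 512 bytes. */
            printf("%u\n", adfs_size_to_sectors(0x0, 0x1));
            return 0;
    }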
57 | |||
58 | #ifdef CONFIG_ACORN_PARTITION_RISCIX | ||
59 | |||
60 | struct riscix_part { | ||
61 | __le32 start; | ||
62 | __le32 length; | ||
63 | __le32 one; | ||
64 | char name[16]; | ||
65 | }; | ||
66 | |||
67 | struct riscix_record { | ||
68 | __le32 magic; | ||
69 | #define RISCIX_MAGIC cpu_to_le32(0x4a657320) | ||
70 | __le32 date; | ||
71 | struct riscix_part part[8]; | ||
72 | }; | ||
73 | |||
74 | #if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ | ||
75 | defined(CONFIG_ACORN_PARTITION_ADFS) | ||
76 | static int riscix_partition(struct parsed_partitions *state, | ||
77 | unsigned long first_sect, int slot, | ||
78 | unsigned long nr_sects) | ||
79 | { | ||
80 | Sector sect; | ||
81 | struct riscix_record *rr; | ||
82 | |||
83 | rr = read_part_sector(state, first_sect, &sect); | ||
84 | if (!rr) | ||
85 | return -1; | ||
86 | |||
87 | strlcat(state->pp_buf, " [RISCiX]", PAGE_SIZE); | ||
88 | |||
89 | |||
90 | if (rr->magic == RISCIX_MAGIC) { | ||
91 | unsigned long size = nr_sects > 2 ? 2 : nr_sects; | ||
92 | int part; | ||
93 | |||
94 | strlcat(state->pp_buf, " <", PAGE_SIZE); | ||
95 | |||
96 | put_partition(state, slot++, first_sect, size); | ||
97 | for (part = 0; part < 8; part++) { | ||
98 | if (rr->part[part].one && | ||
99 | memcmp(rr->part[part].name, "All\0", 4)) { | ||
100 | put_partition(state, slot++, | ||
101 | le32_to_cpu(rr->part[part].start), | ||
102 | le32_to_cpu(rr->part[part].length)); | ||
103 | strlcat(state->pp_buf, "(", PAGE_SIZE); | ||
104 | strlcat(state->pp_buf, rr->part[part].name, PAGE_SIZE); | ||
105 | strlcat(state->pp_buf, ")", PAGE_SIZE); | ||
106 | } | ||
107 | } | ||
108 | |||
109 | strlcat(state->pp_buf, " >\n", PAGE_SIZE); | ||
110 | } else { | ||
111 | put_partition(state, slot++, first_sect, nr_sects); | ||
112 | } | ||
113 | |||
114 | put_dev_sector(sect); | ||
115 | return slot; | ||
116 | } | ||
117 | #endif | ||
118 | #endif | ||
119 | |||
120 | #define LINUX_NATIVE_MAGIC 0xdeafa1de | ||
121 | #define LINUX_SWAP_MAGIC 0xdeafab1e | ||
122 | |||
123 | struct linux_part { | ||
124 | __le32 magic; | ||
125 | __le32 start_sect; | ||
126 | __le32 nr_sects; | ||
127 | }; | ||
128 | |||
129 | #if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ | ||
130 | defined(CONFIG_ACORN_PARTITION_ADFS) | ||
131 | static int linux_partition(struct parsed_partitions *state, | ||
132 | unsigned long first_sect, int slot, | ||
133 | unsigned long nr_sects) | ||
134 | { | ||
135 | Sector sect; | ||
136 | struct linux_part *linuxp; | ||
137 | unsigned long size = nr_sects > 2 ? 2 : nr_sects; | ||
138 | |||
139 | strlcat(state->pp_buf, " [Linux]", PAGE_SIZE); | ||
140 | |||
141 | put_partition(state, slot++, first_sect, size); | ||
142 | |||
143 | linuxp = read_part_sector(state, first_sect, &sect); | ||
144 | if (!linuxp) | ||
145 | return -1; | ||
146 | |||
147 | strlcat(state->pp_buf, " <", PAGE_SIZE); | ||
148 | while (linuxp->magic == cpu_to_le32(LINUX_NATIVE_MAGIC) || | ||
149 | linuxp->magic == cpu_to_le32(LINUX_SWAP_MAGIC)) { | ||
150 | if (slot == state->limit) | ||
151 | break; | ||
152 | put_partition(state, slot++, first_sect + | ||
153 | le32_to_cpu(linuxp->start_sect), | ||
154 | le32_to_cpu(linuxp->nr_sects)); | ||
155 | linuxp ++; | ||
156 | } | ||
157 | strlcat(state->pp_buf, " >", PAGE_SIZE); | ||
158 | |||
159 | put_dev_sector(sect); | ||
160 | return slot; | ||
161 | } | ||
162 | #endif | ||
163 | |||
164 | #ifdef CONFIG_ACORN_PARTITION_CUMANA | ||
165 | int adfspart_check_CUMANA(struct parsed_partitions *state) | ||
166 | { | ||
167 | unsigned long first_sector = 0; | ||
168 | unsigned int start_blk = 0; | ||
169 | Sector sect; | ||
170 | unsigned char *data; | ||
171 | char *name = "CUMANA/ADFS"; | ||
172 | int first = 1; | ||
173 | int slot = 1; | ||
174 | |||
175 | /* | ||
176 | * Try Cumana style partitions - sector 6 contains ADFS boot block | ||
177 | * with pointer to next 'drive'. | ||
178 | * | ||
179 | * There are unknowns in this code - is the 'cylinder number' of the | ||
180 | * next partition relative to the start of this one - I'm assuming | ||
181 | * it is. | ||
182 | * | ||
183 | * Also, which ID did Cumana use? | ||
184 | * | ||
185 | * This is totally unfinished, and will require more work to get it | ||
186 | * going. Hence it is totally untested. | ||
187 | */ | ||
188 | do { | ||
189 | struct adfs_discrecord *dr; | ||
190 | unsigned int nr_sects; | ||
191 | |||
192 | data = read_part_sector(state, start_blk * 2 + 6, &sect); | ||
193 | if (!data) | ||
194 | return -1; | ||
195 | |||
196 | if (slot == state->limit) | ||
197 | break; | ||
198 | |||
199 | dr = adfs_partition(state, name, data, first_sector, slot++); | ||
200 | if (!dr) | ||
201 | break; | ||
202 | |||
203 | name = NULL; | ||
204 | |||
205 | nr_sects = (data[0x1fd] + (data[0x1fe] << 8)) * | ||
206 | (dr->heads + (dr->lowsector & 0x40 ? 1 : 0)) * | ||
207 | dr->secspertrack; | ||
208 | |||
209 | if (!nr_sects) | ||
210 | break; | ||
211 | |||
212 | first = 0; | ||
213 | first_sector += nr_sects; | ||
214 | start_blk += nr_sects >> (BLOCK_SIZE_BITS - 9); | ||
215 | nr_sects = 0; /* hmm - should be partition size */ | ||
216 | |||
217 | switch (data[0x1fc] & 15) { | ||
218 | case 0: /* No partition / ADFS? */ | ||
219 | break; | ||
220 | |||
221 | #ifdef CONFIG_ACORN_PARTITION_RISCIX | ||
222 | case PARTITION_RISCIX_SCSI: | ||
223 | /* RISCiX - we don't know how to find the next one. */ | ||
224 | slot = riscix_partition(state, first_sector, slot, | ||
225 | nr_sects); | ||
226 | break; | ||
227 | #endif | ||
228 | |||
229 | case PARTITION_LINUX: | ||
230 | slot = linux_partition(state, first_sector, slot, | ||
231 | nr_sects); | ||
232 | break; | ||
233 | } | ||
234 | put_dev_sector(sect); | ||
235 | if (slot == -1) | ||
236 | return -1; | ||
237 | } while (1); | ||
238 | put_dev_sector(sect); | ||
239 | return first ? 0 : 1; | ||
240 | } | ||
241 | #endif | ||
242 | |||
243 | #ifdef CONFIG_ACORN_PARTITION_ADFS | ||
244 | /* | ||
245 | * Purpose: allocate ADFS partitions. | ||
246 | * | ||
247 | * Params : hd - pointer to gendisk structure to store partition info. | ||
248 | * dev - device number to access. | ||
249 | * | ||
250 | * Returns: -1 on error, 0 for no ADFS boot sector, 1 for ok. | ||
251 | * | ||
252 | * Alloc : hda = whole drive | ||
253 | * hda1 = ADFS partition on first drive. | ||
254 | * hda2 = non-ADFS partition. | ||
255 | */ | ||
256 | int adfspart_check_ADFS(struct parsed_partitions *state) | ||
257 | { | ||
258 | unsigned long start_sect, nr_sects, sectscyl, heads; | ||
259 | Sector sect; | ||
260 | unsigned char *data; | ||
261 | struct adfs_discrecord *dr; | ||
262 | unsigned char id; | ||
263 | int slot = 1; | ||
264 | |||
265 | data = read_part_sector(state, 6, &sect); | ||
266 | if (!data) | ||
267 | return -1; | ||
268 | |||
269 | dr = adfs_partition(state, "ADFS", data, 0, slot++); | ||
270 | if (!dr) { | ||
271 | put_dev_sector(sect); | ||
272 | return 0; | ||
273 | } | ||
274 | |||
275 | heads = dr->heads + ((dr->lowsector >> 6) & 1); | ||
276 | sectscyl = dr->secspertrack * heads; | ||
277 | start_sect = ((data[0x1fe] << 8) + data[0x1fd]) * sectscyl; | ||
278 | id = data[0x1fc] & 15; | ||
279 | put_dev_sector(sect); | ||
280 | |||
281 | /* | ||
282 | * Work out start of non-adfs partition. | ||
283 | */ | ||
284 | nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect; | ||
285 | |||
286 | if (start_sect) { | ||
287 | switch (id) { | ||
288 | #ifdef CONFIG_ACORN_PARTITION_RISCIX | ||
289 | case PARTITION_RISCIX_SCSI: | ||
290 | case PARTITION_RISCIX_MFM: | ||
291 | slot = riscix_partition(state, start_sect, slot, | ||
292 | nr_sects); | ||
293 | break; | ||
294 | #endif | ||
295 | |||
296 | case PARTITION_LINUX: | ||
297 | slot = linux_partition(state, start_sect, slot, | ||
298 | nr_sects); | ||
299 | break; | ||
300 | } | ||
301 | } | ||
302 | strlcat(state->pp_buf, "\n", PAGE_SIZE); | ||
303 | return 1; | ||
304 | } | ||
305 | #endif | ||
306 | |||
307 | #ifdef CONFIG_ACORN_PARTITION_ICS | ||
308 | |||
309 | struct ics_part { | ||
310 | __le32 start; | ||
311 | __le32 size; | ||
312 | }; | ||
313 | |||
314 | static int adfspart_check_ICSLinux(struct parsed_partitions *state, | ||
315 | unsigned long block) | ||
316 | { | ||
317 | Sector sect; | ||
318 | unsigned char *data = read_part_sector(state, block, &sect); | ||
319 | int result = 0; | ||
320 | |||
321 | if (data) { | ||
322 | if (memcmp(data, "LinuxPart", 9) == 0) | ||
323 | result = 1; | ||
324 | put_dev_sector(sect); | ||
325 | } | ||
326 | |||
327 | return result; | ||
328 | } | ||
329 | |||
330 | /* | ||
331 | * Check for a valid ICS partition using the checksum. | ||
332 | */ | ||
333 | static inline int valid_ics_sector(const unsigned char *data) | ||
334 | { | ||
335 | unsigned long sum; | ||
336 | int i; | ||
337 | |||
338 | for (i = 0, sum = 0x50617274; i < 508; i++) | ||
339 | sum += data[i]; | ||
340 | |||
341 | sum -= le32_to_cpu(*(__le32 *)(&data[508])); | ||
342 | |||
343 | return sum == 0; | ||
344 | } | ||
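valid_ics_sector() seeds the sum with 0x50617274 (ASCII "Part"), adds the first 508 bytes, and expects the little-endian 32-bit word at offset 508 to cancel the total. A sketch of the matching generator side, assuming the same layout (this is not a tool from the kernel tree):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Store the ICS checksum so that
     *   0x50617274 + sum(data[0..507]) - le32(data[508]) == 0,
     * which is exactly the condition valid_ics_sector() tests. */
    static void ics_set_checksum(unsigned char data[512])
    {
            uint32_t sum = 0x50617274;      /* ASCII "Part" */
            int i;

            for (i = 0; i < 508; i++)
                    sum += data[i];

            data[508] = sum & 0xff;         /* little-endian store */
            data[509] = (sum >> 8) & 0xff;
            data[510] = (sum >> 16) & 0xff;
            data[511] = (sum >> 24) & 0xff;
    }

    int main(void)
    {
            unsigned char sector[512];

            memset(sector, 0xab, sizeof(sector));
            ics_set_checksum(sector);
            printf("checksum bytes: %02x %02x %02x %02x\n",
                   sector[508], sector[509], sector[510], sector[511]);
            return 0;
    }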
345 | |||
346 | /* | ||
347 | * Purpose: allocate ICS partitions. | ||
348 | * Params : hd - pointer to gendisk structure to store partition info. | ||
349 | * dev - device number to access. | ||
350 | * Returns: -1 on error, 0 for no ICS table, 1 for partitions ok. | ||
351 | * Alloc : hda = whole drive | ||
352 | * hda1 = ADFS partition 0 on first drive. | ||
353 | * hda2 = ADFS partition 1 on first drive. | ||
354 | * ..etc.. | ||
355 | */ | ||
356 | int adfspart_check_ICS(struct parsed_partitions *state) | ||
357 | { | ||
358 | const unsigned char *data; | ||
359 | const struct ics_part *p; | ||
360 | int slot; | ||
361 | Sector sect; | ||
362 | |||
363 | /* | ||
364 | * Try ICS style partitions - sector 0 contains partition info. | ||
365 | */ | ||
366 | data = read_part_sector(state, 0, &sect); | ||
367 | if (!data) | ||
368 | return -1; | ||
369 | |||
370 | if (!valid_ics_sector(data)) { | ||
371 | put_dev_sector(sect); | ||
372 | return 0; | ||
373 | } | ||
374 | |||
375 | strlcat(state->pp_buf, " [ICS]", PAGE_SIZE); | ||
376 | |||
377 | for (slot = 1, p = (const struct ics_part *)data; p->size; p++) { | ||
378 | u32 start = le32_to_cpu(p->start); | ||
379 | s32 size = le32_to_cpu(p->size); /* yes, it's signed. */ | ||
380 | |||
381 | if (slot == state->limit) | ||
382 | break; | ||
383 | |||
384 | /* | ||
385 | * Negative sizes tell the RISC OS ICS driver to ignore | ||
386 | * this partition - in effect it says that this does not | ||
387 | * contain an ADFS filesystem. | ||
388 | */ | ||
389 | if (size < 0) { | ||
390 | size = -size; | ||
391 | |||
392 | /* | ||
393 | * Our own extension - We use the first sector | ||
394 | * of the partition to identify what type this | ||
395 | * partition is. We must not make this visible | ||
396 | * to the filesystem. | ||
397 | */ | ||
398 | if (size > 1 && adfspart_check_ICSLinux(state, start)) { | ||
399 | start += 1; | ||
400 | size -= 1; | ||
401 | } | ||
402 | } | ||
403 | |||
404 | if (size) | ||
405 | put_partition(state, slot++, start, size); | ||
406 | } | ||
407 | |||
408 | put_dev_sector(sect); | ||
409 | strlcat(state->pp_buf, "\n", PAGE_SIZE); | ||
410 | return 1; | ||
411 | } | ||
412 | #endif | ||
413 | |||
414 | #ifdef CONFIG_ACORN_PARTITION_POWERTEC | ||
415 | struct ptec_part { | ||
416 | __le32 unused1; | ||
417 | __le32 unused2; | ||
418 | __le32 start; | ||
419 | __le32 size; | ||
420 | __le32 unused5; | ||
421 | char type[8]; | ||
422 | }; | ||
423 | |||
424 | static inline int valid_ptec_sector(const unsigned char *data) | ||
425 | { | ||
426 | unsigned char checksum = 0x2a; | ||
427 | int i; | ||
428 | |||
429 | /* | ||
430 | * If it looks like a PC/BIOS partition, then it | ||
431 | * probably isn't PowerTec. | ||
432 | */ | ||
433 | if (data[510] == 0x55 && data[511] == 0xaa) | ||
434 | return 0; | ||
435 | |||
436 | for (i = 0; i < 511; i++) | ||
437 | checksum += data[i]; | ||
438 | |||
439 | return checksum == data[511]; | ||
440 | } | ||
441 | |||
442 | /* | ||
443 | * Purpose: allocate PowerTec partitions. | ||
444 | * Params : hd - pointer to gendisk structure to store partition info. | ||
445 | * dev - device number to access. | ||
446 | * Returns: -1 on error, 0 for no PowerTec table, 1 for partitions ok. | ||
447 | * Alloc : hda = whole drive | ||
448 | * hda1 = ADFS partition 0 on first drive. | ||
449 | * hda2 = ADFS partition 1 on first drive. | ||
450 | * ..etc.. | ||
451 | */ | ||
452 | int adfspart_check_POWERTEC(struct parsed_partitions *state) | ||
453 | { | ||
454 | Sector sect; | ||
455 | const unsigned char *data; | ||
456 | const struct ptec_part *p; | ||
457 | int slot = 1; | ||
458 | int i; | ||
459 | |||
460 | data = read_part_sector(state, 0, &sect); | ||
461 | if (!data) | ||
462 | return -1; | ||
463 | |||
464 | if (!valid_ptec_sector(data)) { | ||
465 | put_dev_sector(sect); | ||
466 | return 0; | ||
467 | } | ||
468 | |||
469 | strlcat(state->pp_buf, " [POWERTEC]", PAGE_SIZE); | ||
470 | |||
471 | for (i = 0, p = (const struct ptec_part *)data; i < 12; i++, p++) { | ||
472 | u32 start = le32_to_cpu(p->start); | ||
473 | u32 size = le32_to_cpu(p->size); | ||
474 | |||
475 | if (size) | ||
476 | put_partition(state, slot++, start, size); | ||
477 | } | ||
478 | |||
479 | put_dev_sector(sect); | ||
480 | strlcat(state->pp_buf, "\n", PAGE_SIZE); | ||
481 | return 1; | ||
482 | } | ||
483 | #endif | ||
484 | |||
485 | #ifdef CONFIG_ACORN_PARTITION_EESOX | ||
486 | struct eesox_part { | ||
487 | char magic[6]; | ||
488 | char name[10]; | ||
489 | __le32 start; | ||
490 | __le32 unused6; | ||
491 | __le32 unused7; | ||
492 | __le32 unused8; | ||
493 | }; | ||
494 | |||
495 | /* | ||
496 | * Guess who created this format? | ||
497 | */ | ||
498 | static const char eesox_name[] = { | ||
499 | 'N', 'e', 'i', 'l', ' ', | ||
500 | 'C', 'r', 'i', 't', 'c', 'h', 'e', 'l', 'l', ' ', ' ' | ||
501 | }; | ||
502 | |||
503 | /* | ||
504 | * EESOX SCSI partition format. | ||
505 | * | ||
506 | * This is a goddamned awful partition format. We don't seem to store | ||
507 | * the size of the partition in this table, only the start addresses. | ||
508 | * | ||
509 | * There are two possibilities where the size comes from: | ||
510 | * 1. The individual ADFS boot block entries that are placed on the disk. | ||
511 | * 2. The start address of the next entry. | ||
512 | */ | ||
513 | int adfspart_check_EESOX(struct parsed_partitions *state) | ||
514 | { | ||
515 | Sector sect; | ||
516 | const unsigned char *data; | ||
517 | unsigned char buffer[256]; | ||
518 | struct eesox_part *p; | ||
519 | sector_t start = 0; | ||
520 | int i, slot = 1; | ||
521 | |||
522 | data = read_part_sector(state, 7, &sect); | ||
523 | if (!data) | ||
524 | return -1; | ||
525 | |||
526 | /* | ||
527 | * "Decrypt" the partition table. God knows why... | ||
528 | */ | ||
529 | for (i = 0; i < 256; i++) | ||
530 | buffer[i] = data[i] ^ eesox_name[i & 15]; | ||
531 | |||
532 | put_dev_sector(sect); | ||
533 | |||
534 | for (i = 0, p = (struct eesox_part *)buffer; i < 8; i++, p++) { | ||
535 | sector_t next; | ||
536 | |||
537 | if (memcmp(p->magic, "Eesox", 6)) | ||
538 | break; | ||
539 | |||
540 | next = le32_to_cpu(p->start); | ||
541 | if (i) | ||
542 | put_partition(state, slot++, start, next - start); | ||
543 | start = next; | ||
544 | } | ||
545 | |||
546 | if (i != 0) { | ||
547 | sector_t size; | ||
548 | |||
549 | size = get_capacity(state->bdev->bd_disk); | ||
550 | put_partition(state, slot++, start, size - start); | ||
551 | strlcat(state->pp_buf, "\n", PAGE_SIZE); | ||
552 | } | ||
553 | |||
554 | return i ? 1 : 0; | ||
555 | } | ||
556 | #endif | ||
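The EESOX code above first XORs the sector with the repeating 16-byte eesox_name key and only then parses the entries, taking each partition's size from the start of the next one. A stripped-down sketch of the decode step (helper name and demo buffer are illustrative):

    #include <stddef.h>
    #include <stdio.h>

    /* XOR-"decrypt" an EESOX partition sector with the repeating 16-byte
     * key, as adfspart_check_EESOX() does before parsing the entries. */
    static void eesox_decode(const unsigned char *raw, unsigned char *out, size_t len)
    {
            static const char key[16] = {
                    'N', 'e', 'i', 'l', ' ',
                    'C', 'r', 'i', 't', 'c', 'h', 'e', 'l', 'l', ' ', ' '
            };
            size_t i;

            for (i = 0; i < len; i++)
                    out[i] = raw[i] ^ key[i & 15];
    }

    int main(void)
    {
            unsigned char raw[32] = { 0 }, out[32];

            /* Decoding an all-zero buffer just reproduces the key pattern. */
            eesox_decode(raw, out, sizeof(out));
            printf("%.4s...\n", (const char *)out);   /* prints "Neil..." */
            return 0;
    }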
diff --git a/block/partitions/acorn.h b/block/partitions/acorn.h deleted file mode 100644 index ede82852969..00000000000 --- a/block/partitions/acorn.h +++ /dev/null | |||
@@ -1,14 +0,0 @@ | |||
1 | /* | ||
2 | * linux/fs/partitions/acorn.h | ||
3 | * | ||
4 | * Copyright (C) 1996-2001 Russell King. | ||
5 | * | ||
6 | * I _hate_ this partitioning mess - why can't we have one defined | ||
7 | * format, and everyone stick to it? | ||
8 | */ | ||
9 | |||
10 | int adfspart_check_CUMANA(struct parsed_partitions *state); | ||
11 | int adfspart_check_ADFS(struct parsed_partitions *state); | ||
12 | int adfspart_check_ICS(struct parsed_partitions *state); | ||
13 | int adfspart_check_POWERTEC(struct parsed_partitions *state); | ||
14 | int adfspart_check_EESOX(struct parsed_partitions *state); | ||
diff --git a/block/partitions/amiga.c b/block/partitions/amiga.c deleted file mode 100644 index 70cbf44a156..00000000000 --- a/block/partitions/amiga.c +++ /dev/null | |||
@@ -1,139 +0,0 @@ | |||
1 | /* | ||
2 | * fs/partitions/amiga.c | ||
3 | * | ||
4 | * Code extracted from drivers/block/genhd.c | ||
5 | * | ||
6 | * Copyright (C) 1991-1998 Linus Torvalds | ||
7 | * Re-organised Feb 1998 Russell King | ||
8 | */ | ||
9 | |||
10 | #include <linux/types.h> | ||
11 | #include <linux/affs_hardblocks.h> | ||
12 | |||
13 | #include "check.h" | ||
14 | #include "amiga.h" | ||
15 | |||
16 | static __inline__ u32 | ||
17 | checksum_block(__be32 *m, int size) | ||
18 | { | ||
19 | u32 sum = 0; | ||
20 | |||
21 | while (size--) | ||
22 | sum += be32_to_cpu(*m++); | ||
23 | return sum; | ||
24 | } | ||
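checksum_block() sums big-endian 32-bit words, and the callers accept a block only when that sum wraps to zero, so the on-disk checksum word is effectively the two's-complement negation of the sum of the other words. A hedged stand-alone sketch of the same check (helper names and the two-word demo block are illustrative):

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t be32_load(const unsigned char *p)
    {
            return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
                   ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
    }

    /* amiga.c accepts a block when the 32-bit sum of its first 'longs'
     * big-endian words is zero. */
    static int rdb_block_valid(const unsigned char *blk, unsigned int longs)
    {
            uint32_t sum = 0;
            unsigned int i;

            for (i = 0; i < longs; i++)
                    sum += be32_load(blk + 4 * i);
            return sum == 0;
    }

    int main(void)
    {
            /* Two words: 0x00000005 followed by its negation 0xFFFFFFFB. */
            unsigned char blk[8] = { 0, 0, 0, 5, 0xff, 0xff, 0xff, 0xfb };

            printf("valid: %d\n", rdb_block_valid(blk, 2));   /* prints 1 */
            return 0;
    }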
25 | |||
26 | int amiga_partition(struct parsed_partitions *state) | ||
27 | { | ||
28 | Sector sect; | ||
29 | unsigned char *data; | ||
30 | struct RigidDiskBlock *rdb; | ||
31 | struct PartitionBlock *pb; | ||
32 | int start_sect, nr_sects, blk, part, res = 0; | ||
33 | int blksize = 1; /* Multiplier for disk block size */ | ||
34 | int slot = 1; | ||
35 | char b[BDEVNAME_SIZE]; | ||
36 | |||
37 | for (blk = 0; ; blk++, put_dev_sector(sect)) { | ||
38 | if (blk == RDB_ALLOCATION_LIMIT) | ||
39 | goto rdb_done; | ||
40 | data = read_part_sector(state, blk, &sect); | ||
41 | if (!data) { | ||
42 | if (warn_no_part) | ||
43 | printk("Dev %s: unable to read RDB block %d\n", | ||
44 | bdevname(state->bdev, b), blk); | ||
45 | res = -1; | ||
46 | goto rdb_done; | ||
47 | } | ||
48 | if (*(__be32 *)data != cpu_to_be32(IDNAME_RIGIDDISK)) | ||
49 | continue; | ||
50 | |||
51 | rdb = (struct RigidDiskBlock *)data; | ||
52 | if (checksum_block((__be32 *)data, be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F) == 0) | ||
53 | break; | ||
54 | /* Try again with 0xdc..0xdf zeroed, Windows might have | ||
55 | * trashed it. | ||
56 | */ | ||
57 | *(__be32 *)(data+0xdc) = 0; | ||
58 | if (checksum_block((__be32 *)data, | ||
59 | be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F)==0) { | ||
60 | printk("Warning: Trashed word at 0xd0 in block %d " | ||
61 | "ignored in checksum calculation\n",blk); | ||
62 | break; | ||
63 | } | ||
64 | |||
65 | printk("Dev %s: RDB in block %d has bad checksum\n", | ||
66 | bdevname(state->bdev, b), blk); | ||
67 | } | ||
68 | |||
69 | /* blksize is blocks per 512 byte standard block */ | ||
70 | blksize = be32_to_cpu( rdb->rdb_BlockBytes ) / 512; | ||
71 | |||
72 | { | ||
73 | char tmp[7 + 10 + 1 + 1]; | ||
74 | |||
75 | /* Be more informative */ | ||
76 | snprintf(tmp, sizeof(tmp), " RDSK (%d)", blksize * 512); | ||
77 | strlcat(state->pp_buf, tmp, PAGE_SIZE); | ||
78 | } | ||
79 | blk = be32_to_cpu(rdb->rdb_PartitionList); | ||
80 | put_dev_sector(sect); | ||
81 | for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) { | ||
82 | blk *= blksize; /* Read in terms partition table understands */ | ||
83 | data = read_part_sector(state, blk, &sect); | ||
84 | if (!data) { | ||
85 | if (warn_no_part) | ||
86 | printk("Dev %s: unable to read partition block %d\n", | ||
87 | bdevname(state->bdev, b), blk); | ||
88 | res = -1; | ||
89 | goto rdb_done; | ||
90 | } | ||
91 | pb = (struct PartitionBlock *)data; | ||
92 | blk = be32_to_cpu(pb->pb_Next); | ||
93 | if (pb->pb_ID != cpu_to_be32(IDNAME_PARTITION)) | ||
94 | continue; | ||
95 | if (checksum_block((__be32 *)pb, be32_to_cpu(pb->pb_SummedLongs) & 0x7F) != 0 ) | ||
96 | continue; | ||
97 | |||
98 | /* Tell Kernel about it */ | ||
99 | |||
100 | nr_sects = (be32_to_cpu(pb->pb_Environment[10]) + 1 - | ||
101 | be32_to_cpu(pb->pb_Environment[9])) * | ||
102 | be32_to_cpu(pb->pb_Environment[3]) * | ||
103 | be32_to_cpu(pb->pb_Environment[5]) * | ||
104 | blksize; | ||
105 | if (!nr_sects) | ||
106 | continue; | ||
107 | start_sect = be32_to_cpu(pb->pb_Environment[9]) * | ||
108 | be32_to_cpu(pb->pb_Environment[3]) * | ||
109 | be32_to_cpu(pb->pb_Environment[5]) * | ||
110 | blksize; | ||
111 | put_partition(state,slot++,start_sect,nr_sects); | ||
112 | { | ||
113 | /* Be even more informative to aid mounting */ | ||
114 | char dostype[4]; | ||
115 | char tmp[42]; | ||
116 | |||
117 | __be32 *dt = (__be32 *)dostype; | ||
118 | *dt = pb->pb_Environment[16]; | ||
119 | if (dostype[3] < ' ') | ||
120 | snprintf(tmp, sizeof(tmp), " (%c%c%c^%c)", | ||
121 | dostype[0], dostype[1], | ||
122 | dostype[2], dostype[3] + '@' ); | ||
123 | else | ||
124 | snprintf(tmp, sizeof(tmp), " (%c%c%c%c)", | ||
125 | dostype[0], dostype[1], | ||
126 | dostype[2], dostype[3]); | ||
127 | strlcat(state->pp_buf, tmp, PAGE_SIZE); | ||
128 | snprintf(tmp, sizeof(tmp), "(res %d spb %d)", | ||
129 | be32_to_cpu(pb->pb_Environment[6]), | ||
130 | be32_to_cpu(pb->pb_Environment[4])); | ||
131 | strlcat(state->pp_buf, tmp, PAGE_SIZE); | ||
132 | } | ||
133 | res = 1; | ||
134 | } | ||
135 | strlcat(state->pp_buf, "\n", PAGE_SIZE); | ||
136 | |||
137 | rdb_done: | ||
138 | return res; | ||
139 | } | ||
diff --git a/block/partitions/amiga.h b/block/partitions/amiga.h deleted file mode 100644 index d094585cada..00000000000 --- a/block/partitions/amiga.h +++ /dev/null | |||
@@ -1,6 +0,0 @@ | |||
1 | /* | ||
2 | * fs/partitions/amiga.h | ||
3 | */ | ||
4 | |||
5 | int amiga_partition(struct parsed_partitions *state); | ||
6 | |||
diff --git a/block/partitions/atari.c b/block/partitions/atari.c deleted file mode 100644 index 9875b05e80a..00000000000 --- a/block/partitions/atari.c +++ /dev/null | |||
@@ -1,149 +0,0 @@ | |||
1 | /* | ||
2 | * fs/partitions/atari.c | ||
3 | * | ||
4 | * Code extracted from drivers/block/genhd.c | ||
5 | * | ||
6 | * Copyright (C) 1991-1998 Linus Torvalds | ||
7 | * Re-organised Feb 1998 Russell King | ||
8 | */ | ||
9 | |||
10 | #include <linux/ctype.h> | ||
11 | #include "check.h" | ||
12 | #include "atari.h" | ||
13 | |||
14 | /* ++guenther: this should be settable by the user ("make config")?. | ||
15 | */ | ||
16 | #define ICD_PARTS | ||
17 | |||
18 | /* check if a partition entry looks valid -- Atari format is assumed if at | ||
19 | least one of the primary entries is ok this way */ | ||
20 | #define VALID_PARTITION(pi,hdsiz) \ | ||
21 | (((pi)->flg & 1) && \ | ||
22 | isalnum((pi)->id[0]) && isalnum((pi)->id[1]) && isalnum((pi)->id[2]) && \ | ||
23 | be32_to_cpu((pi)->st) <= (hdsiz) && \ | ||
24 | be32_to_cpu((pi)->st) + be32_to_cpu((pi)->siz) <= (hdsiz)) | ||
25 | |||
26 | static inline int OK_id(char *s) | ||
27 | { | ||
28 | return memcmp (s, "GEM", 3) == 0 || memcmp (s, "BGM", 3) == 0 || | ||
29 | memcmp (s, "LNX", 3) == 0 || memcmp (s, "SWP", 3) == 0 || | ||
30 | memcmp (s, "RAW", 3) == 0 ; | ||
31 | } | ||
32 | |||
33 | int atari_partition(struct parsed_partitions *state) | ||
34 | { | ||
35 | Sector sect; | ||
36 | struct rootsector *rs; | ||
37 | struct partition_info *pi; | ||
38 | u32 extensect; | ||
39 | u32 hd_size; | ||
40 | int slot; | ||
41 | #ifdef ICD_PARTS | ||
42 | int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */ | ||
43 | #endif | ||
44 | |||
45 | rs = read_part_sector(state, 0, &sect); | ||
46 | if (!rs) | ||
47 | return -1; | ||
48 | |||
49 | /* Verify this is an Atari rootsector: */ | ||
50 | hd_size = state->bdev->bd_inode->i_size >> 9; | ||
51 | if (!VALID_PARTITION(&rs->part[0], hd_size) && | ||
52 | !VALID_PARTITION(&rs->part[1], hd_size) && | ||
53 | !VALID_PARTITION(&rs->part[2], hd_size) && | ||
54 | !VALID_PARTITION(&rs->part[3], hd_size)) { | ||
55 | /* | ||
56 | * if there's no valid primary partition, assume that no Atari | ||
57 | * format partition table (there's no reliable magic or the like | ||
58 | * :-() | ||
59 | */ | ||
60 | put_dev_sector(sect); | ||
61 | return 0; | ||
62 | } | ||
63 | |||
64 | pi = &rs->part[0]; | ||
65 | strlcat(state->pp_buf, " AHDI", PAGE_SIZE); | ||
66 | for (slot = 1; pi < &rs->part[4] && slot < state->limit; slot++, pi++) { | ||
67 | struct rootsector *xrs; | ||
68 | Sector sect2; | ||
69 | ulong partsect; | ||
70 | |||
71 | if ( !(pi->flg & 1) ) | ||
72 | continue; | ||
73 | /* active partition */ | ||
74 | if (memcmp (pi->id, "XGM", 3) != 0) { | ||
75 | /* we don't care about other id's */ | ||
76 | put_partition (state, slot, be32_to_cpu(pi->st), | ||
77 | be32_to_cpu(pi->siz)); | ||
78 | continue; | ||
79 | } | ||
80 | /* extension partition */ | ||
81 | #ifdef ICD_PARTS | ||
82 | part_fmt = 1; | ||
83 | #endif | ||
84 | strlcat(state->pp_buf, " XGM<", PAGE_SIZE); | ||
85 | partsect = extensect = be32_to_cpu(pi->st); | ||
86 | while (1) { | ||
87 | xrs = read_part_sector(state, partsect, &sect2); | ||
88 | if (!xrs) { | ||
89 | printk (" block %ld read failed\n", partsect); | ||
90 | put_dev_sector(sect); | ||
91 | return -1; | ||
92 | } | ||
93 | |||
94 | /* ++roman: sanity check: bit 0 of flg field must be set */ | ||
95 | if (!(xrs->part[0].flg & 1)) { | ||
96 | printk( "\nFirst sub-partition in extended partition is not valid!\n" ); | ||
97 | put_dev_sector(sect2); | ||
98 | break; | ||
99 | } | ||
100 | |||
101 | put_partition(state, slot, | ||
102 | partsect + be32_to_cpu(xrs->part[0].st), | ||
103 | be32_to_cpu(xrs->part[0].siz)); | ||
104 | |||
105 | if (!(xrs->part[1].flg & 1)) { | ||
106 | /* end of linked partition list */ | ||
107 | put_dev_sector(sect2); | ||
108 | break; | ||
109 | } | ||
110 | if (memcmp( xrs->part[1].id, "XGM", 3 ) != 0) { | ||
111 | printk("\nID of extended partition is not XGM!\n"); | ||
112 | put_dev_sector(sect2); | ||
113 | break; | ||
114 | } | ||
115 | |||
116 | partsect = be32_to_cpu(xrs->part[1].st) + extensect; | ||
117 | put_dev_sector(sect2); | ||
118 | if (++slot == state->limit) { | ||
119 | printk( "\nMaximum number of partitions reached!\n" ); | ||
120 | break; | ||
121 | } | ||
122 | } | ||
123 | strlcat(state->pp_buf, " >", PAGE_SIZE); | ||
124 | } | ||
125 | #ifdef ICD_PARTS | ||
126 | if ( part_fmt!=1 ) { /* no extended partitions -> test ICD-format */ | ||
127 | pi = &rs->icdpart[0]; | ||
128 | /* sanity check: no ICD format if first partition invalid */ | ||
129 | if (OK_id(pi->id)) { | ||
130 | strlcat(state->pp_buf, " ICD<", PAGE_SIZE); | ||
131 | for (; pi < &rs->icdpart[8] && slot < state->limit; slot++, pi++) { | ||
132 | /* accept only GEM,BGM,RAW,LNX,SWP partitions */ | ||
133 | if (!((pi->flg & 1) && OK_id(pi->id))) | ||
134 | continue; | ||
135 | part_fmt = 2; | ||
136 | put_partition (state, slot, | ||
137 | be32_to_cpu(pi->st), | ||
138 | be32_to_cpu(pi->siz)); | ||
139 | } | ||
140 | strlcat(state->pp_buf, " >", PAGE_SIZE); | ||
141 | } | ||
142 | } | ||
143 | #endif | ||
144 | put_dev_sector(sect); | ||
145 | |||
146 | strlcat(state->pp_buf, "\n", PAGE_SIZE); | ||
147 | |||
148 | return 1; | ||
149 | } | ||
diff --git a/block/partitions/atari.h b/block/partitions/atari.h deleted file mode 100644 index fe2d32a89f3..00000000000 --- a/block/partitions/atari.h +++ /dev/null | |||
@@ -1,34 +0,0 @@ | |||
1 | /* | ||
2 | * fs/partitions/atari.h | ||
3 | * Moved by Russell King from: | ||
4 | * | ||
5 | * linux/include/linux/atari_rootsec.h | ||
6 | * definitions for Atari Rootsector layout | ||
7 | * by Andreas Schwab (schwab@ls5.informatik.uni-dortmund.de) | ||
8 | * | ||
9 | * modified for ICD/Supra partitioning scheme restricted to at most 12 | ||
10 | * partitions | ||
11 | * by Guenther Kelleter (guenther@pool.informatik.rwth-aachen.de) | ||
12 | */ | ||
13 | |||
14 | struct partition_info | ||
15 | { | ||
16 | u8 flg; /* bit 0: active; bit 7: bootable */ | ||
17 | char id[3]; /* "GEM", "BGM", "XGM", or other */ | ||
18 | __be32 st; /* start of partition */ | ||
19 | __be32 siz; /* length of partition */ | ||
20 | }; | ||
21 | |||
22 | struct rootsector | ||
23 | { | ||
24 | char unused[0x156]; /* room for boot code */ | ||
25 | struct partition_info icdpart[8]; /* info for ICD-partitions 5..12 */ | ||
26 | char unused2[0xc]; | ||
27 | u32 hd_siz; /* size of disk in blocks */ | ||
28 | struct partition_info part[4]; | ||
29 | u32 bsl_st; /* start of bad sector list */ | ||
30 | u32 bsl_cnt; /* length of bad sector list */ | ||
31 | u16 checksum; /* checksum for bootable disks */ | ||
32 | } __attribute__((__packed__)); | ||
33 | |||
34 | int atari_partition(struct parsed_partitions *state); | ||
diff --git a/block/partitions/check.c b/block/partitions/check.c deleted file mode 100644 index bc908672c97..00000000000 --- a/block/partitions/check.c +++ /dev/null | |||
@@ -1,166 +0,0 @@ | |||
1 | /* | ||
2 | * fs/partitions/check.c | ||
3 | * | ||
4 | * Code extracted from drivers/block/genhd.c | ||
5 | * Copyright (C) 1991-1998 Linus Torvalds | ||
6 | * Re-organised Feb 1998 Russell King | ||
7 | * | ||
8 | * We now have independent partition support from the | ||
9 | * block drivers, which allows all the partition code to | ||
10 | * be grouped in one location, and it to be mostly self | ||
11 | * contained. | ||
12 | * | ||
13 | * Added needed MAJORS for new pairs, {hdi,hdj}, {hdk,hdl} | ||
14 | */ | ||
15 | |||
16 | #include <linux/slab.h> | ||
17 | #include <linux/ctype.h> | ||
18 | #include <linux/genhd.h> | ||
19 | |||
20 | #include "check.h" | ||
21 | |||
22 | #include "acorn.h" | ||
23 | #include "amiga.h" | ||
24 | #include "atari.h" | ||
25 | #include "ldm.h" | ||
26 | #include "mac.h" | ||
27 | #include "msdos.h" | ||
28 | #include "osf.h" | ||
29 | #include "sgi.h" | ||
30 | #include "sun.h" | ||
31 | #include "ibm.h" | ||
32 | #include "ultrix.h" | ||
33 | #include "efi.h" | ||
34 | #include "karma.h" | ||
35 | #include "sysv68.h" | ||
36 | |||
37 | int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/ | ||
38 | |||
39 | static int (*check_part[])(struct parsed_partitions *) = { | ||
40 | /* | ||
41 | * Probe partition formats with tables at disk address 0 | ||
42 | * that also have an ADFS boot block at 0xdc0. | ||
43 | */ | ||
44 | #ifdef CONFIG_ACORN_PARTITION_ICS | ||
45 | adfspart_check_ICS, | ||
46 | #endif | ||
47 | #ifdef CONFIG_ACORN_PARTITION_POWERTEC | ||
48 | adfspart_check_POWERTEC, | ||
49 | #endif | ||
50 | #ifdef CONFIG_ACORN_PARTITION_EESOX | ||
51 | adfspart_check_EESOX, | ||
52 | #endif | ||
53 | |||
54 | /* | ||
55 | * Now move on to formats that only have partition info at | ||
56 | * disk address 0xdc0. Since these may also have stale | ||
57 | * PC/BIOS partition tables, they need to come before | ||
58 | * the msdos entry. | ||
59 | */ | ||
60 | #ifdef CONFIG_ACORN_PARTITION_CUMANA | ||
61 | adfspart_check_CUMANA, | ||
62 | #endif | ||
63 | #ifdef CONFIG_ACORN_PARTITION_ADFS | ||
64 | adfspart_check_ADFS, | ||
65 | #endif | ||
66 | |||
67 | #ifdef CONFIG_EFI_PARTITION | ||
68 | efi_partition, /* this must come before msdos */ | ||
69 | #endif | ||
70 | #ifdef CONFIG_SGI_PARTITION | ||
71 | sgi_partition, | ||
72 | #endif | ||
73 | #ifdef CONFIG_LDM_PARTITION | ||
74 | ldm_partition, /* this must come before msdos */ | ||
75 | #endif | ||
76 | #ifdef CONFIG_MSDOS_PARTITION | ||
77 | msdos_partition, | ||
78 | #endif | ||
79 | #ifdef CONFIG_OSF_PARTITION | ||
80 | osf_partition, | ||
81 | #endif | ||
82 | #ifdef CONFIG_SUN_PARTITION | ||
83 | sun_partition, | ||
84 | #endif | ||
85 | #ifdef CONFIG_AMIGA_PARTITION | ||
86 | amiga_partition, | ||
87 | #endif | ||
88 | #ifdef CONFIG_ATARI_PARTITION | ||
89 | atari_partition, | ||
90 | #endif | ||
91 | #ifdef CONFIG_MAC_PARTITION | ||
92 | mac_partition, | ||
93 | #endif | ||
94 | #ifdef CONFIG_ULTRIX_PARTITION | ||
95 | ultrix_partition, | ||
96 | #endif | ||
97 | #ifdef CONFIG_IBM_PARTITION | ||
98 | ibm_partition, | ||
99 | #endif | ||
100 | #ifdef CONFIG_KARMA_PARTITION | ||
101 | karma_partition, | ||
102 | #endif | ||
103 | #ifdef CONFIG_SYSV68_PARTITION | ||
104 | sysv68_partition, | ||
105 | #endif | ||
106 | NULL | ||
107 | }; | ||
108 | |||
109 | struct parsed_partitions * | ||
110 | check_partition(struct gendisk *hd, struct block_device *bdev) | ||
111 | { | ||
112 | struct parsed_partitions *state; | ||
113 | int i, res, err; | ||
114 | |||
115 | state = kzalloc(sizeof(struct parsed_partitions), GFP_KERNEL); | ||
116 | if (!state) | ||
117 | return NULL; | ||
118 | state->pp_buf = (char *)__get_free_page(GFP_KERNEL); | ||
119 | if (!state->pp_buf) { | ||
120 | kfree(state); | ||
121 | return NULL; | ||
122 | } | ||
123 | state->pp_buf[0] = '\0'; | ||
124 | |||
125 | state->bdev = bdev; | ||
126 | disk_name(hd, 0, state->name); | ||
127 | snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name); | ||
128 | if (isdigit(state->name[strlen(state->name)-1])) | ||
129 | sprintf(state->name, "p"); | ||
130 | |||
131 | state->limit = disk_max_parts(hd); | ||
132 | i = res = err = 0; | ||
133 | while (!res && check_part[i]) { | ||
134 | memset(&state->parts, 0, sizeof(state->parts)); | ||
135 | res = check_part[i++](state); | ||
136 | if (res < 0) { | ||
137 | /* We have hit an I/O error which we don't report now. | ||
138 | * But record it, and let the others do their job. | ||
139 | */ | ||
140 | err = res; | ||
141 | res = 0; | ||
142 | } | ||
143 | |||
144 | } | ||
145 | if (res > 0) { | ||
146 | printk(KERN_INFO "%s", state->pp_buf); | ||
147 | |||
148 | free_page((unsigned long)state->pp_buf); | ||
149 | return state; | ||
150 | } | ||
151 | if (state->access_beyond_eod) | ||
152 | err = -ENOSPC; | ||
153 | if (err) | ||
154 | /* The partition is unrecognized. So report I/O errors if there were any */ | ||
155 | res = err; | ||
156 | if (!res) | ||
157 | strlcat(state->pp_buf, " unknown partition table\n", PAGE_SIZE); | ||
158 | else if (warn_no_part) | ||
159 | strlcat(state->pp_buf, " unable to read partition table\n", PAGE_SIZE); | ||
160 | |||
161 | printk(KERN_INFO "%s", state->pp_buf); | ||
162 | |||
163 | free_page((unsigned long)state->pp_buf); | ||
164 | kfree(state); | ||
165 | return ERR_PTR(res); | ||
166 | } | ||
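check_partition() walks a NULL-terminated table of parser callbacks, stops at the first one that reports success, and remembers I/O errors so they only surface if nothing recognizes the table. A toy sketch of that dispatch pattern (the recognizer names are made up):

    #include <stdio.h>

    /* Toy recognizers: return 1 if recognized, 0 if not, -1 on I/O error. */
    static int try_gpt(const char *dev)   { (void)dev; return 0; }
    static int try_msdos(const char *dev) { (void)dev; return 1; }

    static int (*checkers[])(const char *) = { try_gpt, try_msdos, NULL };

    int main(void)
    {
            int i = 0, res = 0, err = 0;

            /* First parser reporting success wins; errors are remembered and
             * only surface if nothing recognizes the table. */
            while (!res && checkers[i]) {
                    res = checkers[i++]("sda");
                    if (res < 0) {
                            err = res;
                            res = 0;
                    }
            }
            printf(res > 0 ? "recognized\n" : err ? "io error\n" : "unknown\n");
            return 0;
    }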
diff --git a/block/partitions/check.h b/block/partitions/check.h deleted file mode 100644 index 52b100311ec..00000000000 --- a/block/partitions/check.h +++ /dev/null | |||
@@ -1,52 +0,0 @@ | |||
1 | #include <linux/pagemap.h> | ||
2 | #include <linux/blkdev.h> | ||
3 | #include <linux/genhd.h> | ||
4 | |||
5 | /* | ||
6 | * add_gd_partition adds a partitions details to the devices partition | ||
7 | * description. | ||
8 | */ | ||
9 | struct parsed_partitions { | ||
10 | struct block_device *bdev; | ||
11 | char name[BDEVNAME_SIZE]; | ||
12 | struct { | ||
13 | sector_t from; | ||
14 | sector_t size; | ||
15 | int flags; | ||
16 | bool has_info; | ||
17 | struct partition_meta_info info; | ||
18 | } parts[DISK_MAX_PARTS]; | ||
19 | int next; | ||
20 | int limit; | ||
21 | bool access_beyond_eod; | ||
22 | char *pp_buf; | ||
23 | }; | ||
24 | |||
25 | struct parsed_partitions * | ||
26 | check_partition(struct gendisk *, struct block_device *); | ||
27 | |||
28 | static inline void *read_part_sector(struct parsed_partitions *state, | ||
29 | sector_t n, Sector *p) | ||
30 | { | ||
31 | if (n >= get_capacity(state->bdev->bd_disk)) { | ||
32 | state->access_beyond_eod = true; | ||
33 | return NULL; | ||
34 | } | ||
35 | return read_dev_sector(state->bdev, n, p); | ||
36 | } | ||
37 | |||
38 | static inline void | ||
39 | put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size) | ||
40 | { | ||
41 | if (n < p->limit) { | ||
42 | char tmp[1 + BDEVNAME_SIZE + 10 + 1]; | ||
43 | |||
44 | p->parts[n].from = from; | ||
45 | p->parts[n].size = size; | ||
46 | snprintf(tmp, sizeof(tmp), " %s%d", p->name, n); | ||
47 | strlcat(p->pp_buf, tmp, PAGE_SIZE); | ||
48 | } | ||
49 | } | ||
50 | |||
51 | extern int warn_no_part; | ||
52 | |||
diff --git a/block/partitions/efi.c b/block/partitions/efi.c deleted file mode 100644 index b62fb88b871..00000000000 --- a/block/partitions/efi.c +++ /dev/null | |||
@@ -1,670 +0,0 @@ | |||
1 | /************************************************************ | ||
2 | * EFI GUID Partition Table handling | ||
3 | * | ||
4 | * http://www.uefi.org/specs/ | ||
5 | * http://www.intel.com/technology/efi/ | ||
6 | * | ||
7 | * efi.[ch] by Matt Domsch <Matt_Domsch@dell.com> | ||
8 | * Copyright 2000,2001,2002,2004 Dell Inc. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or modify | ||
11 | * it under the terms of the GNU General Public License as published by | ||
12 | * the Free Software Foundation; either version 2 of the License, or | ||
13 | * (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | * GNU General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public License | ||
21 | * along with this program; if not, write to the Free Software | ||
22 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
23 | * | ||
24 | * | ||
25 | * TODO: | ||
26 | * | ||
27 | * Changelog: | ||
28 | * Mon Nov 09 2004 Matt Domsch <Matt_Domsch@dell.com> | ||
29 | * - test for valid PMBR and valid PGPT before ever reading | ||
30 | * AGPT, allow override with 'gpt' kernel command line option. | ||
31 | * - check for first/last_usable_lba outside of size of disk | ||
32 | * | ||
33 | * Tue Mar 26 2002 Matt Domsch <Matt_Domsch@dell.com> | ||
34 | * - Ported to 2.5.7-pre1 and 2.5.7-dj2 | ||
35 | * - Applied patch to avoid fault in alternate header handling | ||
36 | * - cleaned up find_valid_gpt | ||
37 | * - On-disk structure and copy in memory is *always* LE now - | ||
38 | * swab fields as needed | ||
39 | * - remove print_gpt_header() | ||
40 | * - only use first max_p partition entries, to keep the kernel minor number | ||
41 | * and partition numbers tied. | ||
42 | * | ||
43 | * Mon Feb 04 2002 Matt Domsch <Matt_Domsch@dell.com> | ||
44 | * - Removed __PRIPTR_PREFIX - not being used | ||
45 | * | ||
46 | * Mon Jan 14 2002 Matt Domsch <Matt_Domsch@dell.com> | ||
47 | * - Ported to 2.5.2-pre11 + library crc32 patch Linus applied | ||
48 | * | ||
49 | * Thu Dec 6 2001 Matt Domsch <Matt_Domsch@dell.com> | ||
50 | * - Added compare_gpts(). | ||
51 | * - moved le_efi_guid_to_cpus() back into this file. GPT is the only | ||
52 | * thing that keeps EFI GUIDs on disk. | ||
53 | * - Changed gpt structure names and members to be simpler and more Linux-like. | ||
54 | * | ||
55 | * Wed Oct 17 2001 Matt Domsch <Matt_Domsch@dell.com> | ||
56 | * - Removed CONFIG_DEVFS_VOLUMES_UUID code entirely per Martin Wilck | ||
57 | * | ||
58 | * Wed Oct 10 2001 Matt Domsch <Matt_Domsch@dell.com> | ||
59 | * - Changed function comments to DocBook style per Andreas Dilger suggestion. | ||
60 | * | ||
61 | * Mon Oct 08 2001 Matt Domsch <Matt_Domsch@dell.com> | ||
62 | * - Change read_lba() to use the page cache per Al Viro's work. | ||
63 | * - print u64s properly on all architectures | ||
64 | * - fixed debug_printk(), now Dprintk() | ||
65 | * | ||
66 | * Mon Oct 01 2001 Matt Domsch <Matt_Domsch@dell.com> | ||
67 | * - Style cleanups | ||
68 | * - made most functions static | ||
69 | * - Endianness addition | ||
70 | * - remove test for second alternate header, as it's not per spec, | ||
71 | * and is unnecessary. There's now a method to read/write the last | ||
72 | * sector of an odd-sized disk from user space. No tools have ever | ||
73 | * been released which used this code, so it's effectively dead. | ||
74 | * - Per Asit Mallick of Intel, added a test for a valid PMBR. | ||
75 | * - Added kernel command line option 'gpt' to override valid PMBR test. | ||
76 | * | ||
77 | * Wed Jun 6 2001 Martin Wilck <Martin.Wilck@Fujitsu-Siemens.com> | ||
78 | * - added devfs volume UUID support (/dev/volumes/uuids) for | ||
79 | * mounting file systems by the partition GUID. | ||
80 | * | ||
81 | * Tue Dec 5 2000 Matt Domsch <Matt_Domsch@dell.com> | ||
82 | * - Moved crc32() to linux/lib, added efi_crc32(). | ||
83 | * | ||
84 | * Thu Nov 30 2000 Matt Domsch <Matt_Domsch@dell.com> | ||
85 | * - Replaced Intel's CRC32 function with an equivalent | ||
86 | * non-license-restricted version. | ||
87 | * | ||
88 | * Wed Oct 25 2000 Matt Domsch <Matt_Domsch@dell.com> | ||
89 | * - Fixed the last_lba() call to return the proper last block | ||
90 | * | ||
91 | * Thu Oct 12 2000 Matt Domsch <Matt_Domsch@dell.com> | ||
92 | * - Thanks to Andries Brouwer for his debugging assistance. | ||
93 | * - Code works, detects all the partitions. | ||
94 | * | ||
95 | ************************************************************/ | ||
96 | #include <linux/crc32.h> | ||
97 | #include <linux/ctype.h> | ||
98 | #include <linux/math64.h> | ||
99 | #include <linux/slab.h> | ||
100 | #include "check.h" | ||
101 | #include "efi.h" | ||
102 | |||
103 | /* This allows a kernel command line option 'gpt' to override | ||
104 | * the test for invalid PMBR. Not __initdata because reloading | ||
105 | * the partition tables happens after init too. | ||
106 | */ | ||
107 | static int force_gpt; | ||
108 | static int __init | ||
109 | force_gpt_fn(char *str) | ||
110 | { | ||
111 | force_gpt = 1; | ||
112 | return 1; | ||
113 | } | ||
114 | __setup("gpt", force_gpt_fn); | ||
115 | |||
116 | |||
117 | /** | ||
118 | * efi_crc32() - EFI version of crc32 function | ||
119 | * @buf: buffer to calculate crc32 of | ||
120 | * @len: length of buf | ||
121 | * | ||
122 | * Description: Returns EFI-style CRC32 value for @buf | ||
123 | * | ||
124 | * This function uses the little endian Ethernet polynomial | ||
125 | * but seeds the function with ~0, and xor's with ~0 at the end. | ||
126 | * Note, the EFI Specification, v1.02, has a reference to | ||
127 | * Dr. Dobb's Journal, May 1994 (actually it's in May 1992). | ||
128 | */ | ||
129 | static inline u32 | ||
130 | efi_crc32(const void *buf, unsigned long len) | ||
131 | { | ||
132 | return (crc32(~0L, buf, len) ^ ~0L); | ||
133 | } | ||
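For reference, the same value can be reproduced in user space: zlib's crc32() uses the identical little-endian Ethernet polynomial and already applies the ~0 pre-seed and final xor internally, so no extra inversion is needed. A hedged sketch, assuming zlib is available (link with -lz):

    #include <zlib.h>
    #include <stdint.h>

    /* Userspace counterpart of efi_crc32(): a plain zlib crc32() call
     * already matches the EFI/GPT convention. */
    static uint32_t efi_crc32_user(const void *buf, unsigned long len)
    {
            return (uint32_t)crc32(0L, (const unsigned char *)buf, len);
    }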
134 | |||
135 | /** | ||
136 | * last_lba(): return number of last logical block of device | ||
137 | * @bdev: block device | ||
138 | * | ||
139 | * Description: Returns last LBA value on success, 0 on error. | ||
140 | * The value is derived from the block device's inode size and | ||
141 | * logical block size, i.e. it is the index of the last addressable | ||
142 | * logical block on the disk. | ||
143 | */ | ||
144 | static u64 last_lba(struct block_device *bdev) | ||
145 | { | ||
146 | if (!bdev || !bdev->bd_inode) | ||
147 | return 0; | ||
148 | return div_u64(bdev->bd_inode->i_size, | ||
149 | bdev_logical_block_size(bdev)) - 1ULL; | ||
150 | } | ||
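A quick arithmetic check of the formula above, as a hedged standalone userspace example with illustrative numbers: a 4 GiB device with 512-byte logical blocks exposes LBAs 0 through 8388607.

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t i_size = 4294967296ULL;    /* bdev->bd_inode->i_size */
            uint32_t block  = 512;              /* bdev_logical_block_size() */

            assert(i_size / block - 1 == 8388607ULL);   /* last addressable LBA */
            return 0;
    }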
151 | |||
152 | static inline int | ||
153 | pmbr_part_valid(struct partition *part) | ||
154 | { | ||
155 | if (part->sys_ind == EFI_PMBR_OSTYPE_EFI_GPT && | ||
156 | le32_to_cpu(part->start_sect) == 1UL) | ||
157 | return 1; | ||
158 | return 0; | ||
159 | } | ||
160 | |||
161 | /** | ||
162 | * is_pmbr_valid(): test Protective MBR for validity | ||
163 | * @mbr: pointer to a legacy mbr structure | ||
164 | * | ||
165 | * Description: Returns 1 if PMBR is valid, 0 otherwise. | ||
166 | * Validity depends on two things: | ||
167 | * 1) MSDOS signature is in the last two bytes of the MBR | ||
168 | * 2) One partition of type 0xEE is found | ||
169 | */ | ||
170 | static int | ||
171 | is_pmbr_valid(legacy_mbr *mbr) | ||
172 | { | ||
173 | int i; | ||
174 | if (!mbr || le16_to_cpu(mbr->signature) != MSDOS_MBR_SIGNATURE) | ||
175 | return 0; | ||
176 | for (i = 0; i < 4; i++) | ||
177 | if (pmbr_part_valid(&mbr->partition_record[i])) | ||
178 | return 1; | ||
179 | return 0; | ||
180 | } | ||
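Put differently, a protective MBR that passes this test looks roughly like the following hedged sketch, using the legacy_mbr layout from efi.h further below; all values are illustrative:

    /* A GPT disk's protective MBR: 0xAA55 in the last two bytes of LBA 0
     * and at least one record of type 0xEE starting at LBA 1. */
    legacy_mbr pmbr = {
            .partition_record[0] = {
                    .sys_ind    = EFI_PMBR_OSTYPE_EFI_GPT,      /* 0xEE */
                    .start_sect = cpu_to_le32(1),
            },
            .signature = cpu_to_le16(MSDOS_MBR_SIGNATURE),      /* 0xaa55 */
    };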
181 | |||
182 | /** | ||
183 | * read_lba(): Read bytes from disk, starting at given LBA | ||
184 | * @state: partition check state, including the block device | ||
185 | * @lba: starting Logical Block Address | ||
186 | * @buffer: destination buffer | ||
187 | * @count: number of bytes to read | ||
188 | * | ||
189 | * Description: Reads @count bytes from @state->bdev into @buffer. | ||
190 | * Returns number of bytes read on success, 0 on error. | ||
191 | */ | ||
192 | static size_t read_lba(struct parsed_partitions *state, | ||
193 | u64 lba, u8 *buffer, size_t count) | ||
194 | { | ||
195 | size_t totalreadcount = 0; | ||
196 | struct block_device *bdev = state->bdev; | ||
197 | sector_t n = lba * (bdev_logical_block_size(bdev) / 512); | ||
198 | |||
199 | if (!buffer || lba > last_lba(bdev)) | ||
200 | return 0; | ||
201 | |||
202 | while (count) { | ||
203 | int copied = 512; | ||
204 | Sector sect; | ||
205 | unsigned char *data = read_part_sector(state, n++, &sect); | ||
206 | if (!data) | ||
207 | break; | ||
208 | if (copied > count) | ||
209 | copied = count; | ||
210 | memcpy(buffer, data, copied); | ||
211 | put_dev_sector(sect); | ||
212 | buffer += copied; | ||
213 | totalreadcount += copied; | ||
214 | count -= copied; | ||
215 | } | ||
216 | return totalreadcount; | ||
217 | } | ||
218 | |||
219 | /** | ||
220 | * alloc_read_gpt_entries(): reads partition entries from disk | ||
221 | * @state | ||
222 | * @gpt - GPT header | ||
223 | * | ||
224 | * Description: Returns ptes on success, NULL on error. | ||
225 | * Allocates space for PTEs based on information found in @gpt. | ||
226 | * Notes: remember to free pte when you're done! | ||
227 | */ | ||
228 | static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state, | ||
229 | gpt_header *gpt) | ||
230 | { | ||
231 | size_t count; | ||
232 | gpt_entry *pte; | ||
233 | |||
234 | if (!gpt) | ||
235 | return NULL; | ||
236 | |||
237 | count = le32_to_cpu(gpt->num_partition_entries) * | ||
238 | le32_to_cpu(gpt->sizeof_partition_entry); | ||
239 | if (!count) | ||
240 | return NULL; | ||
241 | pte = kzalloc(count, GFP_KERNEL); | ||
242 | if (!pte) | ||
243 | return NULL; | ||
244 | |||
245 | if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba), | ||
246 | (u8 *) pte, | ||
247 | count) < count) { | ||
248 | kfree(pte); | ||
249 | pte = NULL; | ||
250 | return NULL; | ||
251 | } | ||
252 | return pte; | ||
253 | } | ||
254 | |||
255 | /** | ||
256 | * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk | ||
257 | * @state | ||
258 | * @lba is the Logical Block Address of the partition table | ||
259 | * | ||
260 | * Description: returns GPT header on success, NULL on error. Allocates | ||
261 | * and fills a GPT header starting at @lba from @state->bdev. | ||
262 | * Note: remember to free gpt when finished with it. | ||
263 | */ | ||
264 | static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state, | ||
265 | u64 lba) | ||
266 | { | ||
267 | gpt_header *gpt; | ||
268 | unsigned ssz = bdev_logical_block_size(state->bdev); | ||
269 | |||
270 | gpt = kzalloc(ssz, GFP_KERNEL); | ||
271 | if (!gpt) | ||
272 | return NULL; | ||
273 | |||
274 | if (read_lba(state, lba, (u8 *) gpt, ssz) < ssz) { | ||
275 | kfree(gpt); | ||
276 | gpt = NULL; | ||
277 | return NULL; | ||
278 | } | ||
279 | |||
280 | return gpt; | ||
281 | } | ||
282 | |||
283 | /** | ||
284 | * is_gpt_valid() - tests one GPT header and PTEs for validity | ||
285 | * @state | ||
286 | * @lba is the logical block address of the GPT header to test | ||
287 | * @gpt is a GPT header ptr, filled on return. | ||
288 | * @ptes is a PTEs ptr, filled on return. | ||
289 | * | ||
290 | * Description: returns 1 if valid, 0 on error. | ||
291 | * If valid, returns pointers to newly allocated GPT header and PTEs. | ||
292 | */ | ||
293 | static int is_gpt_valid(struct parsed_partitions *state, u64 lba, | ||
294 | gpt_header **gpt, gpt_entry **ptes) | ||
295 | { | ||
296 | u32 crc, origcrc; | ||
297 | u64 lastlba; | ||
298 | |||
299 | if (!ptes) | ||
300 | return 0; | ||
301 | if (!(*gpt = alloc_read_gpt_header(state, lba))) | ||
302 | return 0; | ||
303 | |||
304 | /* Check the GUID Partition Table signature */ | ||
305 | if (le64_to_cpu((*gpt)->signature) != GPT_HEADER_SIGNATURE) { | ||
306 | pr_debug("GUID Partition Table Header signature is wrong:" | ||
307 | "%lld != %lld\n", | ||
308 | (unsigned long long)le64_to_cpu((*gpt)->signature), | ||
309 | (unsigned long long)GPT_HEADER_SIGNATURE); | ||
310 | goto fail; | ||
311 | } | ||
312 | |||
313 | /* Check the GUID Partition Table header size */ | ||
314 | if (le32_to_cpu((*gpt)->header_size) > | ||
315 | bdev_logical_block_size(state->bdev)) { | ||
316 | pr_debug("GUID Partition Table Header size is wrong: %u > %u\n", | ||
317 | le32_to_cpu((*gpt)->header_size), | ||
318 | bdev_logical_block_size(state->bdev)); | ||
319 | goto fail; | ||
320 | } | ||
321 | |||
322 | /* Check the GUID Partition Table CRC */ | ||
323 | origcrc = le32_to_cpu((*gpt)->header_crc32); | ||
324 | (*gpt)->header_crc32 = 0; | ||
325 | crc = efi_crc32((const unsigned char *) (*gpt), le32_to_cpu((*gpt)->header_size)); | ||
326 | |||
327 | if (crc != origcrc) { | ||
328 | pr_debug("GUID Partition Table Header CRC is wrong: %x != %x\n", | ||
329 | crc, origcrc); | ||
330 | goto fail; | ||
331 | } | ||
332 | (*gpt)->header_crc32 = cpu_to_le32(origcrc); | ||
333 | |||
334 | /* Check that the my_lba entry points to the LBA that contains | ||
335 | * the GUID Partition Table */ | ||
336 | if (le64_to_cpu((*gpt)->my_lba) != lba) { | ||
337 | pr_debug("GPT my_lba incorrect: %lld != %lld\n", | ||
338 | (unsigned long long)le64_to_cpu((*gpt)->my_lba), | ||
339 | (unsigned long long)lba); | ||
340 | goto fail; | ||
341 | } | ||
342 | |||
343 | /* Check the first_usable_lba and last_usable_lba are | ||
344 | * within the disk. | ||
345 | */ | ||
346 | lastlba = last_lba(state->bdev); | ||
347 | if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) { | ||
348 | pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n", | ||
349 | (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba), | ||
350 | (unsigned long long)lastlba); | ||
351 | goto fail; | ||
352 | } | ||
353 | if (le64_to_cpu((*gpt)->last_usable_lba) > lastlba) { | ||
354 | pr_debug("GPT: last_usable_lba incorrect: %lld > %lld\n", | ||
355 | (unsigned long long)le64_to_cpu((*gpt)->last_usable_lba), | ||
356 | (unsigned long long)lastlba); | ||
357 | goto fail; | ||
358 | } | ||
359 | |||
360 | /* Check that sizeof_partition_entry has the correct value */ | ||
361 | if (le32_to_cpu((*gpt)->sizeof_partition_entry) != sizeof(gpt_entry)) { | ||
362 | pr_debug("GUID Partition Entry Size check failed.\n"); | ||
363 | goto fail; | ||
364 | } | ||
365 | |||
366 | if (!(*ptes = alloc_read_gpt_entries(state, *gpt))) | ||
367 | goto fail; | ||
368 | |||
369 | /* Check the GUID Partition Entry Array CRC */ | ||
370 | crc = efi_crc32((const unsigned char *) (*ptes), | ||
371 | le32_to_cpu((*gpt)->num_partition_entries) * | ||
372 | le32_to_cpu((*gpt)->sizeof_partition_entry)); | ||
373 | |||
374 | if (crc != le32_to_cpu((*gpt)->partition_entry_array_crc32)) { | ||
375 | pr_debug("GUID Partition Entry Array CRC check failed.\n"); | ||
376 | goto fail_ptes; | ||
377 | } | ||
378 | |||
379 | /* We're done, all's well */ | ||
380 | return 1; | ||
381 | |||
382 | fail_ptes: | ||
383 | kfree(*ptes); | ||
384 | *ptes = NULL; | ||
385 | fail: | ||
386 | kfree(*gpt); | ||
387 | *gpt = NULL; | ||
388 | return 0; | ||
389 | } | ||
390 | |||
391 | /** | ||
392 | * is_pte_valid() - tests one PTE for validity | ||
393 | * @pte is the pte to check | ||
394 | * @lastlba is last lba of the disk | ||
395 | * | ||
396 | * Description: returns 1 if valid, 0 on error. | ||
397 | */ | ||
398 | static inline int | ||
399 | is_pte_valid(const gpt_entry *pte, const u64 lastlba) | ||
400 | { | ||
401 | if ((!efi_guidcmp(pte->partition_type_guid, NULL_GUID)) || | ||
402 | le64_to_cpu(pte->starting_lba) > lastlba || | ||
403 | le64_to_cpu(pte->ending_lba) > lastlba) | ||
404 | return 0; | ||
405 | return 1; | ||
406 | } | ||
407 | |||
408 | /** | ||
409 | * compare_gpts() - Sanity-check the primary and alternate GPT headers | ||
410 | * @pgpt is the primary GPT header | ||
411 | * @agpt is the alternate GPT header | ||
412 | * @lastlba is the last LBA number | ||
413 | * Description: Returns nothing. Sanity checks pgpt and agpt fields | ||
414 | * and prints warnings on discrepancies. | ||
415 | * | ||
416 | */ | ||
417 | static void | ||
418 | compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba) | ||
419 | { | ||
420 | int error_found = 0; | ||
421 | if (!pgpt || !agpt) | ||
422 | return; | ||
423 | if (le64_to_cpu(pgpt->my_lba) != le64_to_cpu(agpt->alternate_lba)) { | ||
424 | printk(KERN_WARNING | ||
425 | "GPT:Primary header LBA != Alt. header alternate_lba\n"); | ||
426 | printk(KERN_WARNING "GPT:%lld != %lld\n", | ||
427 | (unsigned long long)le64_to_cpu(pgpt->my_lba), | ||
428 | (unsigned long long)le64_to_cpu(agpt->alternate_lba)); | ||
429 | error_found++; | ||
430 | } | ||
431 | if (le64_to_cpu(pgpt->alternate_lba) != le64_to_cpu(agpt->my_lba)) { | ||
432 | printk(KERN_WARNING | ||
433 | "GPT:Primary header alternate_lba != Alt. header my_lba\n"); | ||
434 | printk(KERN_WARNING "GPT:%lld != %lld\n", | ||
435 | (unsigned long long)le64_to_cpu(pgpt->alternate_lba), | ||
436 | (unsigned long long)le64_to_cpu(agpt->my_lba)); | ||
437 | error_found++; | ||
438 | } | ||
439 | if (le64_to_cpu(pgpt->first_usable_lba) != | ||
440 | le64_to_cpu(agpt->first_usable_lba)) { | ||
441 | printk(KERN_WARNING "GPT:first_usable_lbas don't match.\n"); | ||
442 | printk(KERN_WARNING "GPT:%lld != %lld\n", | ||
443 | (unsigned long long)le64_to_cpu(pgpt->first_usable_lba), | ||
444 | (unsigned long long)le64_to_cpu(agpt->first_usable_lba)); | ||
445 | error_found++; | ||
446 | } | ||
447 | if (le64_to_cpu(pgpt->last_usable_lba) != | ||
448 | le64_to_cpu(agpt->last_usable_lba)) { | ||
449 | printk(KERN_WARNING "GPT:last_usable_lbas don't match.\n"); | ||
450 | printk(KERN_WARNING "GPT:%lld != %lld\n", | ||
451 | (unsigned long long)le64_to_cpu(pgpt->last_usable_lba), | ||
452 | (unsigned long long)le64_to_cpu(agpt->last_usable_lba)); | ||
453 | error_found++; | ||
454 | } | ||
455 | if (efi_guidcmp(pgpt->disk_guid, agpt->disk_guid)) { | ||
456 | printk(KERN_WARNING "GPT:disk_guids don't match.\n"); | ||
457 | error_found++; | ||
458 | } | ||
459 | if (le32_to_cpu(pgpt->num_partition_entries) != | ||
460 | le32_to_cpu(agpt->num_partition_entries)) { | ||
461 | printk(KERN_WARNING "GPT:num_partition_entries don't match: " | ||
462 | "0x%x != 0x%x\n", | ||
463 | le32_to_cpu(pgpt->num_partition_entries), | ||
464 | le32_to_cpu(agpt->num_partition_entries)); | ||
465 | error_found++; | ||
466 | } | ||
467 | if (le32_to_cpu(pgpt->sizeof_partition_entry) != | ||
468 | le32_to_cpu(agpt->sizeof_partition_entry)) { | ||
469 | printk(KERN_WARNING | ||
470 | "GPT:sizeof_partition_entry values don't match: " | ||
471 | "0x%x != 0x%x\n", | ||
472 | le32_to_cpu(pgpt->sizeof_partition_entry), | ||
473 | le32_to_cpu(agpt->sizeof_partition_entry)); | ||
474 | error_found++; | ||
475 | } | ||
476 | if (le32_to_cpu(pgpt->partition_entry_array_crc32) != | ||
477 | le32_to_cpu(agpt->partition_entry_array_crc32)) { | ||
478 | printk(KERN_WARNING | ||
479 | "GPT:partition_entry_array_crc32 values don't match: " | ||
480 | "0x%x != 0x%x\n", | ||
481 | le32_to_cpu(pgpt->partition_entry_array_crc32), | ||
482 | le32_to_cpu(agpt->partition_entry_array_crc32)); | ||
483 | error_found++; | ||
484 | } | ||
485 | if (le64_to_cpu(pgpt->alternate_lba) != lastlba) { | ||
486 | printk(KERN_WARNING | ||
487 | "GPT:Primary header thinks Alt. header is not at the end of the disk.\n"); | ||
488 | printk(KERN_WARNING "GPT:%lld != %lld\n", | ||
489 | (unsigned long long)le64_to_cpu(pgpt->alternate_lba), | ||
490 | (unsigned long long)lastlba); | ||
491 | error_found++; | ||
492 | } | ||
493 | |||
494 | if (le64_to_cpu(agpt->my_lba) != lastlba) { | ||
495 | printk(KERN_WARNING | ||
496 | "GPT:Alternate GPT header not at the end of the disk.\n"); | ||
497 | printk(KERN_WARNING "GPT:%lld != %lld\n", | ||
498 | (unsigned long long)le64_to_cpu(agpt->my_lba), | ||
499 | (unsigned long long)lastlba); | ||
500 | error_found++; | ||
501 | } | ||
502 | |||
503 | if (error_found) | ||
504 | printk(KERN_WARNING | ||
505 | "GPT: Use GNU Parted to correct GPT errors.\n"); | ||
506 | return; | ||
507 | } | ||
508 | |||
509 | /** | ||
510 | * find_valid_gpt() - Search disk for valid GPT headers and PTEs | ||
511 | * @state | ||
512 | * @gpt is a GPT header ptr, filled on return. | ||
513 | * @ptes is a PTEs ptr, filled on return. | ||
514 | * Description: Returns 1 if valid, 0 on error. | ||
515 | * If valid, returns pointers to newly allocated GPT header and PTEs. | ||
516 | * Validity depends on PMBR being valid (or being overridden by the | ||
517 | * 'gpt' kernel command line option) and finding either the Primary | ||
518 | * GPT header and PTEs valid, or the Alternate GPT header and PTEs | ||
519 | * valid. If the Primary GPT header is not valid, the Alternate GPT header | ||
520 | * is not checked unless the 'gpt' kernel command line option is passed. | ||
521 | * This protects against devices which misreport their size, and forces | ||
522 | * the user to decide to use the Alternate GPT. | ||
523 | */ | ||
524 | static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, | ||
525 | gpt_entry **ptes) | ||
526 | { | ||
527 | int good_pgpt = 0, good_agpt = 0, good_pmbr = 0; | ||
528 | gpt_header *pgpt = NULL, *agpt = NULL; | ||
529 | gpt_entry *pptes = NULL, *aptes = NULL; | ||
530 | legacy_mbr *legacymbr; | ||
531 | u64 lastlba; | ||
532 | |||
533 | if (!ptes) | ||
534 | return 0; | ||
535 | |||
536 | lastlba = last_lba(state->bdev); | ||
537 | if (!force_gpt) { | ||
538 | /* This will be added to the EFI Spec. per Intel after v1.02. */ | ||
539 | legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL); | ||
540 | if (legacymbr) { | ||
541 | read_lba(state, 0, (u8 *) legacymbr, | ||
542 | sizeof (*legacymbr)); | ||
543 | good_pmbr = is_pmbr_valid(legacymbr); | ||
544 | kfree(legacymbr); | ||
545 | } | ||
546 | if (!good_pmbr) | ||
547 | goto fail; | ||
548 | } | ||
549 | |||
550 | good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA, | ||
551 | &pgpt, &pptes); | ||
552 | if (good_pgpt) | ||
553 | good_agpt = is_gpt_valid(state, | ||
554 | le64_to_cpu(pgpt->alternate_lba), | ||
555 | &agpt, &aptes); | ||
556 | if (!good_agpt && force_gpt) | ||
557 | good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes); | ||
558 | |||
559 | /* The obviously unsuccessful case */ | ||
560 | if (!good_pgpt && !good_agpt) | ||
561 | goto fail; | ||
562 | |||
563 | compare_gpts(pgpt, agpt, lastlba); | ||
564 | |||
565 | /* The good cases */ | ||
566 | if (good_pgpt) { | ||
567 | *gpt = pgpt; | ||
568 | *ptes = pptes; | ||
569 | kfree(agpt); | ||
570 | kfree(aptes); | ||
571 | if (!good_agpt) { | ||
572 | printk(KERN_WARNING | ||
573 | "Alternate GPT is invalid, " | ||
574 | "using primary GPT.\n"); | ||
575 | } | ||
576 | return 1; | ||
577 | } | ||
578 | else if (good_agpt) { | ||
579 | *gpt = agpt; | ||
580 | *ptes = aptes; | ||
581 | kfree(pgpt); | ||
582 | kfree(pptes); | ||
583 | printk(KERN_WARNING | ||
584 | "Primary GPT is invalid, using alternate GPT.\n"); | ||
585 | return 1; | ||
586 | } | ||
587 | |||
588 | fail: | ||
589 | kfree(pgpt); | ||
590 | kfree(agpt); | ||
591 | kfree(pptes); | ||
592 | kfree(aptes); | ||
593 | *gpt = NULL; | ||
594 | *ptes = NULL; | ||
595 | return 0; | ||
596 | } | ||
597 | |||
598 | /** | ||
599 | * efi_partition(struct parsed_partitions *state) | ||
600 | * @state | ||
601 | * | ||
602 | * Description: called from check.c, if the disk contains GPT | ||
603 | * partitions, sets up partition entries in the kernel. | ||
604 | * | ||
605 | * If the first block on the disk is a legacy MBR, | ||
606 | * it will get handled by msdos_partition(). | ||
607 | * If it's a Protective MBR, we'll handle it here. | ||
608 | * | ||
609 | * We do not create a Linux partition for GPT, but | ||
610 | * only for the actual data partitions. | ||
611 | * Returns: | ||
612 | * -1 if unable to read the partition table | ||
613 | * 0 if this isn't our partition table | ||
614 | * 1 if successful | ||
615 | * | ||
616 | */ | ||
617 | int efi_partition(struct parsed_partitions *state) | ||
618 | { | ||
619 | gpt_header *gpt = NULL; | ||
620 | gpt_entry *ptes = NULL; | ||
621 | u32 i; | ||
622 | unsigned ssz = bdev_logical_block_size(state->bdev) / 512; | ||
623 | |||
624 | if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) { | ||
625 | kfree(gpt); | ||
626 | kfree(ptes); | ||
627 | return 0; | ||
628 | } | ||
629 | |||
630 | pr_debug("GUID Partition Table is valid! Yea!\n"); | ||
631 | |||
632 | for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) { | ||
633 | struct partition_meta_info *info; | ||
634 | unsigned label_count = 0; | ||
635 | unsigned label_max; | ||
636 | u64 start = le64_to_cpu(ptes[i].starting_lba); | ||
637 | u64 size = le64_to_cpu(ptes[i].ending_lba) - | ||
638 | le64_to_cpu(ptes[i].starting_lba) + 1ULL; | ||
639 | |||
640 | if (!is_pte_valid(&ptes[i], last_lba(state->bdev))) | ||
641 | continue; | ||
642 | |||
643 | put_partition(state, i+1, start * ssz, size * ssz); | ||
644 | |||
645 | /* If this is a RAID volume, tell md */ | ||
646 | if (!efi_guidcmp(ptes[i].partition_type_guid, | ||
647 | PARTITION_LINUX_RAID_GUID)) | ||
648 | state->parts[i + 1].flags = ADDPART_FLAG_RAID; | ||
649 | |||
650 | info = &state->parts[i + 1].info; | ||
651 | efi_guid_unparse(&ptes[i].unique_partition_guid, info->uuid); | ||
652 | |||
653 | /* Naively convert UTF16-LE to 7 bits. */ | ||
654 | label_max = min(sizeof(info->volname) - 1, | ||
655 | sizeof(ptes[i].partition_name)); | ||
656 | info->volname[label_max] = 0; | ||
657 | while (label_count < label_max) { | ||
658 | u8 c = ptes[i].partition_name[label_count] & 0xff; | ||
659 | if (c && !isprint(c)) | ||
660 | c = '!'; | ||
661 | info->volname[label_count] = c; | ||
662 | label_count++; | ||
663 | } | ||
664 | state->parts[i + 1].has_info = true; | ||
665 | } | ||
666 | kfree(ptes); | ||
667 | kfree(gpt); | ||
668 | strlcat(state->pp_buf, "\n", PAGE_SIZE); | ||
669 | return 1; | ||
670 | } | ||
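The start/size arithmetic in the loop above converts GPT LBAs into 512-byte sectors via ssz. A hedged, standalone check with illustrative numbers: on a 4096-byte-sector disk, a PTE covering LBAs 6..261 becomes a 1 MiB partition at 512-byte sector 48.

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t starting_lba = 6, ending_lba = 261;
            unsigned ssz = 4096 / 512;                          /* = 8 */
            uint64_t start = starting_lba * ssz;
            uint64_t size  = (ending_lba - starting_lba + 1) * ssz;

            assert(start == 48);
            assert(size == 2048);       /* 2048 * 512 bytes = 1 MiB */
            return 0;
    }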
diff --git a/block/partitions/efi.h b/block/partitions/efi.h deleted file mode 100644 index b69ab729558..00000000000 --- a/block/partitions/efi.h +++ /dev/null | |||
@@ -1,134 +0,0 @@ | |||
1 | /************************************************************ | ||
2 | * EFI GUID Partition Table | ||
3 | * Per Intel EFI Specification v1.02 | ||
4 | * http://developer.intel.com/technology/efi/efi.htm | ||
5 | * | ||
6 | * By Matt Domsch <Matt_Domsch@dell.com> Fri Sep 22 22:15:56 CDT 2000 | ||
7 | * Copyright 2000,2001 Dell Inc. | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or modify | ||
10 | * it under the terms of the GNU General Public License as published by | ||
11 | * the Free Software Foundation; either version 2 of the License, or | ||
12 | * (at your option) any later version. | ||
13 | * | ||
14 | * This program is distributed in the hope that it will be useful, | ||
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
17 | * GNU General Public License for more details. | ||
18 | * | ||
19 | * You should have received a copy of the GNU General Public License | ||
20 | * along with this program; if not, write to the Free Software | ||
21 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
22 | * | ||
23 | ************************************************************/ | ||
24 | |||
25 | #ifndef FS_PART_EFI_H_INCLUDED | ||
26 | #define FS_PART_EFI_H_INCLUDED | ||
27 | |||
28 | #include <linux/types.h> | ||
29 | #include <linux/fs.h> | ||
30 | #include <linux/genhd.h> | ||
31 | #include <linux/kernel.h> | ||
32 | #include <linux/major.h> | ||
33 | #include <linux/string.h> | ||
34 | #include <linux/efi.h> | ||
35 | |||
36 | #define MSDOS_MBR_SIGNATURE 0xaa55 | ||
37 | #define EFI_PMBR_OSTYPE_EFI 0xEF | ||
38 | #define EFI_PMBR_OSTYPE_EFI_GPT 0xEE | ||
39 | |||
40 | #define GPT_HEADER_SIGNATURE 0x5452415020494645ULL | ||
41 | #define GPT_HEADER_REVISION_V1 0x00010000 | ||
42 | #define GPT_PRIMARY_PARTITION_TABLE_LBA 1 | ||
43 | |||
44 | #define PARTITION_SYSTEM_GUID \ | ||
45 | EFI_GUID( 0xC12A7328, 0xF81F, 0x11d2, \ | ||
46 | 0xBA, 0x4B, 0x00, 0xA0, 0xC9, 0x3E, 0xC9, 0x3B) | ||
47 | #define LEGACY_MBR_PARTITION_GUID \ | ||
48 | EFI_GUID( 0x024DEE41, 0x33E7, 0x11d3, \ | ||
49 | 0x9D, 0x69, 0x00, 0x08, 0xC7, 0x81, 0xF3, 0x9F) | ||
50 | #define PARTITION_MSFT_RESERVED_GUID \ | ||
51 | EFI_GUID( 0xE3C9E316, 0x0B5C, 0x4DB8, \ | ||
52 | 0x81, 0x7D, 0xF9, 0x2D, 0xF0, 0x02, 0x15, 0xAE) | ||
53 | #define PARTITION_BASIC_DATA_GUID \ | ||
54 | EFI_GUID( 0xEBD0A0A2, 0xB9E5, 0x4433, \ | ||
55 | 0x87, 0xC0, 0x68, 0xB6, 0xB7, 0x26, 0x99, 0xC7) | ||
56 | #define PARTITION_LINUX_RAID_GUID \ | ||
57 | EFI_GUID( 0xa19d880f, 0x05fc, 0x4d3b, \ | ||
58 | 0xa0, 0x06, 0x74, 0x3f, 0x0f, 0x84, 0x91, 0x1e) | ||
59 | #define PARTITION_LINUX_SWAP_GUID \ | ||
60 | EFI_GUID( 0x0657fd6d, 0xa4ab, 0x43c4, \ | ||
61 | 0x84, 0xe5, 0x09, 0x33, 0xc8, 0x4b, 0x4f, 0x4f) | ||
62 | #define PARTITION_LINUX_LVM_GUID \ | ||
63 | EFI_GUID( 0xe6d6d379, 0xf507, 0x44c2, \ | ||
64 | 0xa2, 0x3c, 0x23, 0x8f, 0x2a, 0x3d, 0xf9, 0x28) | ||
65 | |||
66 | typedef struct _gpt_header { | ||
67 | __le64 signature; | ||
68 | __le32 revision; | ||
69 | __le32 header_size; | ||
70 | __le32 header_crc32; | ||
71 | __le32 reserved1; | ||
72 | __le64 my_lba; | ||
73 | __le64 alternate_lba; | ||
74 | __le64 first_usable_lba; | ||
75 | __le64 last_usable_lba; | ||
76 | efi_guid_t disk_guid; | ||
77 | __le64 partition_entry_lba; | ||
78 | __le32 num_partition_entries; | ||
79 | __le32 sizeof_partition_entry; | ||
80 | __le32 partition_entry_array_crc32; | ||
81 | |||
82 | /* The rest of the logical block is reserved by UEFI and must be zero. | ||
83 | * EFI standard handles this by: | ||
84 | * | ||
85 | * uint8_t reserved2[ BlockSize - 92 ]; | ||
86 | */ | ||
87 | } __attribute__ ((packed)) gpt_header; | ||
88 | |||
89 | typedef struct _gpt_entry_attributes { | ||
90 | u64 required_to_function:1; | ||
91 | u64 reserved:47; | ||
92 | u64 type_guid_specific:16; | ||
93 | } __attribute__ ((packed)) gpt_entry_attributes; | ||
94 | |||
95 | typedef struct _gpt_entry { | ||
96 | efi_guid_t partition_type_guid; | ||
97 | efi_guid_t unique_partition_guid; | ||
98 | __le64 starting_lba; | ||
99 | __le64 ending_lba; | ||
100 | gpt_entry_attributes attributes; | ||
101 | efi_char16_t partition_name[72 / sizeof (efi_char16_t)]; | ||
102 | } __attribute__ ((packed)) gpt_entry; | ||
103 | |||
104 | typedef struct _legacy_mbr { | ||
105 | u8 boot_code[440]; | ||
106 | __le32 unique_mbr_signature; | ||
107 | __le16 unknown; | ||
108 | struct partition partition_record[4]; | ||
109 | __le16 signature; | ||
110 | } __attribute__ ((packed)) legacy_mbr; | ||
111 | |||
112 | /* Functions */ | ||
113 | extern int efi_partition(struct parsed_partitions *state); | ||
114 | |||
115 | #endif | ||
116 | |||
117 | /* | ||
118 | * Overrides for Emacs so that we follow Linus's tabbing style. | ||
119 | * Emacs will notice this stuff at the end of the file and automatically | ||
120 | * adjust the settings for this buffer only. This must remain at the end | ||
121 | * of the file. | ||
122 | * -------------------------------------------------------------------------- | ||
123 | * Local variables: | ||
124 | * c-indent-level: 4 | ||
125 | * c-brace-imaginary-offset: 0 | ||
126 | * c-brace-offset: -4 | ||
127 | * c-argdecl-indent: 4 | ||
128 | * c-label-offset: -4 | ||
129 | * c-continued-statement-offset: 4 | ||
130 | * c-continued-brace-offset: 0 | ||
131 | * indent-tabs-mode: nil | ||
132 | * tab-width: 8 | ||
133 | * End: | ||
134 | */ | ||
diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c deleted file mode 100644 index 47a61474e79..00000000000 --- a/block/partitions/ibm.c +++ /dev/null | |||
@@ -1,364 +0,0 @@ | |||
1 | /* | ||
2 | * Author(s)......: Holger Smolinski <Holger.Smolinski@de.ibm.com> | ||
3 | * Volker Sameske <sameske@de.ibm.com> | ||
4 | * Bugreports.to..: <Linux390@de.ibm.com> | ||
5 | * Copyright IBM Corp. 1999, 2012 | ||
6 | */ | ||
7 | |||
8 | #include <linux/buffer_head.h> | ||
9 | #include <linux/hdreg.h> | ||
10 | #include <linux/slab.h> | ||
11 | #include <asm/dasd.h> | ||
12 | #include <asm/ebcdic.h> | ||
13 | #include <asm/uaccess.h> | ||
14 | #include <asm/vtoc.h> | ||
15 | |||
16 | #include "check.h" | ||
17 | #include "ibm.h" | ||
18 | |||
19 | |||
20 | union label_t { | ||
21 | struct vtoc_volume_label_cdl vol; | ||
22 | struct vtoc_volume_label_ldl lnx; | ||
23 | struct vtoc_cms_label cms; | ||
24 | }; | ||
25 | |||
26 | /* | ||
27 | * compute the block number from a | ||
28 | * cyl-cyl-head-head structure | ||
29 | */ | ||
30 | static sector_t cchh2blk(struct vtoc_cchh *ptr, struct hd_geometry *geo) | ||
31 | { | ||
32 | sector_t cyl; | ||
33 | __u16 head; | ||
34 | |||
35 | /* decode cylinder and heads for large volumes */ | ||
36 | cyl = ptr->hh & 0xFFF0; | ||
37 | cyl <<= 12; | ||
38 | cyl |= ptr->cc; | ||
39 | head = ptr->hh & 0x000F; | ||
40 | return cyl * geo->heads * geo->sectors + | ||
41 | head * geo->sectors; | ||
42 | } | ||
43 | |||
44 | /* | ||
45 | * compute the block number from a | ||
46 | * cyl-cyl-head-head-block structure | ||
47 | */ | ||
48 | static sector_t cchhb2blk(struct vtoc_cchhb *ptr, struct hd_geometry *geo) | ||
49 | { | ||
50 | sector_t cyl; | ||
51 | __u16 head; | ||
52 | |||
53 | /* decode cylinder and heads for large volumes */ | ||
54 | cyl = ptr->hh & 0xFFF0; | ||
55 | cyl <<= 12; | ||
56 | cyl |= ptr->cc; | ||
57 | head = ptr->hh & 0x000F; | ||
58 | return cyl * geo->heads * geo->sectors + | ||
59 | head * geo->sectors + | ||
60 | ptr->b; | ||
61 | } | ||
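A worked example of the cylinder/head decoding used by both helpers above, as a hedged standalone program with illustrative geometry values: with cc = 0x0002 and hh = 0x0013, the upper twelve bits of the cylinder come from hh and the head is its low nibble.

    #include <assert.h>

    int main(void)
    {
            unsigned int cc = 0x0002, hh = 0x0013, b = 4;
            unsigned int heads = 15, sectors = 12;
            unsigned long long cyl = ((unsigned long long)(hh & 0xFFF0) << 12) | cc;
            unsigned int head = hh & 0x000F;

            assert(cyl == 0x10002 && head == 3);
            /* cchh2blk(): cylinder and head converted to a block number */
            assert(cyl * heads * sectors + head * sectors == 11796876ULL);
            /* cchhb2blk(): same, plus the block-within-track value b */
            assert(cyl * heads * sectors + head * sectors + b == 11796880ULL);
            return 0;
    }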
62 | |||
63 | static int find_label(struct parsed_partitions *state, | ||
64 | dasd_information2_t *info, | ||
65 | struct hd_geometry *geo, | ||
66 | int blocksize, | ||
67 | sector_t *labelsect, | ||
68 | char name[], | ||
69 | char type[], | ||
70 | union label_t *label) | ||
71 | { | ||
72 | Sector sect; | ||
73 | unsigned char *data; | ||
74 | sector_t testsect[3]; | ||
75 | unsigned char temp[5]; | ||
76 | int found = 0; | ||
77 | int i, testcount; | ||
78 | |||
79 | /* There are three places where we may find a valid label: | ||
80 | * - on an ECKD disk it's block 2 | ||
81 | * - on an FBA disk it's block 1 | ||
82 | * - on a CMS-formatted FBA disk it is sector 1, even if the block size | ||
83 | * is larger than 512 bytes (possible if the DIAG discipline is used) | ||
84 | * If we have a valid info structure, then we know exactly which case we | ||
85 | * have, otherwise we just search through all possibilities. | ||
86 | */ | ||
87 | if (info) { | ||
88 | if ((info->cu_type == 0x6310 && info->dev_type == 0x9336) || | ||
89 | (info->cu_type == 0x3880 && info->dev_type == 0x3370)) | ||
90 | testsect[0] = info->label_block; | ||
91 | else | ||
92 | testsect[0] = info->label_block * (blocksize >> 9); | ||
93 | testcount = 1; | ||
94 | } else { | ||
95 | testsect[0] = 1; | ||
96 | testsect[1] = (blocksize >> 9); | ||
97 | testsect[2] = 2 * (blocksize >> 9); | ||
98 | testcount = 3; | ||
99 | } | ||
100 | for (i = 0; i < testcount; ++i) { | ||
101 | data = read_part_sector(state, testsect[i], &sect); | ||
102 | if (data == NULL) | ||
103 | continue; | ||
104 | memcpy(label, data, sizeof(*label)); | ||
105 | memcpy(temp, data, 4); | ||
106 | temp[4] = 0; | ||
107 | EBCASC(temp, 4); | ||
108 | put_dev_sector(sect); | ||
109 | if (!strcmp(temp, "VOL1") || | ||
110 | !strcmp(temp, "LNX1") || | ||
111 | !strcmp(temp, "CMS1")) { | ||
112 | if (!strcmp(temp, "VOL1")) { | ||
113 | strncpy(type, label->vol.vollbl, 4); | ||
114 | strncpy(name, label->vol.volid, 6); | ||
115 | } else { | ||
116 | strncpy(type, label->lnx.vollbl, 4); | ||
117 | strncpy(name, label->lnx.volid, 6); | ||
118 | } | ||
119 | EBCASC(type, 4); | ||
120 | EBCASC(name, 6); | ||
121 | *labelsect = testsect[i]; | ||
122 | found = 1; | ||
123 | break; | ||
124 | } | ||
125 | } | ||
126 | if (!found) | ||
127 | memset(label, 0, sizeof(*label)); | ||
128 | |||
129 | return found; | ||
130 | } | ||
131 | |||
132 | static int find_vol1_partitions(struct parsed_partitions *state, | ||
133 | struct hd_geometry *geo, | ||
134 | int blocksize, | ||
135 | char name[], | ||
136 | union label_t *label) | ||
137 | { | ||
138 | sector_t blk; | ||
139 | int counter; | ||
140 | char tmp[64]; | ||
141 | Sector sect; | ||
142 | unsigned char *data; | ||
143 | loff_t offset, size; | ||
144 | struct vtoc_format1_label f1; | ||
145 | int secperblk; | ||
146 | |||
147 | snprintf(tmp, sizeof(tmp), "VOL1/%8s:", name); | ||
148 | strlcat(state->pp_buf, tmp, PAGE_SIZE); | ||
149 | /* | ||
150 | * get start of VTOC from the disk label and then search for format1 | ||
151 | * and format8 labels | ||
152 | */ | ||
153 | secperblk = blocksize >> 9; | ||
154 | blk = cchhb2blk(&label->vol.vtoc, geo) + 1; | ||
155 | counter = 0; | ||
156 | data = read_part_sector(state, blk * secperblk, &sect); | ||
157 | while (data != NULL) { | ||
158 | memcpy(&f1, data, sizeof(struct vtoc_format1_label)); | ||
159 | put_dev_sector(sect); | ||
160 | /* skip FMT4 / FMT5 / FMT7 labels */ | ||
161 | if (f1.DS1FMTID == _ascebc['4'] | ||
162 | || f1.DS1FMTID == _ascebc['5'] | ||
163 | || f1.DS1FMTID == _ascebc['7'] | ||
164 | || f1.DS1FMTID == _ascebc['9']) { | ||
165 | blk++; | ||
166 | data = read_part_sector(state, blk * secperblk, &sect); | ||
167 | continue; | ||
168 | } | ||
169 | /* only FMT1 and 8 labels valid at this point */ | ||
170 | if (f1.DS1FMTID != _ascebc['1'] && | ||
171 | f1.DS1FMTID != _ascebc['8']) | ||
172 | break; | ||
173 | /* OK, we got valid partition data */ | ||
174 | offset = cchh2blk(&f1.DS1EXT1.llimit, geo); | ||
175 | size = cchh2blk(&f1.DS1EXT1.ulimit, geo) - | ||
176 | offset + geo->sectors; | ||
177 | offset *= secperblk; | ||
178 | size *= secperblk; | ||
179 | if (counter >= state->limit) | ||
180 | break; | ||
181 | put_partition(state, counter + 1, offset, size); | ||
182 | counter++; | ||
183 | blk++; | ||
184 | data = read_part_sector(state, blk * secperblk, &sect); | ||
185 | } | ||
186 | strlcat(state->pp_buf, "\n", PAGE_SIZE); | ||
187 | |||
188 | if (!data) | ||
189 | return -1; | ||
190 | |||
191 | return 1; | ||
192 | } | ||
193 | |||
194 | static int find_lnx1_partitions(struct parsed_partitions *state, | ||
195 | struct hd_geometry *geo, | ||
196 | int blocksize, | ||
197 | char name[], | ||
198 | union label_t *label, | ||
199 | sector_t labelsect, | ||
200 | loff_t i_size, | ||
201 | dasd_information2_t *info) | ||
202 | { | ||
203 | loff_t offset, geo_size, size; | ||
204 | char tmp[64]; | ||
205 | int secperblk; | ||
206 | |||
207 | snprintf(tmp, sizeof(tmp), "LNX1/%8s:", name); | ||
208 | strlcat(state->pp_buf, tmp, PAGE_SIZE); | ||
209 | secperblk = blocksize >> 9; | ||
210 | if (label->lnx.ldl_version == 0xf2) { | ||
211 | size = label->lnx.formatted_blocks * secperblk; | ||
212 | } else { | ||
213 | /* | ||
214 | * Formatted without large volume support. If the sanity check | ||
215 | * 'size based on geo == size based on i_size' is true, then | ||
216 | * we can safely assume that we know the formatted size of | ||
217 | * the disk, otherwise we need additional information | ||
218 | * that we can only get from a real DASD device. | ||
219 | */ | ||
220 | geo_size = geo->cylinders * geo->heads | ||
221 | * geo->sectors * secperblk; | ||
222 | size = i_size >> 9; | ||
223 | if (size != geo_size) { | ||
224 | if (!info) { | ||
225 | strlcat(state->pp_buf, "\n", PAGE_SIZE); | ||
226 | return 1; | ||
227 | } | ||
228 | if (!strcmp(info->type, "ECKD")) | ||
229 | if (geo_size < size) | ||
230 | size = geo_size; | ||
231 | /* else keep size based on i_size */ | ||
232 | } | ||
233 | } | ||
234 | /* first and only partition starts in the first block after the label */ | ||
235 | offset = labelsect + secperblk; | ||
236 | put_partition(state, 1, offset, size - offset); | ||
237 | strlcat(state->pp_buf, "\n", PAGE_SIZE); | ||
238 | return 1; | ||
239 | } | ||
240 | |||
241 | static int find_cms1_partitions(struct parsed_partitions *state, | ||
242 | struct hd_geometry *geo, | ||
243 | int blocksize, | ||
244 | char name[], | ||
245 | union label_t *label, | ||
246 | sector_t labelsect) | ||
247 | { | ||
248 | loff_t offset, size; | ||
249 | char tmp[64]; | ||
250 | int secperblk; | ||
251 | |||
252 | /* | ||
253 | * VM style CMS1 labeled disk | ||
254 | */ | ||
255 | blocksize = label->cms.block_size; | ||
256 | secperblk = blocksize >> 9; | ||
257 | if (label->cms.disk_offset != 0) { | ||
258 | snprintf(tmp, sizeof(tmp), "CMS1/%8s(MDSK):", name); | ||
259 | strlcat(state->pp_buf, tmp, PAGE_SIZE); | ||
260 | /* disk is reserved minidisk */ | ||
261 | offset = label->cms.disk_offset * secperblk; | ||
262 | size = (label->cms.block_count - 1) * secperblk; | ||
263 | } else { | ||
264 | snprintf(tmp, sizeof(tmp), "CMS1/%8s:", name); | ||
265 | strlcat(state->pp_buf, tmp, PAGE_SIZE); | ||
266 | /* | ||
267 | * Special case for FBA devices: | ||
268 | * If an FBA device is CMS-formatted with a block size > 512 bytes | ||
269 | * and the DIAG discipline is used, then the CMS label is found | ||
270 | * in sector 1 instead of block 1. However, the partition is | ||
271 | * still supposed to start in block 2. | ||
272 | */ | ||
273 | if (labelsect == 1) | ||
274 | offset = 2 * secperblk; | ||
275 | else | ||
276 | offset = labelsect + secperblk; | ||
277 | size = label->cms.block_count * secperblk; | ||
278 | } | ||
279 | |||
280 | put_partition(state, 1, offset, size-offset); | ||
281 | strlcat(state->pp_buf, "\n", PAGE_SIZE); | ||
282 | return 1; | ||
283 | } | ||
284 | |||
285 | |||
286 | /* | ||
287 | * This is the main function, called by check.c | ||
288 | */ | ||
289 | int ibm_partition(struct parsed_partitions *state) | ||
290 | { | ||
291 | struct block_device *bdev = state->bdev; | ||
292 | int blocksize, res; | ||
293 | loff_t i_size, offset, size; | ||
294 | dasd_information2_t *info; | ||
295 | struct hd_geometry *geo; | ||
296 | char type[5] = {0,}; | ||
297 | char name[7] = {0,}; | ||
298 | sector_t labelsect; | ||
299 | union label_t *label; | ||
300 | |||
301 | res = 0; | ||
302 | blocksize = bdev_logical_block_size(bdev); | ||
303 | if (blocksize <= 0) | ||
304 | goto out_exit; | ||
305 | i_size = i_size_read(bdev->bd_inode); | ||
306 | if (i_size == 0) | ||
307 | goto out_exit; | ||
308 | info = kmalloc(sizeof(dasd_information2_t), GFP_KERNEL); | ||
309 | if (info == NULL) | ||
310 | goto out_exit; | ||
311 | geo = kmalloc(sizeof(struct hd_geometry), GFP_KERNEL); | ||
312 | if (geo == NULL) | ||
313 | goto out_nogeo; | ||
314 | label = kmalloc(sizeof(union label_t), GFP_KERNEL); | ||
315 | if (label == NULL) | ||
316 | goto out_nolab; | ||
317 | if (ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)geo) != 0) | ||
318 | goto out_freeall; | ||
319 | if (ioctl_by_bdev(bdev, BIODASDINFO2, (unsigned long)info) != 0) { | ||
320 | kfree(info); | ||
321 | info = NULL; | ||
322 | } | ||
323 | |||
324 | if (find_label(state, info, geo, blocksize, &labelsect, name, type, | ||
325 | label)) { | ||
326 | if (!strncmp(type, "VOL1", 4)) { | ||
327 | res = find_vol1_partitions(state, geo, blocksize, name, | ||
328 | label); | ||
329 | } else if (!strncmp(type, "LNX1", 4)) { | ||
330 | res = find_lnx1_partitions(state, geo, blocksize, name, | ||
331 | label, labelsect, i_size, | ||
332 | info); | ||
333 | } else if (!strncmp(type, "CMS1", 4)) { | ||
334 | res = find_cms1_partitions(state, geo, blocksize, name, | ||
335 | label, labelsect); | ||
336 | } | ||
337 | } else if (info) { | ||
338 | /* | ||
339 | * ugly but needed for backward compatibility: | ||
340 | * If the block device is a DASD (i.e. BIODASDINFO2 works), | ||
341 | * then we claim it in any case, even though it has no valid | ||
342 | * label. If it has the LDL format, then we simply define a | ||
343 | * partition as if it had an LNX1 label. | ||
344 | */ | ||
345 | res = 1; | ||
346 | if (info->format == DASD_FORMAT_LDL) { | ||
347 | strlcat(state->pp_buf, "(nonl)", PAGE_SIZE); | ||
348 | size = i_size >> 9; | ||
349 | offset = (info->label_block + 1) * (blocksize >> 9); | ||
350 | put_partition(state, 1, offset, size-offset); | ||
351 | strlcat(state->pp_buf, "\n", PAGE_SIZE); | ||
352 | } | ||
353 | } else | ||
354 | res = 0; | ||
355 | |||
356 | out_freeall: | ||
357 | kfree(label); | ||
358 | out_nolab: | ||
359 | kfree(geo); | ||
360 | out_nogeo: | ||
361 | kfree(info); | ||
362 | out_exit: | ||
363 | return res; | ||
364 | } | ||
diff --git a/block/partitions/ibm.h b/block/partitions/ibm.h deleted file mode 100644 index 08fb0804a81..00000000000 --- a/block/partitions/ibm.h +++ /dev/null | |||
@@ -1 +0,0 @@ | |||
1 | int ibm_partition(struct parsed_partitions *); | ||
diff --git a/block/partitions/karma.c b/block/partitions/karma.c deleted file mode 100644 index 0ea19312706..00000000000 --- a/block/partitions/karma.c +++ /dev/null | |||
@@ -1,57 +0,0 @@ | |||
1 | /* | ||
2 | * fs/partitions/karma.c | ||
3 | * Rio Karma partition info. | ||
4 | * | ||
5 | * Copyright (C) 2006 Bob Copeland (me@bobcopeland.com) | ||
6 | * based on osf.c | ||
7 | */ | ||
8 | |||
9 | #include "check.h" | ||
10 | #include "karma.h" | ||
11 | |||
12 | int karma_partition(struct parsed_partitions *state) | ||
13 | { | ||
14 | int i; | ||
15 | int slot = 1; | ||
16 | Sector sect; | ||
17 | unsigned char *data; | ||
18 | struct disklabel { | ||
19 | u8 d_reserved[270]; | ||
20 | struct d_partition { | ||
21 | __le32 p_res; | ||
22 | u8 p_fstype; | ||
23 | u8 p_res2[3]; | ||
24 | __le32 p_offset; | ||
25 | __le32 p_size; | ||
26 | } d_partitions[2]; | ||
27 | u8 d_blank[208]; | ||
28 | __le16 d_magic; | ||
29 | } __attribute__((packed)) *label; | ||
30 | struct d_partition *p; | ||
31 | |||
32 | data = read_part_sector(state, 0, &sect); | ||
33 | if (!data) | ||
34 | return -1; | ||
35 | |||
36 | label = (struct disklabel *)data; | ||
37 | if (le16_to_cpu(label->d_magic) != KARMA_LABEL_MAGIC) { | ||
38 | put_dev_sector(sect); | ||
39 | return 0; | ||
40 | } | ||
41 | |||
42 | p = label->d_partitions; | ||
43 | for (i = 0 ; i < 2; i++, p++) { | ||
44 | if (slot == state->limit) | ||
45 | break; | ||
46 | |||
47 | if (p->p_fstype == 0x4d && le32_to_cpu(p->p_size)) { | ||
48 | put_partition(state, slot, le32_to_cpu(p->p_offset), | ||
49 | le32_to_cpu(p->p_size)); | ||
50 | } | ||
51 | slot++; | ||
52 | } | ||
53 | strlcat(state->pp_buf, "\n", PAGE_SIZE); | ||
54 | put_dev_sector(sect); | ||
55 | return 1; | ||
56 | } | ||
57 | |||
diff --git a/block/partitions/karma.h b/block/partitions/karma.h deleted file mode 100644 index c764b2e9df2..00000000000 --- a/block/partitions/karma.h +++ /dev/null | |||
@@ -1,8 +0,0 @@ | |||
1 | /* | ||
2 | * fs/partitions/karma.h | ||
3 | */ | ||
4 | |||
5 | #define KARMA_LABEL_MAGIC 0xAB56 | ||
6 | |||
7 | int karma_partition(struct parsed_partitions *state); | ||
8 | |||
diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c deleted file mode 100644 index e507cfbd044..00000000000 --- a/block/partitions/ldm.c +++ /dev/null | |||
@@ -1,1567 +0,0 @@ | |||
1 | /** | ||
2 | * ldm - Support for Windows Logical Disk Manager (Dynamic Disks) | ||
3 | * | ||
4 | * Copyright (C) 2001,2002 Richard Russon <ldm@flatcap.org> | ||
5 | * Copyright (c) 2001-2012 Anton Altaparmakov | ||
6 | * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com> | ||
7 | * | ||
8 | * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or modify it under | ||
11 | * the terms of the GNU General Public License as published by the Free Software | ||
12 | * Foundation; either version 2 of the License, or (at your option) any later | ||
13 | * version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
16 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | ||
17 | * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more | ||
18 | * details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public License along with | ||
21 | * this program (in the main directory of the source in the file COPYING); if | ||
22 | * not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, | ||
23 | * Boston, MA 02111-1307 USA | ||
24 | */ | ||
25 | |||
26 | #include <linux/slab.h> | ||
27 | #include <linux/pagemap.h> | ||
28 | #include <linux/stringify.h> | ||
29 | #include <linux/kernel.h> | ||
30 | #include "ldm.h" | ||
31 | #include "check.h" | ||
32 | #include "msdos.h" | ||
33 | |||
34 | /** | ||
35 | * ldm_debug/info/error/crit - Output an error message | ||
36 | * @f: A printf format string containing the message | ||
37 | * @...: Variables to substitute into @f | ||
38 | * | ||
39 | * ldm_debug() writes a DEBUG level message to the syslog but only if the | ||
40 | * driver was compiled with debug enabled. Otherwise, the call turns into a NOP. | ||
41 | */ | ||
42 | #ifndef CONFIG_LDM_DEBUG | ||
43 | #define ldm_debug(...) do {} while (0) | ||
44 | #else | ||
45 | #define ldm_debug(f, a...) _ldm_printk (KERN_DEBUG, __func__, f, ##a) | ||
46 | #endif | ||
47 | |||
48 | #define ldm_crit(f, a...) _ldm_printk (KERN_CRIT, __func__, f, ##a) | ||
49 | #define ldm_error(f, a...) _ldm_printk (KERN_ERR, __func__, f, ##a) | ||
50 | #define ldm_info(f, a...) _ldm_printk (KERN_INFO, __func__, f, ##a) | ||
51 | |||
52 | static __printf(3, 4) | ||
53 | void _ldm_printk(const char *level, const char *function, const char *fmt, ...) | ||
54 | { | ||
55 | struct va_format vaf; | ||
56 | va_list args; | ||
57 | |||
58 | va_start (args, fmt); | ||
59 | |||
60 | vaf.fmt = fmt; | ||
61 | vaf.va = &args; | ||
62 | |||
63 | printk("%s%s(): %pV\n", level, function, &vaf); | ||
64 | |||
65 | va_end(args); | ||
66 | } | ||
67 | |||
68 | /** | ||
69 | * ldm_parse_hexbyte - Convert an ASCII hex number to a byte | ||
70 | * @src: Pointer to at least 2 characters to convert. | ||
71 | * | ||
72 | * Convert a two character ASCII hex string to a number. | ||
73 | * | ||
74 | * Return: 0-255 Success, the byte was parsed correctly | ||
75 | * -1 Error, an invalid character was supplied | ||
76 | */ | ||
77 | static int ldm_parse_hexbyte (const u8 *src) | ||
78 | { | ||
79 | unsigned int x; /* For correct wrapping */ | ||
80 | int h; | ||
81 | |||
82 | /* high part */ | ||
83 | x = h = hex_to_bin(src[0]); | ||
84 | if (h < 0) | ||
85 | return -1; | ||
86 | |||
87 | /* low part */ | ||
88 | h = hex_to_bin(src[1]); | ||
89 | if (h < 0) | ||
90 | return -1; | ||
91 | |||
92 | return (x << 4) + h; | ||
93 | } | ||
94 | |||
95 | /** | ||
96 | * ldm_parse_guid - Convert GUID from ASCII to binary | ||
97 | * @src: 36 char string of the form fa50ff2b-f2e8-45de-83fa-65417f2f49ba | ||
98 | * @dest: Memory block to hold binary GUID (16 bytes) | ||
99 | * | ||
100 | * N.B. The GUID need not be NULL terminated. | ||
101 | * | ||
102 | * Return: 'true' @dest contains binary GUID | ||
103 | * 'false' @dest contents are undefined | ||
104 | */ | ||
105 | static bool ldm_parse_guid (const u8 *src, u8 *dest) | ||
106 | { | ||
107 | static const int size[] = { 4, 2, 2, 2, 6 }; | ||
108 | int i, j, v; | ||
109 | |||
110 | if (src[8] != '-' || src[13] != '-' || | ||
111 | src[18] != '-' || src[23] != '-') | ||
112 | return false; | ||
113 | |||
114 | for (j = 0; j < 5; j++, src++) | ||
115 | for (i = 0; i < size[j]; i++, src+=2, *dest++ = v) | ||
116 | if ((v = ldm_parse_hexbyte (src)) < 0) | ||
117 | return false; | ||
118 | |||
119 | return true; | ||
120 | } | ||
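A hedged usage sketch: the text below is a hypothetical GUID string as it appears in the database; on success the sixteen raw bytes land in guid[] in the same order as the hex digits, with no endianness fix-up.

    u8 guid[GUID_SIZE];
    const u8 *text = (const u8 *)"fa50ff2b-f2e8-45de-83fa-65417f2f49ba";

    if (!ldm_parse_guid(text, guid))
            ldm_error("invalid GUID string");
    /* else guid[0] == 0xfa, guid[1] == 0x50, ..., guid[15] == 0xba */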
121 | |||
122 | /** | ||
123 | * ldm_parse_privhead - Read the LDM Database PRIVHEAD structure | ||
124 | * @data: Raw database PRIVHEAD structure loaded from the device | ||
125 | * @ph: In-memory privhead structure in which to return parsed information | ||
126 | * | ||
127 | * This parses the LDM database PRIVHEAD structure supplied in @data and | ||
128 | * sets up the in-memory privhead structure @ph with the obtained information. | ||
129 | * | ||
130 | * Return: 'true' @ph contains the PRIVHEAD data | ||
131 | * 'false' @ph contents are undefined | ||
132 | */ | ||
133 | static bool ldm_parse_privhead(const u8 *data, struct privhead *ph) | ||
134 | { | ||
135 | bool is_vista = false; | ||
136 | |||
137 | BUG_ON(!data || !ph); | ||
138 | if (MAGIC_PRIVHEAD != get_unaligned_be64(data)) { | ||
139 | ldm_error("Cannot find PRIVHEAD structure. LDM database is" | ||
140 | " corrupt. Aborting."); | ||
141 | return false; | ||
142 | } | ||
143 | ph->ver_major = get_unaligned_be16(data + 0x000C); | ||
144 | ph->ver_minor = get_unaligned_be16(data + 0x000E); | ||
145 | ph->logical_disk_start = get_unaligned_be64(data + 0x011B); | ||
146 | ph->logical_disk_size = get_unaligned_be64(data + 0x0123); | ||
147 | ph->config_start = get_unaligned_be64(data + 0x012B); | ||
148 | ph->config_size = get_unaligned_be64(data + 0x0133); | ||
149 | /* Version 2.11 is Win2k/XP and version 2.12 is Vista. */ | ||
150 | if (ph->ver_major == 2 && ph->ver_minor == 12) | ||
151 | is_vista = true; | ||
152 | if (!is_vista && (ph->ver_major != 2 || ph->ver_minor != 11)) { | ||
153 | ldm_error("Expected PRIVHEAD version 2.11 or 2.12, got %d.%d." | ||
154 | " Aborting.", ph->ver_major, ph->ver_minor); | ||
155 | return false; | ||
156 | } | ||
157 | ldm_debug("PRIVHEAD version %d.%d (Windows %s).", ph->ver_major, | ||
158 | ph->ver_minor, is_vista ? "Vista" : "2000/XP"); | ||
159 | if (ph->config_size != LDM_DB_SIZE) { /* 1 MiB in sectors. */ | ||
160 | /* Warn the user and continue, carefully. */ | ||
161 | ldm_info("Database is normally %u bytes, it claims to " | ||
162 | "be %llu bytes.", LDM_DB_SIZE, | ||
163 | (unsigned long long)ph->config_size); | ||
164 | } | ||
165 | if ((ph->logical_disk_size == 0) || (ph->logical_disk_start + | ||
166 | ph->logical_disk_size > ph->config_start)) { | ||
167 | ldm_error("PRIVHEAD disk size doesn't match real disk size"); | ||
168 | return false; | ||
169 | } | ||
170 | if (!ldm_parse_guid(data + 0x0030, ph->disk_id)) { | ||
171 | ldm_error("PRIVHEAD contains an invalid GUID."); | ||
172 | return false; | ||
173 | } | ||
174 | ldm_debug("Parsed PRIVHEAD successfully."); | ||
175 | return true; | ||
176 | } | ||
177 | |||
178 | /** | ||
179 | * ldm_parse_tocblock - Read the LDM Database TOCBLOCK structure | ||
180 | * @data: Raw database TOCBLOCK structure loaded from the device | ||
181 | * @toc: In-memory toc structure in which to return parsed information | ||
182 | * | ||
183 | * This parses the LDM Database TOCBLOCK (table of contents) structure supplied | ||
184 | * in @data and sets up the in-memory tocblock structure @toc with the obtained | ||
185 | * information. | ||
186 | * | ||
187 | * N.B. The *_start and *_size values returned in @toc are not range-checked. | ||
188 | * | ||
189 | * Return: 'true' @toc contains the TOCBLOCK data | ||
190 | * 'false' @toc contents are undefined | ||
191 | */ | ||
192 | static bool ldm_parse_tocblock (const u8 *data, struct tocblock *toc) | ||
193 | { | ||
194 | BUG_ON (!data || !toc); | ||
195 | |||
196 | if (MAGIC_TOCBLOCK != get_unaligned_be64(data)) { | ||
197 | ldm_crit ("Cannot find TOCBLOCK, database may be corrupt."); | ||
198 | return false; | ||
199 | } | ||
200 | strncpy (toc->bitmap1_name, data + 0x24, sizeof (toc->bitmap1_name)); | ||
201 | toc->bitmap1_name[sizeof (toc->bitmap1_name) - 1] = 0; | ||
202 | toc->bitmap1_start = get_unaligned_be64(data + 0x2E); | ||
203 | toc->bitmap1_size = get_unaligned_be64(data + 0x36); | ||
204 | |||
205 | if (strncmp (toc->bitmap1_name, TOC_BITMAP1, | ||
206 | sizeof (toc->bitmap1_name)) != 0) { | ||
207 | ldm_crit ("TOCBLOCK's first bitmap is '%s', should be '%s'.", | ||
208 | toc->bitmap1_name, TOC_BITMAP1); | ||
209 | return false; | ||
210 | } | ||
211 | strncpy (toc->bitmap2_name, data + 0x46, sizeof (toc->bitmap2_name)); | ||
212 | toc->bitmap2_name[sizeof (toc->bitmap2_name) - 1] = 0; | ||
213 | toc->bitmap2_start = get_unaligned_be64(data + 0x50); | ||
214 | toc->bitmap2_size = get_unaligned_be64(data + 0x58); | ||
215 | if (strncmp (toc->bitmap2_name, TOC_BITMAP2, | ||
216 | sizeof (toc->bitmap2_name)) != 0) { | ||
217 | ldm_crit ("TOCBLOCK's second bitmap is '%s', should be '%s'.", | ||
218 | toc->bitmap2_name, TOC_BITMAP2); | ||
219 | return false; | ||
220 | } | ||
221 | ldm_debug ("Parsed TOCBLOCK successfully."); | ||
222 | return true; | ||
223 | } | ||
224 | |||
225 | /** | ||
226 | * ldm_parse_vmdb - Read the LDM Database VMDB structure | ||
227 | * @data: Raw database VMDB structure loaded from the device | ||
228 | * @vm: In-memory vmdb structure in which to return parsed information | ||
229 | * | ||
230 | * This parses the LDM Database VMDB structure supplied in @data and sets up | ||
231 | * the in-memory vmdb structure @vm with the obtained information. | ||
232 | * | ||
233 | * N.B. The *_start, *_size and *_seq values will be range-checked later. | ||
234 | * | ||
235 | * Return: 'true' @vm contains VMDB info | ||
236 | * 'false' @vm contents are undefined | ||
237 | */ | ||
238 | static bool ldm_parse_vmdb (const u8 *data, struct vmdb *vm) | ||
239 | { | ||
240 | BUG_ON (!data || !vm); | ||
241 | |||
242 | if (MAGIC_VMDB != get_unaligned_be32(data)) { | ||
243 | ldm_crit ("Cannot find the VMDB, database may be corrupt."); | ||
244 | return false; | ||
245 | } | ||
246 | |||
247 | vm->ver_major = get_unaligned_be16(data + 0x12); | ||
248 | vm->ver_minor = get_unaligned_be16(data + 0x14); | ||
249 | if ((vm->ver_major != 4) || (vm->ver_minor != 10)) { | ||
250 | ldm_error ("Expected VMDB version %d.%d, got %d.%d. " | ||
251 | "Aborting.", 4, 10, vm->ver_major, vm->ver_minor); | ||
252 | return false; | ||
253 | } | ||
254 | |||
255 | vm->vblk_size = get_unaligned_be32(data + 0x08); | ||
256 | if (vm->vblk_size == 0) { | ||
257 | ldm_error ("Illegal VBLK size"); | ||
258 | return false; | ||
259 | } | ||
260 | |||
261 | vm->vblk_offset = get_unaligned_be32(data + 0x0C); | ||
262 | vm->last_vblk_seq = get_unaligned_be32(data + 0x04); | ||
263 | |||
264 | ldm_debug ("Parsed VMDB successfully."); | ||
265 | return true; | ||
266 | } | ||
267 | |||
268 | /** | ||
269 | * ldm_compare_privheads - Compare two privhead objects | ||
270 | * @ph1: First privhead | ||
271 | * @ph2: Second privhead | ||
272 | * | ||
273 | * This compares the two privhead structures @ph1 and @ph2. | ||
274 | * | ||
275 | * Return: 'true' Identical | ||
276 | * 'false' Different | ||
277 | */ | ||
278 | static bool ldm_compare_privheads (const struct privhead *ph1, | ||
279 | const struct privhead *ph2) | ||
280 | { | ||
281 | BUG_ON (!ph1 || !ph2); | ||
282 | |||
283 | return ((ph1->ver_major == ph2->ver_major) && | ||
284 | (ph1->ver_minor == ph2->ver_minor) && | ||
285 | (ph1->logical_disk_start == ph2->logical_disk_start) && | ||
286 | (ph1->logical_disk_size == ph2->logical_disk_size) && | ||
287 | (ph1->config_start == ph2->config_start) && | ||
288 | (ph1->config_size == ph2->config_size) && | ||
289 | !memcmp (ph1->disk_id, ph2->disk_id, GUID_SIZE)); | ||
290 | } | ||
291 | |||
292 | /** | ||
293 | * ldm_compare_tocblocks - Compare two tocblock objects | ||
294 | * @toc1: First toc | ||
295 | * @toc2: Second toc | ||
296 | * | ||
297 | * This compares the two tocblock structures @toc1 and @toc2. | ||
298 | * | ||
299 | * Return: 'true' Identical | ||
300 | * 'false' Different | ||
301 | */ | ||
302 | static bool ldm_compare_tocblocks (const struct tocblock *toc1, | ||
303 | const struct tocblock *toc2) | ||
304 | { | ||
305 | BUG_ON (!toc1 || !toc2); | ||
306 | |||
307 | return ((toc1->bitmap1_start == toc2->bitmap1_start) && | ||
308 | (toc1->bitmap1_size == toc2->bitmap1_size) && | ||
309 | (toc1->bitmap2_start == toc2->bitmap2_start) && | ||
310 | (toc1->bitmap2_size == toc2->bitmap2_size) && | ||
311 | !strncmp (toc1->bitmap1_name, toc2->bitmap1_name, | ||
312 | sizeof (toc1->bitmap1_name)) && | ||
313 | !strncmp (toc1->bitmap2_name, toc2->bitmap2_name, | ||
314 | sizeof (toc1->bitmap2_name))); | ||
315 | } | ||
316 | |||
317 | /** | ||
318 | * ldm_validate_privheads - Compare the primary privhead with its backups | ||
319 | * @state: Partition check state including device holding the LDM Database | ||
320 | * @ph1: Memory struct to fill with ph contents | ||
321 | * | ||
322 | * Read and compare all three privheads from disk. | ||
323 | * | ||
324 | * The privheads on disk show the size and location of the main disk area and | ||
325 | * the configuration area (the database). The values are range-checked against | ||
326 | * the real size of the disk, which is obtained from @state->bdev. | ||
327 | * | ||
328 | * Return: 'true' Success | ||
329 | * 'false' Error | ||
330 | */ | ||
331 | static bool ldm_validate_privheads(struct parsed_partitions *state, | ||
332 | struct privhead *ph1) | ||
333 | { | ||
334 | static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 }; | ||
335 | struct privhead *ph[3] = { ph1 }; | ||
336 | Sector sect; | ||
337 | u8 *data; | ||
338 | bool result = false; | ||
339 | long num_sects; | ||
340 | int i; | ||
341 | |||
342 | BUG_ON (!state || !ph1); | ||
343 | |||
344 | ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL); | ||
345 | ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL); | ||
346 | if (!ph[1] || !ph[2]) { | ||
347 | ldm_crit ("Out of memory."); | ||
348 | goto out; | ||
349 | } | ||
350 | |||
351 | /* off[1 & 2] are relative to ph[0]->config_start */ | ||
352 | ph[0]->config_start = 0; | ||
353 | |||
354 | /* Read and parse privheads */ | ||
355 | for (i = 0; i < 3; i++) { | ||
356 | data = read_part_sector(state, ph[0]->config_start + off[i], | ||
357 | &sect); | ||
358 | if (!data) { | ||
359 | ldm_crit ("Disk read failed."); | ||
360 | goto out; | ||
361 | } | ||
362 | result = ldm_parse_privhead (data, ph[i]); | ||
363 | put_dev_sector (sect); | ||
364 | if (!result) { | ||
365 | ldm_error ("Cannot find PRIVHEAD %d.", i+1); /* Log again */ | ||
366 | if (i < 2) | ||
367 | goto out; /* Already logged */ | ||
368 | else | ||
369 | break; /* FIXME ignore for now, 3rd PH can fail on odd-sized disks */ | ||
370 | } | ||
371 | } | ||
372 | |||
373 | num_sects = state->bdev->bd_inode->i_size >> 9; | ||
374 | |||
375 | if ((ph[0]->config_start > num_sects) || | ||
376 | ((ph[0]->config_start + ph[0]->config_size) > num_sects)) { | ||
377 | ldm_crit ("Database extends beyond the end of the disk."); | ||
378 | goto out; | ||
379 | } | ||
380 | |||
381 | if ((ph[0]->logical_disk_start > ph[0]->config_start) || | ||
382 | ((ph[0]->logical_disk_start + ph[0]->logical_disk_size) | ||
383 | > ph[0]->config_start)) { | ||
384 | ldm_crit ("Disk and database overlap."); | ||
385 | goto out; | ||
386 | } | ||
387 | |||
388 | if (!ldm_compare_privheads (ph[0], ph[1])) { | ||
389 | ldm_crit ("Primary and backup PRIVHEADs don't match."); | ||
390 | goto out; | ||
391 | } | ||
392 | /* FIXME ignore this for now | ||
393 | if (!ldm_compare_privheads (ph[0], ph[2])) { | ||
394 | ldm_crit ("Primary and backup PRIVHEADs don't match."); | ||
395 | goto out; | ||
396 | }*/ | ||
397 | ldm_debug ("Validated PRIVHEADs successfully."); | ||
398 | result = true; | ||
399 | out: | ||
400 | kfree (ph[1]); | ||
401 | kfree (ph[2]); | ||
402 | return result; | ||
403 | } | ||
404 | |||
405 | /** | ||
406 | * ldm_validate_tocblocks - Validate the table of contents and its backups | ||
407 | * @state: Partition check state including device holding the LDM Database | ||
408 | * @base: Offset, into @state->bdev, of the database | ||
409 | * @ldb: Cache of the database structures | ||
410 | * | ||
411 | * Find and compare the four tables of contents of the LDM Database stored on | ||
412 | * @state->bdev and return the parsed information in @ldb->toc. | ||
413 | * | ||
414 | * The offsets and sizes of the configs are range-checked against a privhead. | ||
415 | * | ||
416 | * Return: 'true' @ldb->toc contains validated TOCBLOCK info | ||
417 | * 'false' @ldb->toc contents are undefined | ||
418 | */ | ||
419 | static bool ldm_validate_tocblocks(struct parsed_partitions *state, | ||
420 | unsigned long base, struct ldmdb *ldb) | ||
421 | { | ||
422 | static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4}; | ||
423 | struct tocblock *tb[4]; | ||
424 | struct privhead *ph; | ||
425 | Sector sect; | ||
426 | u8 *data; | ||
427 | int i, nr_tbs; | ||
428 | bool result = false; | ||
429 | |||
430 | BUG_ON(!state || !ldb); | ||
431 | ph = &ldb->ph; | ||
432 | tb[0] = &ldb->toc; | ||
433 | tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL); | ||
434 | if (!tb[1]) { | ||
435 | ldm_crit("Out of memory."); | ||
436 | goto err; | ||
437 | } | ||
438 | tb[2] = (struct tocblock*)((u8*)tb[1] + sizeof(*tb[1])); | ||
439 | tb[3] = (struct tocblock*)((u8*)tb[2] + sizeof(*tb[2])); | ||
440 | /* | ||
441 | * Try to read and parse all four TOCBLOCKs. | ||
442 | * | ||
443 | * Windows Vista LDM v2.12 does not always have all four TOCBLOCKs so | ||
444 | * skip any that fail as long as we get at least one valid TOCBLOCK. | ||
445 | */ | ||
446 | for (nr_tbs = i = 0; i < 4; i++) { | ||
447 | data = read_part_sector(state, base + off[i], &sect); | ||
448 | if (!data) { | ||
449 | ldm_error("Disk read failed for TOCBLOCK %d.", i); | ||
450 | continue; | ||
451 | } | ||
452 | if (ldm_parse_tocblock(data, tb[nr_tbs])) | ||
453 | nr_tbs++; | ||
454 | put_dev_sector(sect); | ||
455 | } | ||
456 | if (!nr_tbs) { | ||
457 | ldm_crit("Failed to find a valid TOCBLOCK."); | ||
458 | goto err; | ||
459 | } | ||
460 | /* Range check the TOCBLOCK against a privhead. */ | ||
461 | if (((tb[0]->bitmap1_start + tb[0]->bitmap1_size) > ph->config_size) || | ||
462 | ((tb[0]->bitmap2_start + tb[0]->bitmap2_size) > | ||
463 | ph->config_size)) { | ||
464 | ldm_crit("The bitmaps are out of range. Giving up."); | ||
465 | goto err; | ||
466 | } | ||
467 | /* Compare all loaded TOCBLOCKs. */ | ||
468 | for (i = 1; i < nr_tbs; i++) { | ||
469 | if (!ldm_compare_tocblocks(tb[0], tb[i])) { | ||
470 | ldm_crit("TOCBLOCKs 0 and %d do not match.", i); | ||
471 | goto err; | ||
472 | } | ||
473 | } | ||
474 | ldm_debug("Validated %d TOCBLOCKs successfully.", nr_tbs); | ||
475 | result = true; | ||
476 | err: | ||
477 | kfree(tb[1]); | ||
478 | return result; | ||
479 | } | ||
480 | |||
481 | /** | ||
482 | * ldm_validate_vmdb - Read the VMDB and validate it | ||
483 | * @state: Partition check state including device holding the LDM Database | ||
484 | * @base: Offset, into @bdev, of the database | ||
485 | * @ldb: Cache of the database structures | ||
486 | * | ||
487 | * Find the vmdb of the LDM Database stored on @bdev and return the parsed | ||
488 | * information in @ldb. | ||
489 | * | ||
490 | * Return: 'true' @ldb contains validated VMDB info | ||
491 | * 'false' @ldb contents are undefined | ||
492 | */ | ||
493 | static bool ldm_validate_vmdb(struct parsed_partitions *state, | ||
494 | unsigned long base, struct ldmdb *ldb) | ||
495 | { | ||
496 | Sector sect; | ||
497 | u8 *data; | ||
498 | bool result = false; | ||
499 | struct vmdb *vm; | ||
500 | struct tocblock *toc; | ||
501 | |||
502 | BUG_ON (!state || !ldb); | ||
503 | |||
504 | vm = &ldb->vm; | ||
505 | toc = &ldb->toc; | ||
506 | |||
507 | data = read_part_sector(state, base + OFF_VMDB, &sect); | ||
508 | if (!data) { | ||
509 | ldm_crit ("Disk read failed."); | ||
510 | return false; | ||
511 | } | ||
512 | |||
513 | if (!ldm_parse_vmdb (data, vm)) | ||
514 | goto out; /* Already logged */ | ||
515 | |||
516 | /* Are there uncommitted transactions? */ | ||
517 | if (get_unaligned_be16(data + 0x10) != 0x01) { | ||
518 | ldm_crit ("Database is not in a consistent state. Aborting."); | ||
519 | goto out; | ||
520 | } | ||
521 | |||
522 | if (vm->vblk_offset != 512) | ||
523 | ldm_info ("VBLKs start at offset 0x%04x.", vm->vblk_offset); | ||
524 | |||
525 | /* | ||
526 | * The last_vblk_seq can be before the end of the vmdb, just make sure | ||
527 | * it is not out of bounds. | ||
528 | */ | ||
529 | if ((vm->vblk_size * vm->last_vblk_seq) > (toc->bitmap1_size << 9)) { | ||
530 | ldm_crit ("VMDB exceeds allowed size specified by TOCBLOCK. " | ||
531 | "Database is corrupt. Aborting."); | ||
532 | goto out; | ||
533 | } | ||
534 | |||
535 | result = true; | ||
536 | out: | ||
537 | put_dev_sector (sect); | ||
538 | return result; | ||
539 | } | ||
540 | |||
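To see why bitmap1_size is shifted left by nine bits above: the TOCBLOCK stores sizes in sectors, while vblk_size * last_vblk_seq is in bytes, so the shift converts sectors to bytes before the comparison. With hypothetical values vblk_size = 128, last_vblk_seq = 2048 and bitmap1_size = 2046 sectors, the VBLK area is 128 * 2048 = 262144 bytes, well inside 2046 << 9 = 1047552 bytes, and the check passes.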
541 | |||
542 | /** | ||
543 | * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk | ||
544 | * @state: Partition check state including device holding the LDM Database | ||
545 | * | ||
546 | * This function provides a weak test to decide whether the device is a dynamic | ||
547 | * disk or not. It looks for an MS-DOS-style partition table containing at | ||
548 | * least one partition of type 0x42 (formerly SFS, now used by Windows for | ||
549 | * dynamic disks). | ||
550 | * | ||
551 | * N.B. The only possible error can come from read_part_sector() and that is | ||
552 | * only likely to happen if the underlying device is strange. If that IS | ||
553 | * the case we should return zero to let someone else try. | ||
554 | * | ||
555 | * Return: 'true' @state->bdev is a dynamic disk | ||
556 | * 'false' @state->bdev is not a dynamic disk, or an error occurred | ||
557 | */ | ||
558 | static bool ldm_validate_partition_table(struct parsed_partitions *state) | ||
559 | { | ||
560 | Sector sect; | ||
561 | u8 *data; | ||
562 | struct partition *p; | ||
563 | int i; | ||
564 | bool result = false; | ||
565 | |||
566 | BUG_ON(!state); | ||
567 | |||
568 | data = read_part_sector(state, 0, &sect); | ||
569 | if (!data) { | ||
570 | ldm_info ("Disk read failed."); | ||
571 | return false; | ||
572 | } | ||
573 | |||
574 | if (*(__le16*) (data + 0x01FE) != cpu_to_le16 (MSDOS_LABEL_MAGIC)) | ||
575 | goto out; | ||
576 | |||
577 | p = (struct partition*)(data + 0x01BE); | ||
578 | for (i = 0; i < 4; i++, p++) | ||
579 | if (SYS_IND (p) == LDM_PARTITION) { | ||
580 | result = true; | ||
581 | break; | ||
582 | } | ||
583 | |||
584 | if (result) | ||
585 | ldm_debug ("Found W2K dynamic disk partition type."); | ||
586 | |||
587 | out: | ||
588 | put_dev_sector (sect); | ||
589 | return result; | ||
590 | } | ||
591 | |||
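In isolation, the test above is two checks on sector 0: the 0x55 0xAA boot signature at offset 0x1FE and a type byte of 0x42 in one of the four 16-byte MBR entries that start at 0x1BE. A self-contained sketch of the same check (user-space C; illustrative, it reads raw bytes rather than walking struct partition as the kernel does):

    #include <stdbool.h>
    #include <stdint.h>

    static bool sector0_looks_dynamic(const uint8_t sect[512])
    {
            int i;

            if (sect[0x1FE] != 0x55 || sect[0x1FF] != 0xAA)
                    return false;           /* no MS-DOS partition table */
            for (i = 0; i < 4; i++) {
                    const uint8_t *entry = sect + 0x1BE + 16 * i;
                    if (entry[4] == 0x42)   /* system indicator: dynamic disk */
                            return true;
            }
            return false;
    }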
592 | /** | ||
593 | * ldm_get_disk_objid - Search a linked list of vblk's for a given Disk Id | ||
594 | * @ldb: Cache of the database structures | ||
595 | * | ||
596 | * The LDM Database contains a list of all partitions on all dynamic disks. | ||
597 | * The primary PRIVHEAD, at the beginning of the physical disk, tells us | ||
598 | * the GUID of this disk. This function searches for the GUID in a linked | ||
599 | * list of vblk's. | ||
600 | * | ||
601 | * Return: Pointer, A matching vblk was found | ||
602 | * NULL, No match, or an error | ||
603 | */ | ||
604 | static struct vblk * ldm_get_disk_objid (const struct ldmdb *ldb) | ||
605 | { | ||
606 | struct list_head *item; | ||
607 | |||
608 | BUG_ON (!ldb); | ||
609 | |||
610 | list_for_each (item, &ldb->v_disk) { | ||
611 | struct vblk *v = list_entry (item, struct vblk, list); | ||
612 | if (!memcmp (v->vblk.disk.disk_id, ldb->ph.disk_id, GUID_SIZE)) | ||
613 | return v; | ||
614 | } | ||
615 | |||
616 | return NULL; | ||
617 | } | ||
618 | |||
619 | /** | ||
620 | * ldm_create_data_partitions - Create data partitions for this device | ||
621 | * @pp: List of the partitions parsed so far | ||
622 | * @ldb: Cache of the database structures | ||
623 | * | ||
624 | * The database contains ALL the partitions for ALL disk groups, so we need to | ||
625 | * filter out this specific disk. Using the disk's object id, we can find all | ||
626 | * the partitions in the database that belong to this disk. | ||
627 | * | ||
628 | * Add each partition in our database, to the parsed_partitions structure. | ||
629 | * | ||
630 | * N.B. This function creates the partitions in the order it finds partition | ||
631 | * objects in the linked list. | ||
632 | * | ||
633 | * Return: 'true' Partition created | ||
634 | * 'false' Error, probably a range checking problem | ||
635 | */ | ||
636 | static bool ldm_create_data_partitions (struct parsed_partitions *pp, | ||
637 | const struct ldmdb *ldb) | ||
638 | { | ||
639 | struct list_head *item; | ||
640 | struct vblk *vb; | ||
641 | struct vblk *disk; | ||
642 | struct vblk_part *part; | ||
643 | int part_num = 1; | ||
644 | |||
645 | BUG_ON (!pp || !ldb); | ||
646 | |||
647 | disk = ldm_get_disk_objid (ldb); | ||
648 | if (!disk) { | ||
649 | ldm_crit ("Can't find the ID of this disk in the database."); | ||
650 | return false; | ||
651 | } | ||
652 | |||
653 | strlcat(pp->pp_buf, " [LDM]", PAGE_SIZE); | ||
654 | |||
655 | /* Create the data partitions */ | ||
656 | list_for_each (item, &ldb->v_part) { | ||
657 | vb = list_entry (item, struct vblk, list); | ||
658 | part = &vb->vblk.part; | ||
659 | |||
660 | if (part->disk_id != disk->obj_id) | ||
661 | continue; | ||
662 | |||
663 | put_partition (pp, part_num, ldb->ph.logical_disk_start + | ||
664 | part->start, part->size); | ||
665 | part_num++; | ||
666 | } | ||
667 | |||
668 | strlcat(pp->pp_buf, "\n", PAGE_SIZE); | ||
669 | return true; | ||
670 | } | ||
671 | |||
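The @start stored in each partition VBLK is relative to the start of the disk's data area, which is why put_partition() adds logical_disk_start. With hypothetical values logical_disk_start = 63, part->start = 2048 and part->size = 409600, the entry registered above covers absolute sectors 63 + 2048 = 2111 through 2111 + 409600 - 1 = 411710.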
672 | |||
673 | /** | ||
674 | * ldm_relative - Calculate the next relative offset | ||
675 | * @buffer: Block of data being worked on | ||
676 | * @buflen: Size of the block of data | ||
677 | * @base: Size of the previous fixed width fields | ||
678 | * @offset: Cumulative size of the previous variable-width fields | ||
679 | * | ||
680 | * Because many of the VBLK fields are variable-width, it's necessary | ||
681 | * to calculate each offset based on the previous one and the length | ||
682 | * of the field it pointed to. | ||
683 | * | ||
684 | * Return: -1 Error, the calculated offset exceeded the size of the buffer | ||
685 | * n OK, a range-checked offset into buffer | ||
686 | */ | ||
687 | static int ldm_relative(const u8 *buffer, int buflen, int base, int offset) | ||
688 | { | ||
689 | |||
690 | base += offset; | ||
691 | if (!buffer || offset < 0 || base > buflen) { | ||
692 | if (!buffer) | ||
693 | ldm_error("!buffer"); | ||
694 | if (offset < 0) | ||
695 | ldm_error("offset (%d) < 0", offset); | ||
696 | if (base > buflen) | ||
697 | ldm_error("base (%d) > buflen (%d)", base, buflen); | ||
698 | return -1; | ||
699 | } | ||
700 | if (base + buffer[base] >= buflen) { | ||
701 | ldm_error("base (%d) + buffer[base] (%d) >= buflen (%d)", base, | ||
702 | buffer[base], buflen); | ||
703 | return -1; | ||
704 | } | ||
705 | return buffer[base] + offset + 1; | ||
706 | } | ||
707 | |||
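Concretely, each variable-width field begins with a one-byte length, and the value returned by one call becomes the @offset of the next, so the offsets chain through the record. A hypothetical fragment using the ldm_relative() defined above (the 0x18 header size matches the VBLK parsers below, but the 4- and 6-byte field lengths are made up for illustration):

    u8 vblk[0x18 + 1 + 4 + 1 + 6] = { 0 };
    int r1, r2;

    vblk[0x18] = 4;         /* length byte of the first variable-width field  */
    vblk[0x18 + 5] = 6;     /* length byte of the second variable-width field */

    r1 = ldm_relative(vblk, sizeof(vblk), 0x18, 0);   /* returns 4 + 0 + 1 = 5  */
    r2 = ldm_relative(vblk, sizeof(vblk), 0x18, r1);  /* returns 6 + 5 + 1 = 12 */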
708 | /** | ||
709 | * ldm_get_vnum - Convert a variable-width, big endian number, into cpu order | ||
710 | * @block: Pointer to the variable-width number to convert | ||
711 | * | ||
712 | * Large numbers in the LDM Database are often stored in a packed format. Each | ||
713 | * number is prefixed by a one byte width marker. All numbers in the database | ||
714 | * are stored in big-endian byte order. This function reads one of these | ||
715 | * numbers and returns the result. | ||
716 | * | ||
717 | * N.B. This function DOES NOT perform any range checking, though the most | ||
718 | * it will read is eight bytes. | ||
719 | * | ||
720 | * Return: n A number | ||
721 | * 0 Zero, or an error occurred | ||
722 | */ | ||
723 | static u64 ldm_get_vnum (const u8 *block) | ||
724 | { | ||
725 | u64 tmp = 0; | ||
726 | u8 length; | ||
727 | |||
728 | BUG_ON (!block); | ||
729 | |||
730 | length = *block++; | ||
731 | |||
732 | if (length && length <= 8) | ||
733 | while (length--) | ||
734 | tmp = (tmp << 8) | *block++; | ||
735 | else | ||
736 | ldm_error ("Illegal length %d.", length); | ||
737 | |||
738 | return tmp; | ||
739 | } | ||
740 | |||
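For example, the three bytes 02 01 F4 decode as a two-byte big-endian value, 0x01F4 = 500. The same loop in a stand-alone form (illustrative user-space C; like the kernel helper it does no range checking beyond the width byte):

    #include <stdint.h>

    static uint64_t get_vnum(const uint8_t *block)
    {
            uint64_t v = 0;
            uint8_t width = *block++;       /* one-byte width marker, 1..8 */

            if (width == 0 || width > 8)
                    return 0;               /* illegal width */
            while (width--)
                    v = (v << 8) | *block++;
            return v;
    }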
741 | /** | ||
742 | * ldm_get_vstr - Read a length-prefixed string into a buffer | ||
743 | * @block: Pointer to the length marker | ||
744 | * @buffer: Location to copy string to | ||
745 | * @buflen: Size of the output buffer | ||
746 | * | ||
747 | * Many of the strings in the LDM Database are not NULL terminated. Instead | ||
748 | * they are prefixed by a one byte length marker. This function copies one of | ||
749 | * these strings into a buffer. | ||
750 | * | ||
751 | * N.B. This function DOES NOT perform any range checking on the input. | ||
752 | * If the buffer is too small, the output will be truncated. | ||
753 | * | ||
754 | * Return: 0, Error and @buffer contents are undefined | ||
755 | * n, String length in characters (excluding NULL) | ||
756 | * buflen-1, String was truncated. | ||
757 | */ | ||
758 | static int ldm_get_vstr (const u8 *block, u8 *buffer, int buflen) | ||
759 | { | ||
760 | int length; | ||
761 | |||
762 | BUG_ON (!block || !buffer); | ||
763 | |||
764 | length = block[0]; | ||
765 | if (length >= buflen) { | ||
766 | ldm_error ("Truncating string %d -> %d.", length, buflen); | ||
767 | length = buflen - 1; | ||
768 | } | ||
769 | memcpy (buffer, block + 1, length); | ||
770 | buffer[length] = 0; | ||
771 | return length; | ||
772 | } | ||
773 | |||
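A worked example of the string format: with a 16-byte @buffer, the input bytes 06 'c' 'o' 'n' 'f' 'i' 'g' copy as the NUL-terminated string "config" and the call returns 6; a length byte of 20 would be truncated to 15 copied bytes (and logged), returning buflen - 1 = 15.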
774 | |||
775 | /** | ||
776 | * ldm_parse_cmp3 - Read a raw VBLK Component object into a vblk structure | ||
777 | * @buffer: Block of data being worked on | ||
778 | * @buflen: Size of the block of data | ||
779 | * @vb: In-memory vblk in which to return information | ||
780 | * | ||
781 | * Read a raw VBLK Component object (version 3) into a vblk structure. | ||
782 | * | ||
783 | * Return: 'true' @vb contains a Component VBLK | ||
784 | * 'false' @vb contents are not defined | ||
785 | */ | ||
786 | static bool ldm_parse_cmp3 (const u8 *buffer, int buflen, struct vblk *vb) | ||
787 | { | ||
788 | int r_objid, r_name, r_vstate, r_child, r_parent, r_stripe, r_cols, len; | ||
789 | struct vblk_comp *comp; | ||
790 | |||
791 | BUG_ON (!buffer || !vb); | ||
792 | |||
793 | r_objid = ldm_relative (buffer, buflen, 0x18, 0); | ||
794 | r_name = ldm_relative (buffer, buflen, 0x18, r_objid); | ||
795 | r_vstate = ldm_relative (buffer, buflen, 0x18, r_name); | ||
796 | r_child = ldm_relative (buffer, buflen, 0x1D, r_vstate); | ||
797 | r_parent = ldm_relative (buffer, buflen, 0x2D, r_child); | ||
798 | |||
799 | if (buffer[0x12] & VBLK_FLAG_COMP_STRIPE) { | ||
800 | r_stripe = ldm_relative (buffer, buflen, 0x2E, r_parent); | ||
801 | r_cols = ldm_relative (buffer, buflen, 0x2E, r_stripe); | ||
802 | len = r_cols; | ||
803 | } else { | ||
804 | r_stripe = 0; | ||
805 | r_cols = 0; | ||
806 | len = r_parent; | ||
807 | } | ||
808 | if (len < 0) | ||
809 | return false; | ||
810 | |||
811 | len += VBLK_SIZE_CMP3; | ||
812 | if (len != get_unaligned_be32(buffer + 0x14)) | ||
813 | return false; | ||
814 | |||
815 | comp = &vb->vblk.comp; | ||
816 | ldm_get_vstr (buffer + 0x18 + r_name, comp->state, | ||
817 | sizeof (comp->state)); | ||
818 | comp->type = buffer[0x18 + r_vstate]; | ||
819 | comp->children = ldm_get_vnum (buffer + 0x1D + r_vstate); | ||
820 | comp->parent_id = ldm_get_vnum (buffer + 0x2D + r_child); | ||
821 | comp->chunksize = r_stripe ? ldm_get_vnum (buffer+r_parent+0x2E) : 0; | ||
822 | |||
823 | return true; | ||
824 | } | ||
825 | |||
826 | /** | ||
827 | * ldm_parse_dgr3 - Read a raw VBLK Disk Group object into a vblk structure | ||
828 | * @buffer: Block of data being worked on | ||
829 | * @buflen: Size of the block of data | ||
830 | * @vb: In-memory vblk in which to return information | ||
831 | * | ||
832 | * Read a raw VBLK Disk Group object (version 3) into a vblk structure. | ||
833 | * | ||
834 | * Return: 'true' @vb contains a Disk Group VBLK | ||
835 | * 'false' @vb contents are not defined | ||
836 | */ | ||
837 | static bool ldm_parse_dgr3 (const u8 *buffer, int buflen, struct vblk *vb) | ||
838 | { | ||
839 | int r_objid, r_name, r_diskid, r_id1, r_id2, len; | ||
840 | struct vblk_dgrp *dgrp; | ||
841 | |||
842 | BUG_ON (!buffer || !vb); | ||
843 | |||
844 | r_objid = ldm_relative (buffer, buflen, 0x18, 0); | ||
845 | r_name = ldm_relative (buffer, buflen, 0x18, r_objid); | ||
846 | r_diskid = ldm_relative (buffer, buflen, 0x18, r_name); | ||
847 | |||
848 | if (buffer[0x12] & VBLK_FLAG_DGR3_IDS) { | ||
849 | r_id1 = ldm_relative (buffer, buflen, 0x24, r_diskid); | ||
850 | r_id2 = ldm_relative (buffer, buflen, 0x24, r_id1); | ||
851 | len = r_id2; | ||
852 | } else { | ||
853 | r_id1 = 0; | ||
854 | r_id2 = 0; | ||
855 | len = r_diskid; | ||
856 | } | ||
857 | if (len < 0) | ||
858 | return false; | ||
859 | |||
860 | len += VBLK_SIZE_DGR3; | ||
861 | if (len != get_unaligned_be32(buffer + 0x14)) | ||
862 | return false; | ||
863 | |||
864 | dgrp = &vb->vblk.dgrp; | ||
865 | ldm_get_vstr (buffer + 0x18 + r_name, dgrp->disk_id, | ||
866 | sizeof (dgrp->disk_id)); | ||
867 | return true; | ||
868 | } | ||
869 | |||
870 | /** | ||
871 | * ldm_parse_dgr4 - Read a raw VBLK Disk Group object into a vblk structure | ||
872 | * @buffer: Block of data being worked on | ||
873 | * @buflen: Size of the block of data | ||
874 | * @vb: In-memory vblk in which to return information | ||
875 | * | ||
876 | * Read a raw VBLK Disk Group object (version 4) into a vblk structure. | ||
877 | * | ||
878 | * Return: 'true' @vb contains a Disk Group VBLK | ||
879 | * 'false' @vb contents are not defined | ||
880 | */ | ||
881 | static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb) | ||
882 | { | ||
883 | char buf[64]; | ||
884 | int r_objid, r_name, r_id1, r_id2, len; | ||
885 | struct vblk_dgrp *dgrp; | ||
886 | |||
887 | BUG_ON (!buffer || !vb); | ||
888 | |||
889 | r_objid = ldm_relative (buffer, buflen, 0x18, 0); | ||
890 | r_name = ldm_relative (buffer, buflen, 0x18, r_objid); | ||
891 | |||
892 | if (buffer[0x12] & VBLK_FLAG_DGR4_IDS) { | ||
893 | r_id1 = ldm_relative (buffer, buflen, 0x44, r_name); | ||
894 | r_id2 = ldm_relative (buffer, buflen, 0x44, r_id1); | ||
895 | len = r_id2; | ||
896 | } else { | ||
897 | r_id1 = 0; | ||
898 | r_id2 = 0; | ||
899 | len = r_name; | ||
900 | } | ||
901 | if (len < 0) | ||
902 | return false; | ||
903 | |||
904 | len += VBLK_SIZE_DGR4; | ||
905 | if (len != get_unaligned_be32(buffer + 0x14)) | ||
906 | return false; | ||
907 | |||
908 | dgrp = &vb->vblk.dgrp; | ||
909 | |||
910 | ldm_get_vstr (buffer + 0x18 + r_objid, buf, sizeof (buf)); | ||
911 | return true; | ||
912 | } | ||
913 | |||
914 | /** | ||
915 | * ldm_parse_dsk3 - Read a raw VBLK Disk object into a vblk structure | ||
916 | * @buffer: Block of data being worked on | ||
917 | * @buflen: Size of the block of data | ||
918 | * @vb: In-memory vblk in which to return information | ||
919 | * | ||
920 | * Read a raw VBLK Disk object (version 3) into a vblk structure. | ||
921 | * | ||
922 | * Return: 'true' @vb contains a Disk VBLK | ||
923 | * 'false' @vb contents are not defined | ||
924 | */ | ||
925 | static bool ldm_parse_dsk3 (const u8 *buffer, int buflen, struct vblk *vb) | ||
926 | { | ||
927 | int r_objid, r_name, r_diskid, r_altname, len; | ||
928 | struct vblk_disk *disk; | ||
929 | |||
930 | BUG_ON (!buffer || !vb); | ||
931 | |||
932 | r_objid = ldm_relative (buffer, buflen, 0x18, 0); | ||
933 | r_name = ldm_relative (buffer, buflen, 0x18, r_objid); | ||
934 | r_diskid = ldm_relative (buffer, buflen, 0x18, r_name); | ||
935 | r_altname = ldm_relative (buffer, buflen, 0x18, r_diskid); | ||
936 | len = r_altname; | ||
937 | if (len < 0) | ||
938 | return false; | ||
939 | |||
940 | len += VBLK_SIZE_DSK3; | ||
941 | if (len != get_unaligned_be32(buffer + 0x14)) | ||
942 | return false; | ||
943 | |||
944 | disk = &vb->vblk.disk; | ||
945 | ldm_get_vstr (buffer + 0x18 + r_diskid, disk->alt_name, | ||
946 | sizeof (disk->alt_name)); | ||
947 | if (!ldm_parse_guid (buffer + 0x19 + r_name, disk->disk_id)) | ||
948 | return false; | ||
949 | |||
950 | return true; | ||
951 | } | ||
952 | |||
953 | /** | ||
954 | * ldm_parse_dsk4 - Read a raw VBLK Disk object into a vblk structure | ||
955 | * @buffer: Block of data being worked on | ||
956 | * @buflen: Size of the block of data | ||
957 | * @vb: In-memory vblk in which to return information | ||
958 | * | ||
959 | * Read a raw VBLK Disk object (version 4) into a vblk structure. | ||
960 | * | ||
961 | * Return: 'true' @vb contains a Disk VBLK | ||
962 | * 'false' @vb contents are not defined | ||
963 | */ | ||
964 | static bool ldm_parse_dsk4 (const u8 *buffer, int buflen, struct vblk *vb) | ||
965 | { | ||
966 | int r_objid, r_name, len; | ||
967 | struct vblk_disk *disk; | ||
968 | |||
969 | BUG_ON (!buffer || !vb); | ||
970 | |||
971 | r_objid = ldm_relative (buffer, buflen, 0x18, 0); | ||
972 | r_name = ldm_relative (buffer, buflen, 0x18, r_objid); | ||
973 | len = r_name; | ||
974 | if (len < 0) | ||
975 | return false; | ||
976 | |||
977 | len += VBLK_SIZE_DSK4; | ||
978 | if (len != get_unaligned_be32(buffer + 0x14)) | ||
979 | return false; | ||
980 | |||
981 | disk = &vb->vblk.disk; | ||
982 | memcpy (disk->disk_id, buffer + 0x18 + r_name, GUID_SIZE); | ||
983 | return true; | ||
984 | } | ||
985 | |||
986 | /** | ||
987 | * ldm_parse_prt3 - Read a raw VBLK Partition object into a vblk structure | ||
988 | * @buffer: Block of data being worked on | ||
989 | * @buflen: Size of the block of data | ||
990 | * @vb: In-memory vblk in which to return information | ||
991 | * | ||
992 | * Read a raw VBLK Partition object (version 3) into a vblk structure. | ||
993 | * | ||
994 | * Return: 'true' @vb contains a Partition VBLK | ||
995 | * 'false' @vb contents are not defined | ||
996 | */ | ||
997 | static bool ldm_parse_prt3(const u8 *buffer, int buflen, struct vblk *vb) | ||
998 | { | ||
999 | int r_objid, r_name, r_size, r_parent, r_diskid, r_index, len; | ||
1000 | struct vblk_part *part; | ||
1001 | |||
1002 | BUG_ON(!buffer || !vb); | ||
1003 | r_objid = ldm_relative(buffer, buflen, 0x18, 0); | ||
1004 | if (r_objid < 0) { | ||
1005 | ldm_error("r_objid %d < 0", r_objid); | ||
1006 | return false; | ||
1007 | } | ||
1008 | r_name = ldm_relative(buffer, buflen, 0x18, r_objid); | ||
1009 | if (r_name < 0) { | ||
1010 | ldm_error("r_name %d < 0", r_name); | ||
1011 | return false; | ||
1012 | } | ||
1013 | r_size = ldm_relative(buffer, buflen, 0x34, r_name); | ||
1014 | if (r_size < 0) { | ||
1015 | ldm_error("r_size %d < 0", r_size); | ||
1016 | return false; | ||
1017 | } | ||
1018 | r_parent = ldm_relative(buffer, buflen, 0x34, r_size); | ||
1019 | if (r_parent < 0) { | ||
1020 | ldm_error("r_parent %d < 0", r_parent); | ||
1021 | return false; | ||
1022 | } | ||
1023 | r_diskid = ldm_relative(buffer, buflen, 0x34, r_parent); | ||
1024 | if (r_diskid < 0) { | ||
1025 | ldm_error("r_diskid %d < 0", r_diskid); | ||
1026 | return false; | ||
1027 | } | ||
1028 | if (buffer[0x12] & VBLK_FLAG_PART_INDEX) { | ||
1029 | r_index = ldm_relative(buffer, buflen, 0x34, r_diskid); | ||
1030 | if (r_index < 0) { | ||
1031 | ldm_error("r_index %d < 0", r_index); | ||
1032 | return false; | ||
1033 | } | ||
1034 | len = r_index; | ||
1035 | } else { | ||
1036 | r_index = 0; | ||
1037 | len = r_diskid; | ||
1038 | } | ||
1039 | if (len < 0) { | ||
1040 | ldm_error("len %d < 0", len); | ||
1041 | return false; | ||
1042 | } | ||
1043 | len += VBLK_SIZE_PRT3; | ||
1044 | if (len > get_unaligned_be32(buffer + 0x14)) { | ||
1045 | ldm_error("len %d > BE32(buffer + 0x14) %d", len, | ||
1046 | get_unaligned_be32(buffer + 0x14)); | ||
1047 | return false; | ||
1048 | } | ||
1049 | part = &vb->vblk.part; | ||
1050 | part->start = get_unaligned_be64(buffer + 0x24 + r_name); | ||
1051 | part->volume_offset = get_unaligned_be64(buffer + 0x2C + r_name); | ||
1052 | part->size = ldm_get_vnum(buffer + 0x34 + r_name); | ||
1053 | part->parent_id = ldm_get_vnum(buffer + 0x34 + r_size); | ||
1054 | part->disk_id = ldm_get_vnum(buffer + 0x34 + r_parent); | ||
1055 | if (vb->flags & VBLK_FLAG_PART_INDEX) | ||
1056 | part->partnum = buffer[0x35 + r_diskid]; | ||
1057 | else | ||
1058 | part->partnum = 0; | ||
1059 | return true; | ||
1060 | } | ||
1061 | |||
1062 | /** | ||
1063 | * ldm_parse_vol5 - Read a raw VBLK Volume object into a vblk structure | ||
1064 | * @buffer: Block of data being worked on | ||
1065 | * @buflen: Size of the block of data | ||
1066 | * @vb: In-memory vblk in which to return information | ||
1067 | * | ||
1068 | * Read a raw VBLK Volume object (version 5) into a vblk structure. | ||
1069 | * | ||
1070 | * Return: 'true' @vb contains a Volume VBLK | ||
1071 | * 'false' @vb contents are not defined | ||
1072 | */ | ||
1073 | static bool ldm_parse_vol5(const u8 *buffer, int buflen, struct vblk *vb) | ||
1074 | { | ||
1075 | int r_objid, r_name, r_vtype, r_disable_drive_letter, r_child, r_size; | ||
1076 | int r_id1, r_id2, r_size2, r_drive, len; | ||
1077 | struct vblk_volu *volu; | ||
1078 | |||
1079 | BUG_ON(!buffer || !vb); | ||
1080 | r_objid = ldm_relative(buffer, buflen, 0x18, 0); | ||
1081 | if (r_objid < 0) { | ||
1082 | ldm_error("r_objid %d < 0", r_objid); | ||
1083 | return false; | ||
1084 | } | ||
1085 | r_name = ldm_relative(buffer, buflen, 0x18, r_objid); | ||
1086 | if (r_name < 0) { | ||
1087 | ldm_error("r_name %d < 0", r_name); | ||
1088 | return false; | ||
1089 | } | ||
1090 | r_vtype = ldm_relative(buffer, buflen, 0x18, r_name); | ||
1091 | if (r_vtype < 0) { | ||
1092 | ldm_error("r_vtype %d < 0", r_vtype); | ||
1093 | return false; | ||
1094 | } | ||
1095 | r_disable_drive_letter = ldm_relative(buffer, buflen, 0x18, r_vtype); | ||
1096 | if (r_disable_drive_letter < 0) { | ||
1097 | ldm_error("r_disable_drive_letter %d < 0", | ||
1098 | r_disable_drive_letter); | ||
1099 | return false; | ||
1100 | } | ||
1101 | r_child = ldm_relative(buffer, buflen, 0x2D, r_disable_drive_letter); | ||
1102 | if (r_child < 0) { | ||
1103 | ldm_error("r_child %d < 0", r_child); | ||
1104 | return false; | ||
1105 | } | ||
1106 | r_size = ldm_relative(buffer, buflen, 0x3D, r_child); | ||
1107 | if (r_size < 0) { | ||
1108 | ldm_error("r_size %d < 0", r_size); | ||
1109 | return false; | ||
1110 | } | ||
1111 | if (buffer[0x12] & VBLK_FLAG_VOLU_ID1) { | ||
1112 | r_id1 = ldm_relative(buffer, buflen, 0x52, r_size); | ||
1113 | if (r_id1 < 0) { | ||
1114 | ldm_error("r_id1 %d < 0", r_id1); | ||
1115 | return false; | ||
1116 | } | ||
1117 | } else | ||
1118 | r_id1 = r_size; | ||
1119 | if (buffer[0x12] & VBLK_FLAG_VOLU_ID2) { | ||
1120 | r_id2 = ldm_relative(buffer, buflen, 0x52, r_id1); | ||
1121 | if (r_id2 < 0) { | ||
1122 | ldm_error("r_id2 %d < 0", r_id2); | ||
1123 | return false; | ||
1124 | } | ||
1125 | } else | ||
1126 | r_id2 = r_id1; | ||
1127 | if (buffer[0x12] & VBLK_FLAG_VOLU_SIZE) { | ||
1128 | r_size2 = ldm_relative(buffer, buflen, 0x52, r_id2); | ||
1129 | if (r_size2 < 0) { | ||
1130 | ldm_error("r_size2 %d < 0", r_size2); | ||
1131 | return false; | ||
1132 | } | ||
1133 | } else | ||
1134 | r_size2 = r_id2; | ||
1135 | if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) { | ||
1136 | r_drive = ldm_relative(buffer, buflen, 0x52, r_size2); | ||
1137 | if (r_drive < 0) { | ||
1138 | ldm_error("r_drive %d < 0", r_drive); | ||
1139 | return false; | ||
1140 | } | ||
1141 | } else | ||
1142 | r_drive = r_size2; | ||
1143 | len = r_drive; | ||
1144 | if (len < 0) { | ||
1145 | ldm_error("len %d < 0", len); | ||
1146 | return false; | ||
1147 | } | ||
1148 | len += VBLK_SIZE_VOL5; | ||
1149 | if (len > get_unaligned_be32(buffer + 0x14)) { | ||
1150 | ldm_error("len %d > BE32(buffer + 0x14) %d", len, | ||
1151 | get_unaligned_be32(buffer + 0x14)); | ||
1152 | return false; | ||
1153 | } | ||
1154 | volu = &vb->vblk.volu; | ||
1155 | ldm_get_vstr(buffer + 0x18 + r_name, volu->volume_type, | ||
1156 | sizeof(volu->volume_type)); | ||
1157 | memcpy(volu->volume_state, buffer + 0x18 + r_disable_drive_letter, | ||
1158 | sizeof(volu->volume_state)); | ||
1159 | volu->size = ldm_get_vnum(buffer + 0x3D + r_child); | ||
1160 | volu->partition_type = buffer[0x41 + r_size]; | ||
1161 | memcpy(volu->guid, buffer + 0x42 + r_size, sizeof(volu->guid)); | ||
1162 | if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) { | ||
1163 | ldm_get_vstr(buffer + 0x52 + r_size, volu->drive_hint, | ||
1164 | sizeof(volu->drive_hint)); | ||
1165 | } | ||
1166 | return true; | ||
1167 | } | ||
1168 | |||
1169 | /** | ||
1170 | * ldm_parse_vblk - Read a raw VBLK object into a vblk structure | ||
1171 | * @buf: Block of data being worked on | ||
1172 | * @len: Size of the block of data | ||
1173 | * @vb: In-memory vblk in which to return information | ||
1174 | * | ||
1175 | * Read a raw VBLK object into a vblk structure. This function just reads the | ||
1176 | * information common to all VBLK types, then delegates the rest of the work to | ||
1177 | * helper functions: ldm_parse_*. | ||
1178 | * | ||
1179 | * Return: 'true' @vb contains a VBLK | ||
1180 | * 'false' @vb contents are not defined | ||
1181 | */ | ||
1182 | static bool ldm_parse_vblk (const u8 *buf, int len, struct vblk *vb) | ||
1183 | { | ||
1184 | bool result = false; | ||
1185 | int r_objid; | ||
1186 | |||
1187 | BUG_ON (!buf || !vb); | ||
1188 | |||
1189 | r_objid = ldm_relative (buf, len, 0x18, 0); | ||
1190 | if (r_objid < 0) { | ||
1191 | ldm_error ("VBLK header is corrupt."); | ||
1192 | return false; | ||
1193 | } | ||
1194 | |||
1195 | vb->flags = buf[0x12]; | ||
1196 | vb->type = buf[0x13]; | ||
1197 | vb->obj_id = ldm_get_vnum (buf + 0x18); | ||
1198 | ldm_get_vstr (buf+0x18+r_objid, vb->name, sizeof (vb->name)); | ||
1199 | |||
1200 | switch (vb->type) { | ||
1201 | case VBLK_CMP3: result = ldm_parse_cmp3 (buf, len, vb); break; | ||
1202 | case VBLK_DSK3: result = ldm_parse_dsk3 (buf, len, vb); break; | ||
1203 | case VBLK_DSK4: result = ldm_parse_dsk4 (buf, len, vb); break; | ||
1204 | case VBLK_DGR3: result = ldm_parse_dgr3 (buf, len, vb); break; | ||
1205 | case VBLK_DGR4: result = ldm_parse_dgr4 (buf, len, vb); break; | ||
1206 | case VBLK_PRT3: result = ldm_parse_prt3 (buf, len, vb); break; | ||
1207 | case VBLK_VOL5: result = ldm_parse_vol5 (buf, len, vb); break; | ||
1208 | } | ||
1209 | |||
1210 | if (result) | ||
1211 | ldm_debug ("Parsed VBLK 0x%llx (type: 0x%02x) ok.", | ||
1212 | (unsigned long long) vb->obj_id, vb->type); | ||
1213 | else | ||
1214 | ldm_error ("Failed to parse VBLK 0x%llx (type: 0x%02x).", | ||
1215 | (unsigned long long) vb->obj_id, vb->type); | ||
1216 | |||
1217 | return result; | ||
1218 | } | ||
1219 | |||
1220 | |||
1221 | /** | ||
1222 | * ldm_ldmdb_add - Adds a raw VBLK entry to the ldmdb database | ||
1223 | * @data: Raw VBLK to add to the database | ||
1224 | * @len: Size of the raw VBLK | ||
1225 | * @ldb: Cache of the database structures | ||
1226 | * | ||
1227 | * The VBLKs are sorted into categories. Partitions are also sorted by offset. | ||
1228 | * | ||
1229 | * N.B. This function does not check the validity of the VBLKs. | ||
1230 | * | ||
1231 | * Return: 'true' The VBLK was added | ||
1232 | * 'false' An error occurred | ||
1233 | */ | ||
1234 | static bool ldm_ldmdb_add (u8 *data, int len, struct ldmdb *ldb) | ||
1235 | { | ||
1236 | struct vblk *vb; | ||
1237 | struct list_head *item; | ||
1238 | |||
1239 | BUG_ON (!data || !ldb); | ||
1240 | |||
1241 | vb = kmalloc (sizeof (*vb), GFP_KERNEL); | ||
1242 | if (!vb) { | ||
1243 | ldm_crit ("Out of memory."); | ||
1244 | return false; | ||
1245 | } | ||
1246 | |||
1247 | if (!ldm_parse_vblk (data, len, vb)) { | ||
1248 | kfree(vb); | ||
1249 | return false; /* Already logged */ | ||
1250 | } | ||
1251 | |||
1252 | /* Put vblk into the correct list. */ | ||
1253 | switch (vb->type) { | ||
1254 | case VBLK_DGR3: | ||
1255 | case VBLK_DGR4: | ||
1256 | list_add (&vb->list, &ldb->v_dgrp); | ||
1257 | break; | ||
1258 | case VBLK_DSK3: | ||
1259 | case VBLK_DSK4: | ||
1260 | list_add (&vb->list, &ldb->v_disk); | ||
1261 | break; | ||
1262 | case VBLK_VOL5: | ||
1263 | list_add (&vb->list, &ldb->v_volu); | ||
1264 | break; | ||
1265 | case VBLK_CMP3: | ||
1266 | list_add (&vb->list, &ldb->v_comp); | ||
1267 | break; | ||
1268 | case VBLK_PRT3: | ||
1269 | /* Sort by the partition's start sector. */ | ||
1270 | list_for_each (item, &ldb->v_part) { | ||
1271 | struct vblk *v = list_entry (item, struct vblk, list); | ||
1272 | if ((v->vblk.part.disk_id == vb->vblk.part.disk_id) && | ||
1273 | (v->vblk.part.start > vb->vblk.part.start)) { | ||
1274 | list_add_tail (&vb->list, &v->list); | ||
1275 | return true; | ||
1276 | } | ||
1277 | } | ||
1278 | list_add_tail (&vb->list, &ldb->v_part); | ||
1279 | break; | ||
1280 | } | ||
1281 | return true; | ||
1282 | } | ||
1283 | |||
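The VBLK_PRT3 branch keeps the partition list ordered by start sector: it walks @v_part and uses list_add_tail() to insert the new entry immediately before the first partition of the same disk that starts later, falling back to the tail of the list. The same idea on a plain singly-linked list, as a minimal sketch (user-space C; the struct and field names are assumptions for illustration):

    #include <stdint.h>

    struct part_node {
            uint64_t disk_id;
            uint64_t start;
            struct part_node *next;
    };

    /* Insert np before the first node of the same disk with a larger start,
     * otherwise append at the end, keeping per-disk entries sorted. */
    static void part_insert_sorted(struct part_node **head, struct part_node *np)
    {
            struct part_node **pp = head;

            while (*pp) {
                    if ((*pp)->disk_id == np->disk_id && (*pp)->start > np->start)
                            break;
                    pp = &(*pp)->next;
            }
            np->next = *pp;
            *pp = np;
    }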
1284 | /** | ||
1285 | * ldm_frag_add - Add a VBLK fragment to a list | ||
1286 | * @data: Raw fragment to be added to the list | ||
1287 | * @size: Size of the raw fragment | ||
1288 | * @frags: Linked list of VBLK fragments | ||
1289 | * | ||
1290 | * Fragmented VBLKs may not be consecutive in the database, so they are kept | ||
1291 | * in a list until they can be pieced together later. | ||
1292 | * | ||
1293 | * Return: 'true' Success, the VBLK was added to the list | ||
1294 | * 'false' Error, a problem occurred | ||
1295 | */ | ||
1296 | static bool ldm_frag_add (const u8 *data, int size, struct list_head *frags) | ||
1297 | { | ||
1298 | struct frag *f; | ||
1299 | struct list_head *item; | ||
1300 | int rec, num, group; | ||
1301 | |||
1302 | BUG_ON (!data || !frags); | ||
1303 | |||
1304 | if (size < 2 * VBLK_SIZE_HEAD) { | ||
1305 | ldm_error("Value of size is to small."); | ||
1306 | return false; | ||
1307 | } | ||
1308 | |||
1309 | group = get_unaligned_be32(data + 0x08); | ||
1310 | rec = get_unaligned_be16(data + 0x0C); | ||
1311 | num = get_unaligned_be16(data + 0x0E); | ||
1312 | if ((num < 1) || (num > 4)) { | ||
1313 | ldm_error ("A VBLK claims to have %d parts.", num); | ||
1314 | return false; | ||
1315 | } | ||
1316 | if (rec >= num) { | ||
1317 | ldm_error("REC value (%d) exceeds NUM value (%d)", rec, num); | ||
1318 | return false; | ||
1319 | } | ||
1320 | |||
1321 | list_for_each (item, frags) { | ||
1322 | f = list_entry (item, struct frag, list); | ||
1323 | if (f->group == group) | ||
1324 | goto found; | ||
1325 | } | ||
1326 | |||
1327 | f = kmalloc (sizeof (*f) + size*num, GFP_KERNEL); | ||
1328 | if (!f) { | ||
1329 | ldm_crit ("Out of memory."); | ||
1330 | return false; | ||
1331 | } | ||
1332 | |||
1333 | f->group = group; | ||
1334 | f->num = num; | ||
1335 | f->rec = rec; | ||
1336 | f->map = 0xFF << num; | ||
1337 | |||
1338 | list_add_tail (&f->list, frags); | ||
1339 | found: | ||
1340 | if (rec >= f->num) { | ||
1341 | ldm_error("REC value (%d) exceeds NUM value (%d)", rec, f->num); | ||
1342 | return false; | ||
1343 | } | ||
1344 | if (f->map & (1 << rec)) { | ||
1345 | ldm_error ("Duplicate VBLK, part %d.", rec); | ||
1346 | f->map &= 0x7F; /* Mark the group as broken */ | ||
1347 | return false; | ||
1348 | } | ||
1349 | f->map |= (1 << rec); | ||
1350 | if (!rec) | ||
1351 | memcpy(f->data, data, VBLK_SIZE_HEAD); | ||
1352 | data += VBLK_SIZE_HEAD; | ||
1353 | size -= VBLK_SIZE_HEAD; | ||
1354 | memcpy(f->data + VBLK_SIZE_HEAD + rec * size, data, size); | ||
1355 | return true; | ||
1356 | } | ||
1357 | |||
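The @map byte is worth a concrete example: for a group of num records it starts as 0xFF << num truncated to 8 bits (num = 3 gives 0xF8), each arriving record rec sets bit rec, and the group is complete exactly when the map reaches 0xFF, which is what ldm_frag_commit() checks below. A small sketch of just that bookkeeping (illustrative user-space C; the struct and function names are assumptions):

    #include <stdbool.h>
    #include <stdint.h>

    struct frag_map {
            uint8_t num;    /* total number of records in the group (1..4) */
            uint8_t map;    /* bit n set once record n has been seen       */
    };

    static void frag_map_init(struct frag_map *m, uint8_t num)
    {
            m->num = num;
            m->map = 0xFF << num;           /* num = 3 -> 0xF8 */
    }

    static bool frag_map_add(struct frag_map *m, uint8_t rec)
    {
            if (rec >= m->num || (m->map & (1 << rec)))
                    return false;           /* out of range or duplicate */
            m->map |= 1 << rec;
            return true;
    }

    static bool frag_map_complete(const struct frag_map *m)
    {
            return m->map == 0xFF;          /* every record 0..num-1 arrived */
    }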
1358 | /** | ||
1359 | * ldm_frag_free - Free a linked list of VBLK fragments | ||
1360 | * @list: Linked list of fragments | ||
1361 | * | ||
1362 | * Free a linked list of VBLK fragments | ||
1363 | * | ||
1364 | * Return: none | ||
1365 | */ | ||
1366 | static void ldm_frag_free (struct list_head *list) | ||
1367 | { | ||
1368 | struct list_head *item, *tmp; | ||
1369 | |||
1370 | BUG_ON (!list); | ||
1371 | |||
1372 | list_for_each_safe (item, tmp, list) | ||
1373 | kfree (list_entry (item, struct frag, list)); | ||
1374 | } | ||
1375 | |||
1376 | /** | ||
1377 | * ldm_frag_commit - Validate fragmented VBLKs and add them to the database | ||
1378 | * @frags: Linked list of VBLK fragments | ||
1379 | * @ldb: Cache of the database structures | ||
1380 | * | ||
1381 | * Now that all the fragmented VBLKs have been collected, they must be added to | ||
1382 | * the database for later use. | ||
1383 | * | ||
1384 | * Return: 'true' All the fragments were added successfully | ||
1385 | * 'false' One or more of the fragments were invalid | ||
1386 | */ | ||
1387 | static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb) | ||
1388 | { | ||
1389 | struct frag *f; | ||
1390 | struct list_head *item; | ||
1391 | |||
1392 | BUG_ON (!frags || !ldb); | ||
1393 | |||
1394 | list_for_each (item, frags) { | ||
1395 | f = list_entry (item, struct frag, list); | ||
1396 | |||
1397 | if (f->map != 0xFF) { | ||
1398 | ldm_error ("VBLK group %d is incomplete (0x%02x).", | ||
1399 | f->group, f->map); | ||
1400 | return false; | ||
1401 | } | ||
1402 | |||
1403 | if (!ldm_ldmdb_add (f->data, f->num*ldb->vm.vblk_size, ldb)) | ||
1404 | return false; /* Already logged */ | ||
1405 | } | ||
1406 | return true; | ||
1407 | } | ||
1408 | |||
1409 | /** | ||
1410 | * ldm_get_vblks - Read the on-disk database of VBLKs into memory | ||
1411 | * @state: Partition check state including device holding the LDM Database | ||
1412 | * @base: Offset, into @state->bdev, of the database | ||
1413 | * @ldb: Cache of the database structures | ||
1414 | * | ||
1415 | * To use the information from the VBLKs, they need to be read from the disk, | ||
1416 | * unpacked and validated. We cache them in @ldb according to their type. | ||
1417 | * | ||
1418 | * Return: 'true' All the VBLKs were read successfully | ||
1419 | * 'false' An error occurred | ||
1420 | */ | ||
1421 | static bool ldm_get_vblks(struct parsed_partitions *state, unsigned long base, | ||
1422 | struct ldmdb *ldb) | ||
1423 | { | ||
1424 | int size, perbuf, skip, finish, s, v, recs; | ||
1425 | u8 *data = NULL; | ||
1426 | Sector sect; | ||
1427 | bool result = false; | ||
1428 | LIST_HEAD (frags); | ||
1429 | |||
1430 | BUG_ON(!state || !ldb); | ||
1431 | |||
1432 | size = ldb->vm.vblk_size; | ||
1433 | perbuf = 512 / size; | ||
1434 | skip = ldb->vm.vblk_offset >> 9; /* Bytes to sectors */ | ||
1435 | finish = (size * ldb->vm.last_vblk_seq) >> 9; | ||
1436 | |||
1437 | for (s = skip; s < finish; s++) { /* For each sector */ | ||
1438 | data = read_part_sector(state, base + OFF_VMDB + s, &sect); | ||
1439 | if (!data) { | ||
1440 | ldm_crit ("Disk read failed."); | ||
1441 | goto out; | ||
1442 | } | ||
1443 | |||
1444 | for (v = 0; v < perbuf; v++, data+=size) { /* For each vblk */ | ||
1445 | if (MAGIC_VBLK != get_unaligned_be32(data)) { | ||
1446 | ldm_error ("Expected to find a VBLK."); | ||
1447 | goto out; | ||
1448 | } | ||
1449 | |||
1450 | recs = get_unaligned_be16(data + 0x0E); /* Number of records */ | ||
1451 | if (recs == 1) { | ||
1452 | if (!ldm_ldmdb_add (data, size, ldb)) | ||
1453 | goto out; /* Already logged */ | ||
1454 | } else if (recs > 1) { | ||
1455 | if (!ldm_frag_add (data, size, &frags)) | ||
1456 | goto out; /* Already logged */ | ||
1457 | } | ||
1458 | /* else Record is not in use, ignore it. */ | ||
1459 | } | ||
1460 | put_dev_sector (sect); | ||
1461 | data = NULL; | ||
1462 | } | ||
1463 | |||
1464 | result = ldm_frag_commit (&frags, ldb); /* Failures, already logged */ | ||
1465 | out: | ||
1466 | if (data) | ||
1467 | put_dev_sector (sect); | ||
1468 | ldm_frag_free (&frags); | ||
1469 | |||
1470 | return result; | ||
1471 | } | ||
1472 | |||
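The loop bounds above are plain sector arithmetic: perbuf VBLKs fit in a 512-byte sector, the VBLKs begin vblk_offset >> 9 sectors into the VMDB area, and they end after (vblk_size * last_vblk_seq) >> 9 sectors. A worked example with hypothetical VMDB values (vblk_size = 128, vblk_offset = 512, last_vblk_seq = 64; not read from a real disk):

    #include <stdio.h>

    int main(void)
    {
            unsigned vblk_size = 128, vblk_offset = 512, last_vblk_seq = 64;

            unsigned perbuf = 512 / vblk_size;                  /* 4 VBLKs per sector */
            unsigned skip   = vblk_offset >> 9;                 /* first sector: 1    */
            unsigned finish = (vblk_size * last_vblk_seq) >> 9; /* loop stops at 16   */

            printf("read sectors %u..%u, %u VBLKs each\n", skip, finish - 1, perbuf);
            return 0;
    }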
1473 | /** | ||
1474 | * ldm_free_vblks - Free a linked list of vblk's | ||
1475 | * @lh: Head of a linked list of struct vblk | ||
1476 | * | ||
1477 | * Free a list of vblk's and free the memory used to maintain the list. | ||
1478 | * | ||
1479 | * Return: none | ||
1480 | */ | ||
1481 | static void ldm_free_vblks (struct list_head *lh) | ||
1482 | { | ||
1483 | struct list_head *item, *tmp; | ||
1484 | |||
1485 | BUG_ON (!lh); | ||
1486 | |||
1487 | list_for_each_safe (item, tmp, lh) | ||
1488 | kfree (list_entry (item, struct vblk, list)); | ||
1489 | } | ||
1490 | |||
1491 | |||
1492 | /** | ||
1493 | * ldm_partition - Find out whether a device is a dynamic disk and handle it | ||
1494 | * @state: Partition check state including device holding the LDM Database | ||
1495 | * | ||
1496 | * This determines whether the device @bdev is a dynamic disk and if so creates | ||
1497 | * the partitions necessary in the gendisk structure pointed to by @hd. | ||
1498 | * | ||
1499 | * We create a dummy device 1, which contains the LDM database, and then create | ||
1500 | * each partition described by the LDM database in sequence as devices 2+. For | ||
1501 | * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3, | ||
1502 | * and so on: the partitions that contain the actual data. | ||
1503 | * | ||
1504 | * Return: 1 Success, @state->bdev is a dynamic disk and we handled it | ||
1505 | * 0 Success, @state->bdev is not a dynamic disk | ||
1506 | * -1 An error occurred before enough information had been read | ||
1507 | * Or @state->bdev is a dynamic disk, but it may be corrupted | ||
1508 | */ | ||
1509 | int ldm_partition(struct parsed_partitions *state) | ||
1510 | { | ||
1511 | struct ldmdb *ldb; | ||
1512 | unsigned long base; | ||
1513 | int result = -1; | ||
1514 | |||
1515 | BUG_ON(!state); | ||
1516 | |||
1517 | /* Look for signs of a Dynamic Disk */ | ||
1518 | if (!ldm_validate_partition_table(state)) | ||
1519 | return 0; | ||
1520 | |||
1521 | ldb = kmalloc (sizeof (*ldb), GFP_KERNEL); | ||
1522 | if (!ldb) { | ||
1523 | ldm_crit ("Out of memory."); | ||
1524 | goto out; | ||
1525 | } | ||
1526 | |||
1527 | /* Parse and check privheads. */ | ||
1528 | if (!ldm_validate_privheads(state, &ldb->ph)) | ||
1529 | goto out; /* Already logged */ | ||
1530 | |||
1531 | /* All further references are relative to base (database start). */ | ||
1532 | base = ldb->ph.config_start; | ||
1533 | |||
1534 | /* Parse and check tocs and vmdb. */ | ||
1535 | if (!ldm_validate_tocblocks(state, base, ldb) || | ||
1536 | !ldm_validate_vmdb(state, base, ldb)) | ||
1537 | goto out; /* Already logged */ | ||
1538 | |||
1539 | /* Initialize vblk lists in ldmdb struct */ | ||
1540 | INIT_LIST_HEAD (&ldb->v_dgrp); | ||
1541 | INIT_LIST_HEAD (&ldb->v_disk); | ||
1542 | INIT_LIST_HEAD (&ldb->v_volu); | ||
1543 | INIT_LIST_HEAD (&ldb->v_comp); | ||
1544 | INIT_LIST_HEAD (&ldb->v_part); | ||
1545 | |||
1546 | if (!ldm_get_vblks(state, base, ldb)) { | ||
1547 | ldm_crit ("Failed to read the VBLKs from the database."); | ||
1548 | goto cleanup; | ||
1549 | } | ||
1550 | |||
1551 | /* Finally, create the data partition devices. */ | ||
1552 | if (ldm_create_data_partitions(state, ldb)) { | ||
1553 | ldm_debug ("Parsed LDM database successfully."); | ||
1554 | result = 1; | ||
1555 | } | ||
1556 | /* else Already logged */ | ||
1557 | |||
1558 | cleanup: | ||
1559 | ldm_free_vblks (&ldb->v_dgrp); | ||
1560 | ldm_free_vblks (&ldb->v_disk); | ||
1561 | ldm_free_vblks (&ldb->v_volu); | ||
1562 | ldm_free_vblks (&ldb->v_comp); | ||
1563 | ldm_free_vblks (&ldb->v_part); | ||
1564 | out: | ||
1565 | kfree (ldb); | ||
1566 | return result; | ||
1567 | } | ||
diff --git a/block/partitions/ldm.h b/block/partitions/ldm.h deleted file mode 100644 index 374242c0971..00000000000 --- a/block/partitions/ldm.h +++ /dev/null | |||
@@ -1,215 +0,0 @@ | |||
1 | /** | ||
2 | * ldm - Part of the Linux-NTFS project. | ||
3 | * | ||
4 | * Copyright (C) 2001,2002 Richard Russon <ldm@flatcap.org> | ||
5 | * Copyright (c) 2001-2007 Anton Altaparmakov | ||
6 | * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com> | ||
7 | * | ||
8 | * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or modify it | ||
11 | * under the terms of the GNU General Public License as published by the Free | ||
12 | * Software Foundation; either version 2 of the License, or (at your option) | ||
13 | * any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | * GNU General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public License | ||
21 | * along with this program (in the main directory of the Linux-NTFS source | ||
22 | * in the file COPYING); if not, write to the Free Software Foundation, | ||
23 | * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
24 | */ | ||
25 | |||
26 | #ifndef _FS_PT_LDM_H_ | ||
27 | #define _FS_PT_LDM_H_ | ||
28 | |||
29 | #include <linux/types.h> | ||
30 | #include <linux/list.h> | ||
31 | #include <linux/genhd.h> | ||
32 | #include <linux/fs.h> | ||
33 | #include <asm/unaligned.h> | ||
34 | #include <asm/byteorder.h> | ||
35 | |||
36 | struct parsed_partitions; | ||
37 | |||
38 | /* Magic numbers in CPU format. */ | ||
39 | #define MAGIC_VMDB 0x564D4442 /* VMDB */ | ||
40 | #define MAGIC_VBLK 0x56424C4B /* VBLK */ | ||
41 | #define MAGIC_PRIVHEAD 0x5052495648454144ULL /* PRIVHEAD */ | ||
42 | #define MAGIC_TOCBLOCK 0x544F43424C4F434BULL /* TOCBLOCK */ | ||
43 | |||
44 | /* The defined vblk types. */ | ||
45 | #define VBLK_VOL5 0x51 /* Volume, version 5 */ | ||
46 | #define VBLK_CMP3 0x32 /* Component, version 3 */ | ||
47 | #define VBLK_PRT3 0x33 /* Partition, version 3 */ | ||
48 | #define VBLK_DSK3 0x34 /* Disk, version 3 */ | ||
49 | #define VBLK_DSK4 0x44 /* Disk, version 4 */ | ||
50 | #define VBLK_DGR3 0x35 /* Disk Group, version 3 */ | ||
51 | #define VBLK_DGR4 0x45 /* Disk Group, version 4 */ | ||
52 | |||
53 | /* vblk flags indicating extra information will be present */ | ||
54 | #define VBLK_FLAG_COMP_STRIPE 0x10 | ||
55 | #define VBLK_FLAG_PART_INDEX 0x08 | ||
56 | #define VBLK_FLAG_DGR3_IDS 0x08 | ||
57 | #define VBLK_FLAG_DGR4_IDS 0x08 | ||
58 | #define VBLK_FLAG_VOLU_ID1 0x08 | ||
59 | #define VBLK_FLAG_VOLU_ID2 0x20 | ||
60 | #define VBLK_FLAG_VOLU_SIZE 0x80 | ||
61 | #define VBLK_FLAG_VOLU_DRIVE 0x02 | ||
62 | |||
63 | /* size of a vblk's static parts */ | ||
64 | #define VBLK_SIZE_HEAD 16 | ||
65 | #define VBLK_SIZE_CMP3 22 /* Name and version */ | ||
66 | #define VBLK_SIZE_DGR3 12 | ||
67 | #define VBLK_SIZE_DGR4 44 | ||
68 | #define VBLK_SIZE_DSK3 12 | ||
69 | #define VBLK_SIZE_DSK4 45 | ||
70 | #define VBLK_SIZE_PRT3 28 | ||
71 | #define VBLK_SIZE_VOL5 58 | ||
72 | |||
73 | /* component types */ | ||
74 | #define COMP_STRIPE 0x01 /* Stripe-set */ | ||
75 | #define COMP_BASIC 0x02 /* Basic disk */ | ||
76 | #define COMP_RAID 0x03 /* Raid-set */ | ||
77 | |||
78 | /* Other constants. */ | ||
79 | #define LDM_DB_SIZE 2048 /* Size in sectors (= 1MiB). */ | ||
80 | |||
81 | #define OFF_PRIV1 6 /* Offset of the first privhead | ||
82 | relative to the start of the | ||
83 | device in sectors */ | ||
84 | |||
85 | /* Offsets to structures within the LDM Database in sectors. */ | ||
86 | #define OFF_PRIV2 1856 /* Backup private headers. */ | ||
87 | #define OFF_PRIV3 2047 | ||
88 | |||
89 | #define OFF_TOCB1 1 /* Tables of contents. */ | ||
90 | #define OFF_TOCB2 2 | ||
91 | #define OFF_TOCB3 2045 | ||
92 | #define OFF_TOCB4 2046 | ||
93 | |||
94 | #define OFF_VMDB 17 /* List of partitions. */ | ||
95 | |||
96 | #define LDM_PARTITION 0x42 /* Formerly SFS (Landis). */ | ||
97 | |||
98 | #define TOC_BITMAP1 "config" /* Names of the two defined */ | ||
99 | #define TOC_BITMAP2 "log" /* bitmaps in the TOCBLOCK. */ | ||
100 | |||
101 | /* Borrowed from msdos.c */ | ||
102 | #define SYS_IND(p) (get_unaligned(&(p)->sys_ind)) | ||
103 | |||
104 | struct frag { /* VBLK Fragment handling */ | ||
105 | struct list_head list; | ||
106 | u32 group; | ||
107 | u8 num; /* Total number of records */ | ||
108 | u8 rec; /* This is record number n */ | ||
109 | u8 map; /* Which portions are in use */ | ||
110 | u8 data[0]; | ||
111 | }; | ||
112 | |||
113 | /* In memory LDM database structures. */ | ||
114 | |||
115 | #define GUID_SIZE 16 | ||
116 | |||
117 | struct privhead { /* Offsets and sizes are in sectors. */ | ||
118 | u16 ver_major; | ||
119 | u16 ver_minor; | ||
120 | u64 logical_disk_start; | ||
121 | u64 logical_disk_size; | ||
122 | u64 config_start; | ||
123 | u64 config_size; | ||
124 | u8 disk_id[GUID_SIZE]; | ||
125 | }; | ||
126 | |||
127 | struct tocblock { /* We have exactly two bitmaps. */ | ||
128 | u8 bitmap1_name[16]; | ||
129 | u64 bitmap1_start; | ||
130 | u64 bitmap1_size; | ||
131 | u8 bitmap2_name[16]; | ||
132 | u64 bitmap2_start; | ||
133 | u64 bitmap2_size; | ||
134 | }; | ||
135 | |||
136 | struct vmdb { /* VMDB: The database header */ | ||
137 | u16 ver_major; | ||
138 | u16 ver_minor; | ||
139 | u32 vblk_size; | ||
140 | u32 vblk_offset; | ||
141 | u32 last_vblk_seq; | ||
142 | }; | ||
143 | |||
144 | struct vblk_comp { /* VBLK Component */ | ||
145 | u8 state[16]; | ||
146 | u64 parent_id; | ||
147 | u8 type; | ||
148 | u8 children; | ||
149 | u16 chunksize; | ||
150 | }; | ||
151 | |||
152 | struct vblk_dgrp { /* VBLK Disk Group */ | ||
153 | u8 disk_id[64]; | ||
154 | }; | ||
155 | |||
156 | struct vblk_disk { /* VBLK Disk */ | ||
157 | u8 disk_id[GUID_SIZE]; | ||
158 | u8 alt_name[128]; | ||
159 | }; | ||
160 | |||
161 | struct vblk_part { /* VBLK Partition */ | ||
162 | u64 start; | ||
163 | u64 size; /* start, size and vol_off in sectors */ | ||
164 | u64 volume_offset; | ||
165 | u64 parent_id; | ||
166 | u64 disk_id; | ||
167 | u8 partnum; | ||
168 | }; | ||
169 | |||
170 | struct vblk_volu { /* VBLK Volume */ | ||
171 | u8 volume_type[16]; | ||
172 | u8 volume_state[16]; | ||
173 | u8 guid[16]; | ||
174 | u8 drive_hint[4]; | ||
175 | u64 size; | ||
176 | u8 partition_type; | ||
177 | }; | ||
178 | |||
179 | struct vblk_head { /* VBLK standard header */ | ||
180 | u32 group; | ||
181 | u16 rec; | ||
182 | u16 nrec; | ||
183 | }; | ||
184 | |||
185 | struct vblk { /* Generalised VBLK */ | ||
186 | u8 name[64]; | ||
187 | u64 obj_id; | ||
188 | u32 sequence; | ||
189 | u8 flags; | ||
190 | u8 type; | ||
191 | union { | ||
192 | struct vblk_comp comp; | ||
193 | struct vblk_dgrp dgrp; | ||
194 | struct vblk_disk disk; | ||
195 | struct vblk_part part; | ||
196 | struct vblk_volu volu; | ||
197 | } vblk; | ||
198 | struct list_head list; | ||
199 | }; | ||
200 | |||
201 | struct ldmdb { /* Cache of the database */ | ||
202 | struct privhead ph; | ||
203 | struct tocblock toc; | ||
204 | struct vmdb vm; | ||
205 | struct list_head v_dgrp; | ||
206 | struct list_head v_disk; | ||
207 | struct list_head v_volu; | ||
208 | struct list_head v_comp; | ||
209 | struct list_head v_part; | ||
210 | }; | ||
211 | |||
212 | int ldm_partition(struct parsed_partitions *state); | ||
213 | |||
214 | #endif /* _FS_PT_LDM_H_ */ | ||
215 | |||
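
The magic values above are simply the ASCII strings "VMDB", "VBLK", "PRIVHEAD" and "TOCBLOCK" read as big-endian integers, which is what "in CPU format" amounts to once the on-disk bytes have been byte-swapped. A minimal standalone sketch of that check, not taken from the kernel parser; read_be64() is a hypothetical helper, not a kernel API:

#include <stdio.h>

/* Assemble a big-endian 64-bit value from raw sector bytes (hypothetical helper). */
static unsigned long long read_be64(const unsigned char *p)
{
        unsigned long long v = 0;
        int i;

        for (i = 0; i < 8; i++)
                v = (v << 8) | p[i];
        return v;
}

int main(void)
{
        /* The first eight bytes of a PRIVHEAD sector are the ASCII magic. */
        const unsigned char sector[8] = { 'P','R','I','V','H','E','A','D' };

        if (read_be64(sector) == 0x5052495648454144ULL)   /* MAGIC_PRIVHEAD */
                printf("looks like an LDM private header\n");
        return 0;
}
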
diff --git a/block/partitions/mac.c b/block/partitions/mac.c deleted file mode 100644 index 11f688bd76c..00000000000 --- a/block/partitions/mac.c +++ /dev/null | |||
@@ -1,134 +0,0 @@ | |||
1 | /* | ||
2 | * fs/partitions/mac.c | ||
3 | * | ||
4 | * Code extracted from drivers/block/genhd.c | ||
5 | * Copyright (C) 1991-1998 Linus Torvalds | ||
6 | * Re-organised Feb 1998 Russell King | ||
7 | */ | ||
8 | |||
9 | #include <linux/ctype.h> | ||
10 | #include "check.h" | ||
11 | #include "mac.h" | ||
12 | |||
13 | #ifdef CONFIG_PPC_PMAC | ||
14 | #include <asm/machdep.h> | ||
15 | extern void note_bootable_part(dev_t dev, int part, int goodness); | ||
16 | #endif | ||
17 | |||
18 | /* | ||
19 | * Code to understand MacOS partition tables. | ||
20 | */ | ||
21 | |||
22 | static inline void mac_fix_string(char *stg, int len) | ||
23 | { | ||
24 | int i; | ||
25 | |||
26 | for (i = len - 1; i >= 0 && stg[i] == ' '; i--) | ||
27 | stg[i] = 0; | ||
28 | } | ||
29 | |||
30 | int mac_partition(struct parsed_partitions *state) | ||
31 | { | ||
32 | Sector sect; | ||
33 | unsigned char *data; | ||
34 | int slot, blocks_in_map; | ||
35 | unsigned secsize; | ||
36 | #ifdef CONFIG_PPC_PMAC | ||
37 | int found_root = 0; | ||
38 | int found_root_goodness = 0; | ||
39 | #endif | ||
40 | struct mac_partition *part; | ||
41 | struct mac_driver_desc *md; | ||
42 | |||
43 | /* Get 0th block and look at the first partition map entry. */ | ||
44 | md = read_part_sector(state, 0, §); | ||
45 | if (!md) | ||
46 | return -1; | ||
47 | if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) { | ||
48 | put_dev_sector(sect); | ||
49 | return 0; | ||
50 | } | ||
51 | secsize = be16_to_cpu(md->block_size); | ||
52 | put_dev_sector(sect); | ||
53 | data = read_part_sector(state, secsize/512, §); | ||
54 | if (!data) | ||
55 | return -1; | ||
56 | part = (struct mac_partition *) (data + secsize%512); | ||
57 | if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) { | ||
58 | put_dev_sector(sect); | ||
59 | return 0; /* not a MacOS disk */ | ||
60 | } | ||
61 | blocks_in_map = be32_to_cpu(part->map_count); | ||
62 | if (blocks_in_map < 0 || blocks_in_map >= DISK_MAX_PARTS) { | ||
63 | put_dev_sector(sect); | ||
64 | return 0; | ||
65 | } | ||
66 | strlcat(state->pp_buf, " [mac]", PAGE_SIZE); | ||
67 | for (slot = 1; slot <= blocks_in_map; ++slot) { | ||
68 | int pos = slot * secsize; | ||
69 | put_dev_sector(sect); | ||
70 | data = read_part_sector(state, pos/512, §); | ||
71 | if (!data) | ||
72 | return -1; | ||
73 | part = (struct mac_partition *) (data + pos%512); | ||
74 | if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) | ||
75 | break; | ||
76 | put_partition(state, slot, | ||
77 | be32_to_cpu(part->start_block) * (secsize/512), | ||
78 | be32_to_cpu(part->block_count) * (secsize/512)); | ||
79 | |||
80 | if (!strnicmp(part->type, "Linux_RAID", 10)) | ||
81 | state->parts[slot].flags = ADDPART_FLAG_RAID; | ||
82 | #ifdef CONFIG_PPC_PMAC | ||
83 | /* | ||
84 | * If this is the first bootable partition, tell the | ||
85 | * setup code, in case it wants to make this the root. | ||
86 | */ | ||
87 | if (machine_is(powermac)) { | ||
88 | int goodness = 0; | ||
89 | |||
90 | mac_fix_string(part->processor, 16); | ||
91 | mac_fix_string(part->name, 32); | ||
92 | mac_fix_string(part->type, 32); | ||
93 | |||
94 | if ((be32_to_cpu(part->status) & MAC_STATUS_BOOTABLE) | ||
95 | && strcasecmp(part->processor, "powerpc") == 0) | ||
96 | goodness++; | ||
97 | |||
98 | if (strcasecmp(part->type, "Apple_UNIX_SVR2") == 0 | ||
99 | || (strnicmp(part->type, "Linux", 5) == 0 | ||
100 | && strcasecmp(part->type, "Linux_swap") != 0)) { | ||
101 | int i, l; | ||
102 | |||
103 | goodness++; | ||
104 | l = strlen(part->name); | ||
105 | if (strcmp(part->name, "/") == 0) | ||
106 | goodness++; | ||
107 | for (i = 0; i <= l - 4; ++i) { | ||
108 | if (strnicmp(part->name + i, "root", | ||
109 | 4) == 0) { | ||
110 | goodness += 2; | ||
111 | break; | ||
112 | } | ||
113 | } | ||
114 | if (strnicmp(part->name, "swap", 4) == 0) | ||
115 | goodness--; | ||
116 | } | ||
117 | |||
118 | if (goodness > found_root_goodness) { | ||
119 | found_root = slot; | ||
120 | found_root_goodness = goodness; | ||
121 | } | ||
122 | } | ||
123 | #endif /* CONFIG_PPC_PMAC */ | ||
124 | } | ||
125 | #ifdef CONFIG_PPC_PMAC | ||
126 | if (found_root_goodness) | ||
127 | note_bootable_part(state->bdev->bd_dev, found_root, | ||
128 | found_root_goodness); | ||
129 | #endif | ||
130 | |||
131 | put_dev_sector(sect); | ||
132 | strlcat(state->pp_buf, "\n", PAGE_SIZE); | ||
133 | return 1; | ||
134 | } | ||
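
The loop in mac_partition() above indexes the partition map by map blocks: entry `slot` begins at byte slot * secsize, which the parser splits into a 512-byte sector number (pos/512) plus an offset inside that sector (pos%512). A small standalone sketch of that arithmetic with an illustrative block size:

#include <stdio.h>

/* Sketch of the offset arithmetic used by mac_partition(); the block size
 * below is illustrative, in a real run it comes from mac_driver_desc. */
int main(void)
{
        unsigned secsize = 2048;        /* mac_driver_desc.block_size */
        int slot;

        for (slot = 1; slot <= 3; slot++) {
                unsigned pos = slot * secsize;

                printf("slot %d -> sector %u, offset %u\n",
                       slot, pos / 512, pos % 512);
        }
        return 0;
}
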
diff --git a/block/partitions/mac.h b/block/partitions/mac.h deleted file mode 100644 index 3c7d9843638..00000000000 --- a/block/partitions/mac.h +++ /dev/null | |||
@@ -1,44 +0,0 @@ | |||
1 | /* | ||
2 | * fs/partitions/mac.h | ||
3 | */ | ||
4 | |||
5 | #define MAC_PARTITION_MAGIC 0x504d | ||
6 | |||
7 | /* type field value for A/UX or other Unix partitions */ | ||
8 | #define APPLE_AUX_TYPE "Apple_UNIX_SVR2" | ||
9 | |||
10 | struct mac_partition { | ||
11 | __be16 signature; /* expected to be MAC_PARTITION_MAGIC */ | ||
12 | __be16 res1; | ||
13 | __be32 map_count; /* # blocks in partition map */ | ||
14 | __be32 start_block; /* absolute starting block # of partition */ | ||
15 | __be32 block_count; /* number of blocks in partition */ | ||
16 | char name[32]; /* partition name */ | ||
17 | char type[32]; /* string type description */ | ||
18 | __be32 data_start; /* rel block # of first data block */ | ||
19 | __be32 data_count; /* number of data blocks */ | ||
20 | __be32 status; /* partition status bits */ | ||
21 | __be32 boot_start; | ||
22 | __be32 boot_size; | ||
23 | __be32 boot_load; | ||
24 | __be32 boot_load2; | ||
25 | __be32 boot_entry; | ||
26 | __be32 boot_entry2; | ||
27 | __be32 boot_cksum; | ||
28 | char processor[16]; /* identifies ISA of boot */ | ||
29 | /* there is more stuff after this that we don't need */ | ||
30 | }; | ||
31 | |||
32 | #define MAC_STATUS_BOOTABLE 8 /* partition is bootable */ | ||
33 | |||
34 | #define MAC_DRIVER_MAGIC 0x4552 | ||
35 | |||
36 | /* Driver descriptor structure, in block 0 */ | ||
37 | struct mac_driver_desc { | ||
38 | __be16 signature; /* expected to be MAC_DRIVER_MAGIC */ | ||
39 | __be16 block_size; | ||
40 | __be32 block_count; | ||
41 | /* ... more stuff */ | ||
42 | }; | ||
43 | |||
44 | int mac_partition(struct parsed_partitions *state); | ||
diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c deleted file mode 100644 index 8752a5d2656..00000000000 --- a/block/partitions/msdos.c +++ /dev/null | |||
@@ -1,569 +0,0 @@ | |||
1 | /* | ||
2 | * fs/partitions/msdos.c | ||
3 | * | ||
4 | * Code extracted from drivers/block/genhd.c | ||
5 | * Copyright (C) 1991-1998 Linus Torvalds | ||
6 | * | ||
7 | * Thanks to Branko Lankester, lankeste@fwi.uva.nl, who found a bug | ||
8 | * in the early extended-partition checks and added DM partitions | ||
9 | * | ||
10 | * Support for DiskManager v6.0x added by Mark Lord, | ||
11 | * with information provided by OnTrack. This now works for linux fdisk | ||
12 | * and LILO, as well as loadlin and bootln. Note that disks other than | ||
13 | * /dev/hda *must* have a "DOS" type 0x51 partition in the first slot (hda1). | ||
14 | * | ||
15 | * More flexible handling of extended partitions - aeb, 950831 | ||
16 | * | ||
17 | * Check partition table on IDE disks for common CHS translations | ||
18 | * | ||
19 | * Re-organised Feb 1998 Russell King | ||
20 | */ | ||
21 | #include <linux/msdos_fs.h> | ||
22 | |||
23 | #include "check.h" | ||
24 | #include "msdos.h" | ||
25 | #include "efi.h" | ||
26 | |||
27 | /* | ||
28 | * Many architectures don't like unaligned accesses, while | ||
29 | * the nr_sects and start_sect partition table entries are | ||
30 | * at a 2 (mod 4) address. | ||
31 | */ | ||
32 | #include <asm/unaligned.h> | ||
33 | |||
34 | #define SYS_IND(p) get_unaligned(&p->sys_ind) | ||
35 | |||
36 | static inline sector_t nr_sects(struct partition *p) | ||
37 | { | ||
38 | return (sector_t)get_unaligned_le32(&p->nr_sects); | ||
39 | } | ||
40 | |||
41 | static inline sector_t start_sect(struct partition *p) | ||
42 | { | ||
43 | return (sector_t)get_unaligned_le32(&p->start_sect); | ||
44 | } | ||
45 | |||
46 | static inline int is_extended_partition(struct partition *p) | ||
47 | { | ||
48 | return (SYS_IND(p) == DOS_EXTENDED_PARTITION || | ||
49 | SYS_IND(p) == WIN98_EXTENDED_PARTITION || | ||
50 | SYS_IND(p) == LINUX_EXTENDED_PARTITION); | ||
51 | } | ||
52 | |||
53 | #define MSDOS_LABEL_MAGIC1 0x55 | ||
54 | #define MSDOS_LABEL_MAGIC2 0xAA | ||
55 | |||
56 | static inline int | ||
57 | msdos_magic_present(unsigned char *p) | ||
58 | { | ||
59 | return (p[0] == MSDOS_LABEL_MAGIC1 && p[1] == MSDOS_LABEL_MAGIC2); | ||
60 | } | ||
61 | |||
62 | /* Value is EBCDIC 'IBMA' */ | ||
63 | #define AIX_LABEL_MAGIC1 0xC9 | ||
64 | #define AIX_LABEL_MAGIC2 0xC2 | ||
65 | #define AIX_LABEL_MAGIC3 0xD4 | ||
66 | #define AIX_LABEL_MAGIC4 0xC1 | ||
67 | static int aix_magic_present(struct parsed_partitions *state, unsigned char *p) | ||
68 | { | ||
69 | struct partition *pt = (struct partition *) (p + 0x1be); | ||
70 | Sector sect; | ||
71 | unsigned char *d; | ||
72 | int slot, ret = 0; | ||
73 | |||
74 | if (!(p[0] == AIX_LABEL_MAGIC1 && | ||
75 | p[1] == AIX_LABEL_MAGIC2 && | ||
76 | p[2] == AIX_LABEL_MAGIC3 && | ||
77 | p[3] == AIX_LABEL_MAGIC4)) | ||
78 | return 0; | ||
79 | /* Assume the partition table is valid if Linux partitions exist */ | ||
80 | for (slot = 1; slot <= 4; slot++, pt++) { | ||
81 | if (pt->sys_ind == LINUX_SWAP_PARTITION || | ||
82 | pt->sys_ind == LINUX_RAID_PARTITION || | ||
83 | pt->sys_ind == LINUX_DATA_PARTITION || | ||
84 | pt->sys_ind == LINUX_LVM_PARTITION || | ||
85 | is_extended_partition(pt)) | ||
86 | return 0; | ||
87 | } | ||
88 | d = read_part_sector(state, 7, §); | ||
89 | if (d) { | ||
90 | if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M') | ||
91 | ret = 1; | ||
92 | put_dev_sector(sect); | ||
93 | } | ||
94 | return ret; | ||
95 | } | ||
96 | |||
97 | static void set_info(struct parsed_partitions *state, int slot, | ||
98 | u32 disksig) | ||
99 | { | ||
100 | struct partition_meta_info *info = &state->parts[slot].info; | ||
101 | |||
102 | snprintf(info->uuid, sizeof(info->uuid), "%08x-%02x", disksig, | ||
103 | slot); | ||
104 | info->volname[0] = 0; | ||
105 | state->parts[slot].has_info = true; | ||
106 | } | ||
107 | |||
108 | /* | ||
109 | * Create devices for each logical partition in an extended partition. | ||
110 | * The logical partitions form a linked list, with each entry being | ||
111 | * a partition table with two entries. The first entry | ||
112 | * is the real data partition (with a start relative to the partition | ||
113 | * table start). The second is a pointer to the next logical partition | ||
114 | * (with a start relative to the entire extended partition). | ||
115 | * We do not create a Linux partition for the partition tables, but | ||
116 | * only for the actual data partitions. | ||
117 | */ | ||
118 | |||
119 | static void parse_extended(struct parsed_partitions *state, | ||
120 | sector_t first_sector, sector_t first_size, | ||
121 | u32 disksig) | ||
122 | { | ||
123 | struct partition *p; | ||
124 | Sector sect; | ||
125 | unsigned char *data; | ||
126 | sector_t this_sector, this_size; | ||
127 | sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; | ||
128 | int loopct = 0; /* number of links followed | ||
129 | without finding a data partition */ | ||
130 | int i; | ||
131 | |||
132 | this_sector = first_sector; | ||
133 | this_size = first_size; | ||
134 | |||
135 | while (1) { | ||
136 | if (++loopct > 100) | ||
137 | return; | ||
138 | if (state->next == state->limit) | ||
139 | return; | ||
140 | data = read_part_sector(state, this_sector, §); | ||
141 | if (!data) | ||
142 | return; | ||
143 | |||
144 | if (!msdos_magic_present(data + 510)) | ||
145 | goto done; | ||
146 | |||
147 | p = (struct partition *) (data + 0x1be); | ||
148 | |||
149 | /* | ||
150 | * Usually, the first entry is the real data partition, | ||
151 | * the 2nd entry is the next extended partition, or empty, | ||
152 | * and the 3rd and 4th entries are unused. | ||
153 | * However, DRDOS sometimes has the extended partition as | ||
154 | * the first entry (when the data partition is empty), | ||
155 | * and OS/2 seems to use all four entries. | ||
156 | */ | ||
157 | |||
158 | /* | ||
159 | * First process the data partition(s) | ||
160 | */ | ||
161 | for (i=0; i<4; i++, p++) { | ||
162 | sector_t offs, size, next; | ||
163 | if (!nr_sects(p) || is_extended_partition(p)) | ||
164 | continue; | ||
165 | |||
166 | /* Check the 3rd and 4th entries - | ||
167 | these sometimes contain random garbage */ | ||
168 | offs = start_sect(p)*sector_size; | ||
169 | size = nr_sects(p)*sector_size; | ||
170 | next = this_sector + offs; | ||
171 | if (i >= 2) { | ||
172 | if (offs + size > this_size) | ||
173 | continue; | ||
174 | if (next < first_sector) | ||
175 | continue; | ||
176 | if (next + size > first_sector + first_size) | ||
177 | continue; | ||
178 | } | ||
179 | |||
180 | put_partition(state, state->next, next, size); | ||
181 | set_info(state, state->next, disksig); | ||
182 | if (SYS_IND(p) == LINUX_RAID_PARTITION) | ||
183 | state->parts[state->next].flags = ADDPART_FLAG_RAID; | ||
184 | loopct = 0; | ||
185 | if (++state->next == state->limit) | ||
186 | goto done; | ||
187 | } | ||
188 | /* | ||
189 | * Next, process the (first) extended partition, if present. | ||
190 | * (So far, there seems to be no reason to make | ||
191 | * parse_extended() recursive and allow a tree | ||
192 | * of extended partitions.) | ||
193 | * It should be a link to the next logical partition. | ||
194 | */ | ||
195 | p -= 4; | ||
196 | for (i=0; i<4; i++, p++) | ||
197 | if (nr_sects(p) && is_extended_partition(p)) | ||
198 | break; | ||
199 | if (i == 4) | ||
200 | goto done; /* nothing left to do */ | ||
201 | |||
202 | this_sector = first_sector + start_sect(p) * sector_size; | ||
203 | this_size = nr_sects(p) * sector_size; | ||
204 | put_dev_sector(sect); | ||
205 | } | ||
206 | done: | ||
207 | put_dev_sector(sect); | ||
208 | } | ||
209 | |||
210 | /* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also | ||
211 | indicates linux swap. Be careful before believing this is Solaris. */ | ||
212 | |||
213 | static void parse_solaris_x86(struct parsed_partitions *state, | ||
214 | sector_t offset, sector_t size, int origin) | ||
215 | { | ||
216 | #ifdef CONFIG_SOLARIS_X86_PARTITION | ||
217 | Sector sect; | ||
218 | struct solaris_x86_vtoc *v; | ||
219 | int i; | ||
220 | short max_nparts; | ||
221 | |||
222 | v = read_part_sector(state, offset + 1, §); | ||
223 | if (!v) | ||
224 | return; | ||
225 | if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) { | ||
226 | put_dev_sector(sect); | ||
227 | return; | ||
228 | } | ||
229 | { | ||
230 | char tmp[1 + BDEVNAME_SIZE + 10 + 11 + 1]; | ||
231 | |||
232 | snprintf(tmp, sizeof(tmp), " %s%d: <solaris:", state->name, origin); | ||
233 | strlcat(state->pp_buf, tmp, PAGE_SIZE); | ||
234 | } | ||
235 | if (le32_to_cpu(v->v_version) != 1) { | ||
236 | char tmp[64]; | ||
237 | |||
238 | snprintf(tmp, sizeof(tmp), " cannot handle version %d vtoc>\n", | ||
239 | le32_to_cpu(v->v_version)); | ||
240 | strlcat(state->pp_buf, tmp, PAGE_SIZE); | ||
241 | put_dev_sector(sect); | ||
242 | return; | ||
243 | } | ||
244 | /* Ensure we can handle previous case of VTOC with 8 entries gracefully */ | ||
245 | max_nparts = le16_to_cpu (v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8; | ||
246 | for (i=0; i<max_nparts && state->next<state->limit; i++) { | ||
247 | struct solaris_x86_slice *s = &v->v_slice[i]; | ||
248 | char tmp[3 + 10 + 1 + 1]; | ||
249 | |||
250 | if (s->s_size == 0) | ||
251 | continue; | ||
252 | snprintf(tmp, sizeof(tmp), " [s%d]", i); | ||
253 | strlcat(state->pp_buf, tmp, PAGE_SIZE); | ||
254 | /* Solaris partitions are relative to the current MS-DOS | ||
255 | * one; we must add the offset of the current partition */ | ||
256 | put_partition(state, state->next++, | ||
257 | le32_to_cpu(s->s_start)+offset, | ||
258 | le32_to_cpu(s->s_size)); | ||
259 | } | ||
260 | put_dev_sector(sect); | ||
261 | strlcat(state->pp_buf, " >\n", PAGE_SIZE); | ||
262 | #endif | ||
263 | } | ||
264 | |||
265 | #if defined(CONFIG_BSD_DISKLABEL) | ||
266 | /* | ||
267 | * Create devices for BSD partitions listed in a disklabel, under a | ||
268 | * dos-like partition. See parse_extended() for more information. | ||
269 | */ | ||
270 | static void parse_bsd(struct parsed_partitions *state, | ||
271 | sector_t offset, sector_t size, int origin, char *flavour, | ||
272 | int max_partitions) | ||
273 | { | ||
274 | Sector sect; | ||
275 | struct bsd_disklabel *l; | ||
276 | struct bsd_partition *p; | ||
277 | char tmp[64]; | ||
278 | |||
279 | l = read_part_sector(state, offset + 1, §); | ||
280 | if (!l) | ||
281 | return; | ||
282 | if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) { | ||
283 | put_dev_sector(sect); | ||
284 | return; | ||
285 | } | ||
286 | |||
287 | snprintf(tmp, sizeof(tmp), " %s%d: <%s:", state->name, origin, flavour); | ||
288 | strlcat(state->pp_buf, tmp, PAGE_SIZE); | ||
289 | |||
290 | if (le16_to_cpu(l->d_npartitions) < max_partitions) | ||
291 | max_partitions = le16_to_cpu(l->d_npartitions); | ||
292 | for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) { | ||
293 | sector_t bsd_start, bsd_size; | ||
294 | |||
295 | if (state->next == state->limit) | ||
296 | break; | ||
297 | if (p->p_fstype == BSD_FS_UNUSED) | ||
298 | continue; | ||
299 | bsd_start = le32_to_cpu(p->p_offset); | ||
300 | bsd_size = le32_to_cpu(p->p_size); | ||
301 | if (offset == bsd_start && size == bsd_size) | ||
302 | /* full parent partition, we have it already */ | ||
303 | continue; | ||
304 | if (offset > bsd_start || offset+size < bsd_start+bsd_size) { | ||
305 | strlcat(state->pp_buf, "bad subpartition - ignored\n", PAGE_SIZE); | ||
306 | continue; | ||
307 | } | ||
308 | put_partition(state, state->next++, bsd_start, bsd_size); | ||
309 | } | ||
310 | put_dev_sector(sect); | ||
311 | if (le16_to_cpu(l->d_npartitions) > max_partitions) { | ||
312 | snprintf(tmp, sizeof(tmp), " (ignored %d more)", | ||
313 | le16_to_cpu(l->d_npartitions) - max_partitions); | ||
314 | strlcat(state->pp_buf, tmp, PAGE_SIZE); | ||
315 | } | ||
316 | strlcat(state->pp_buf, " >\n", PAGE_SIZE); | ||
317 | } | ||
318 | #endif | ||
319 | |||
320 | static void parse_freebsd(struct parsed_partitions *state, | ||
321 | sector_t offset, sector_t size, int origin) | ||
322 | { | ||
323 | #ifdef CONFIG_BSD_DISKLABEL | ||
324 | parse_bsd(state, offset, size, origin, "bsd", BSD_MAXPARTITIONS); | ||
325 | #endif | ||
326 | } | ||
327 | |||
328 | static void parse_netbsd(struct parsed_partitions *state, | ||
329 | sector_t offset, sector_t size, int origin) | ||
330 | { | ||
331 | #ifdef CONFIG_BSD_DISKLABEL | ||
332 | parse_bsd(state, offset, size, origin, "netbsd", BSD_MAXPARTITIONS); | ||
333 | #endif | ||
334 | } | ||
335 | |||
336 | static void parse_openbsd(struct parsed_partitions *state, | ||
337 | sector_t offset, sector_t size, int origin) | ||
338 | { | ||
339 | #ifdef CONFIG_BSD_DISKLABEL | ||
340 | parse_bsd(state, offset, size, origin, "openbsd", | ||
341 | OPENBSD_MAXPARTITIONS); | ||
342 | #endif | ||
343 | } | ||
344 | |||
345 | /* | ||
346 | * Create devices for Unixware partitions listed in a disklabel, under a | ||
347 | * dos-like partition. See parse_extended() for more information. | ||
348 | */ | ||
349 | static void parse_unixware(struct parsed_partitions *state, | ||
350 | sector_t offset, sector_t size, int origin) | ||
351 | { | ||
352 | #ifdef CONFIG_UNIXWARE_DISKLABEL | ||
353 | Sector sect; | ||
354 | struct unixware_disklabel *l; | ||
355 | struct unixware_slice *p; | ||
356 | |||
357 | l = read_part_sector(state, offset + 29, §); | ||
358 | if (!l) | ||
359 | return; | ||
360 | if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC || | ||
361 | le32_to_cpu(l->vtoc.v_magic) != UNIXWARE_DISKMAGIC2) { | ||
362 | put_dev_sector(sect); | ||
363 | return; | ||
364 | } | ||
365 | { | ||
366 | char tmp[1 + BDEVNAME_SIZE + 10 + 12 + 1]; | ||
367 | |||
368 | snprintf(tmp, sizeof(tmp), " %s%d: <unixware:", state->name, origin); | ||
369 | strlcat(state->pp_buf, tmp, PAGE_SIZE); | ||
370 | } | ||
371 | p = &l->vtoc.v_slice[1]; | ||
372 | /* I omit the 0th slice as it is the same as the whole disk. */ | ||
373 | while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) { | ||
374 | if (state->next == state->limit) | ||
375 | break; | ||
376 | |||
377 | if (p->s_label != UNIXWARE_FS_UNUSED) | ||
378 | put_partition(state, state->next++, | ||
379 | le32_to_cpu(p->start_sect), | ||
380 | le32_to_cpu(p->nr_sects)); | ||
381 | p++; | ||
382 | } | ||
383 | put_dev_sector(sect); | ||
384 | strlcat(state->pp_buf, " >\n", PAGE_SIZE); | ||
385 | #endif | ||
386 | } | ||
387 | |||
388 | /* | ||
389 | * Minix 2.0.0/2.0.2 subpartition support. | ||
390 | * Anand Krishnamurthy <anandk@wiproge.med.ge.com> | ||
391 | * Rajeev V. Pillai <rajeevvp@yahoo.com> | ||
392 | */ | ||
393 | static void parse_minix(struct parsed_partitions *state, | ||
394 | sector_t offset, sector_t size, int origin) | ||
395 | { | ||
396 | #ifdef CONFIG_MINIX_SUBPARTITION | ||
397 | Sector sect; | ||
398 | unsigned char *data; | ||
399 | struct partition *p; | ||
400 | int i; | ||
401 | |||
402 | data = read_part_sector(state, offset, §); | ||
403 | if (!data) | ||
404 | return; | ||
405 | |||
406 | p = (struct partition *)(data + 0x1be); | ||
407 | |||
408 | /* The first sector of a Minix partition can have either | ||
409 | * a secondary MBR describing its subpartitions, or | ||
410 | * the normal boot sector. */ | ||
411 | if (msdos_magic_present (data + 510) && | ||
412 | SYS_IND(p) == MINIX_PARTITION) { /* subpartition table present */ | ||
413 | char tmp[1 + BDEVNAME_SIZE + 10 + 9 + 1]; | ||
414 | |||
415 | snprintf(tmp, sizeof(tmp), " %s%d: <minix:", state->name, origin); | ||
416 | strlcat(state->pp_buf, tmp, PAGE_SIZE); | ||
417 | for (i = 0; i < MINIX_NR_SUBPARTITIONS; i++, p++) { | ||
418 | if (state->next == state->limit) | ||
419 | break; | ||
420 | /* add each partition in use */ | ||
421 | if (SYS_IND(p) == MINIX_PARTITION) | ||
422 | put_partition(state, state->next++, | ||
423 | start_sect(p), nr_sects(p)); | ||
424 | } | ||
425 | strlcat(state->pp_buf, " >\n", PAGE_SIZE); | ||
426 | } | ||
427 | put_dev_sector(sect); | ||
428 | #endif /* CONFIG_MINIX_SUBPARTITION */ | ||
429 | } | ||
430 | |||
431 | static struct { | ||
432 | unsigned char id; | ||
433 | void (*parse)(struct parsed_partitions *, sector_t, sector_t, int); | ||
434 | } subtypes[] = { | ||
435 | {FREEBSD_PARTITION, parse_freebsd}, | ||
436 | {NETBSD_PARTITION, parse_netbsd}, | ||
437 | {OPENBSD_PARTITION, parse_openbsd}, | ||
438 | {MINIX_PARTITION, parse_minix}, | ||
439 | {UNIXWARE_PARTITION, parse_unixware}, | ||
440 | {SOLARIS_X86_PARTITION, parse_solaris_x86}, | ||
441 | {NEW_SOLARIS_X86_PARTITION, parse_solaris_x86}, | ||
442 | {0, NULL}, | ||
443 | }; | ||
444 | |||
445 | int msdos_partition(struct parsed_partitions *state) | ||
446 | { | ||
447 | sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; | ||
448 | Sector sect; | ||
449 | unsigned char *data; | ||
450 | struct partition *p; | ||
451 | struct fat_boot_sector *fb; | ||
452 | int slot; | ||
453 | u32 disksig; | ||
454 | |||
455 | data = read_part_sector(state, 0, §); | ||
456 | if (!data) | ||
457 | return -1; | ||
458 | if (!msdos_magic_present(data + 510)) { | ||
459 | put_dev_sector(sect); | ||
460 | return 0; | ||
461 | } | ||
462 | |||
463 | if (aix_magic_present(state, data)) { | ||
464 | put_dev_sector(sect); | ||
465 | strlcat(state->pp_buf, " [AIX]", PAGE_SIZE); | ||
466 | return 0; | ||
467 | } | ||
468 | |||
469 | /* | ||
470 | * Now that the 55aa signature is present, this is probably | ||
471 | * either the boot sector of a FAT filesystem or a DOS-type | ||
472 | * partition table. Reject this in case the boot indicator | ||
473 | * is not 0 or 0x80. | ||
474 | */ | ||
475 | p = (struct partition *) (data + 0x1be); | ||
476 | for (slot = 1; slot <= 4; slot++, p++) { | ||
477 | if (p->boot_ind != 0 && p->boot_ind != 0x80) { | ||
478 | /* | ||
479 | * Even without a valid boot indicator value | ||
480 | * it's still possible this is a valid FAT filesystem | ||
481 | * without a partition table. | ||
482 | */ | ||
483 | fb = (struct fat_boot_sector *) data; | ||
484 | if (slot == 1 && fb->reserved && fb->fats | ||
485 | && fat_valid_media(fb->media)) { | ||
486 | strlcat(state->pp_buf, "\n", PAGE_SIZE); | ||
487 | put_dev_sector(sect); | ||
488 | return 1; | ||
489 | } else { | ||
490 | put_dev_sector(sect); | ||
491 | return 0; | ||
492 | } | ||
493 | } | ||
494 | } | ||
495 | |||
496 | #ifdef CONFIG_EFI_PARTITION | ||
497 | p = (struct partition *) (data + 0x1be); | ||
498 | for (slot = 1 ; slot <= 4 ; slot++, p++) { | ||
499 | /* If this is an EFI GPT disk, msdos should ignore it. */ | ||
500 | if (SYS_IND(p) == EFI_PMBR_OSTYPE_EFI_GPT) { | ||
501 | put_dev_sector(sect); | ||
502 | return 0; | ||
503 | } | ||
504 | } | ||
505 | #endif | ||
506 | p = (struct partition *) (data + 0x1be); | ||
507 | |||
508 | disksig = le32_to_cpup((__le32 *)(data + 0x1b8)); | ||
509 | |||
510 | /* | ||
511 | * Look for partitions in two passes: | ||
512 | * First find the primary and DOS-type extended partitions. | ||
513 | * On the second pass look inside *BSD, Unixware and Solaris partitions. | ||
514 | */ | ||
515 | |||
516 | state->next = 5; | ||
517 | for (slot = 1 ; slot <= 4 ; slot++, p++) { | ||
518 | sector_t start = start_sect(p)*sector_size; | ||
519 | sector_t size = nr_sects(p)*sector_size; | ||
520 | if (!size) | ||
521 | continue; | ||
522 | if (is_extended_partition(p)) { | ||
523 | /* | ||
524 | * prevent someone doing mkfs or mkswap on an | ||
525 | * extended partition, but leave room for LILO | ||
526 | * FIXME: this uses one logical sector for sector sizes > 512b, | ||
527 | * although that may not be enough/proper. | ||
528 | */ | ||
529 | sector_t n = 2; | ||
530 | n = min(size, max(sector_size, n)); | ||
531 | put_partition(state, slot, start, n); | ||
532 | |||
533 | strlcat(state->pp_buf, " <", PAGE_SIZE); | ||
534 | parse_extended(state, start, size, disksig); | ||
535 | strlcat(state->pp_buf, " >", PAGE_SIZE); | ||
536 | continue; | ||
537 | } | ||
538 | put_partition(state, slot, start, size); | ||
539 | set_info(state, slot, disksig); | ||
540 | if (SYS_IND(p) == LINUX_RAID_PARTITION) | ||
541 | state->parts[slot].flags = ADDPART_FLAG_RAID; | ||
542 | if (SYS_IND(p) == DM6_PARTITION) | ||
543 | strlcat(state->pp_buf, "[DM]", PAGE_SIZE); | ||
544 | if (SYS_IND(p) == EZD_PARTITION) | ||
545 | strlcat(state->pp_buf, "[EZD]", PAGE_SIZE); | ||
546 | } | ||
547 | |||
548 | strlcat(state->pp_buf, "\n", PAGE_SIZE); | ||
549 | |||
550 | /* second pass - output for each on a separate line */ | ||
551 | p = (struct partition *) (0x1be + data); | ||
552 | for (slot = 1 ; slot <= 4 ; slot++, p++) { | ||
553 | unsigned char id = SYS_IND(p); | ||
554 | int n; | ||
555 | |||
556 | if (!nr_sects(p)) | ||
557 | continue; | ||
558 | |||
559 | for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++) | ||
560 | ; | ||
561 | |||
562 | if (!subtypes[n].parse) | ||
563 | continue; | ||
564 | subtypes[n].parse(state, start_sect(p) * sector_size, | ||
565 | nr_sects(p) * sector_size, slot); | ||
566 | } | ||
567 | put_dev_sector(sect); | ||
568 | return 1; | ||
569 | } | ||
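
The linked-list layout described in the parse_extended() comment above is easy to lose track of: the data entry in each extended boot record is relative to that record, while the link entry is relative to the start of the whole extended partition. A standalone sketch of the walk over a synthetic two-record chain; the struct names and values are illustrative, not the kernel's:

#include <stdio.h>

/* Illustrative stand-in for one partition-table entry: only the fields the
 * walk needs, not the kernel's struct partition. All values are made up. */
struct entry {
        unsigned char sys_ind;  /* 0x05 would mark the "link" entry */
        unsigned start_sect;    /* relative start, in sectors */
        unsigned nr_sects;
};

struct ebr {                    /* one extended boot record */
        struct entry data;      /* logical partition, relative to this EBR */
        struct entry link;      /* next EBR, relative to the extended start */
};

int main(void)
{
        struct ebr chain[2] = {
                { { 0x83,   63, 1000 }, { 0x05, 2048, 1063 } },
                { { 0x83,   63, 1000 }, { 0x00,    0,    0 } },
        };
        unsigned ext_start = 10000;     /* start of the extended partition */
        unsigned this_ebr  = ext_start; /* the first EBR sits at its start */
        int i;

        for (i = 0; i < 2; i++) {
                struct ebr *e = &chain[i];

                printf("logical %d: start %u, size %u\n", i + 1,
                       this_ebr + e->data.start_sect, e->data.nr_sects);
                if (!e->link.nr_sects)
                        break;          /* no link entry: end of the chain */
                this_ebr = ext_start + e->link.start_sect;
        }
        return 0;
}
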
diff --git a/block/partitions/msdos.h b/block/partitions/msdos.h deleted file mode 100644 index 38c781c490b..00000000000 --- a/block/partitions/msdos.h +++ /dev/null | |||
@@ -1,8 +0,0 @@ | |||
1 | /* | ||
2 | * fs/partitions/msdos.h | ||
3 | */ | ||
4 | |||
5 | #define MSDOS_LABEL_MAGIC 0xAA55 | ||
6 | |||
7 | int msdos_partition(struct parsed_partitions *state); | ||
8 | |||
diff --git a/block/partitions/osf.c b/block/partitions/osf.c deleted file mode 100644 index 764b86a0196..00000000000 --- a/block/partitions/osf.c +++ /dev/null | |||
@@ -1,86 +0,0 @@ | |||
1 | /* | ||
2 | * fs/partitions/osf.c | ||
3 | * | ||
4 | * Code extracted from drivers/block/genhd.c | ||
5 | * | ||
6 | * Copyright (C) 1991-1998 Linus Torvalds | ||
7 | * Re-organised Feb 1998 Russell King | ||
8 | */ | ||
9 | |||
10 | #include "check.h" | ||
11 | #include "osf.h" | ||
12 | |||
13 | #define MAX_OSF_PARTITIONS 18 | ||
14 | |||
15 | int osf_partition(struct parsed_partitions *state) | ||
16 | { | ||
17 | int i; | ||
18 | int slot = 1; | ||
19 | unsigned int npartitions; | ||
20 | Sector sect; | ||
21 | unsigned char *data; | ||
22 | struct disklabel { | ||
23 | __le32 d_magic; | ||
24 | __le16 d_type,d_subtype; | ||
25 | u8 d_typename[16]; | ||
26 | u8 d_packname[16]; | ||
27 | __le32 d_secsize; | ||
28 | __le32 d_nsectors; | ||
29 | __le32 d_ntracks; | ||
30 | __le32 d_ncylinders; | ||
31 | __le32 d_secpercyl; | ||
32 | __le32 d_secprtunit; | ||
33 | __le16 d_sparespertrack; | ||
34 | __le16 d_sparespercyl; | ||
35 | __le32 d_acylinders; | ||
36 | __le16 d_rpm, d_interleave, d_trackskew, d_cylskew; | ||
37 | __le32 d_headswitch, d_trkseek, d_flags; | ||
38 | __le32 d_drivedata[5]; | ||
39 | __le32 d_spare[5]; | ||
40 | __le32 d_magic2; | ||
41 | __le16 d_checksum; | ||
42 | __le16 d_npartitions; | ||
43 | __le32 d_bbsize, d_sbsize; | ||
44 | struct d_partition { | ||
45 | __le32 p_size; | ||
46 | __le32 p_offset; | ||
47 | __le32 p_fsize; | ||
48 | u8 p_fstype; | ||
49 | u8 p_frag; | ||
50 | __le16 p_cpg; | ||
51 | } d_partitions[MAX_OSF_PARTITIONS]; | ||
52 | } * label; | ||
53 | struct d_partition * partition; | ||
54 | |||
55 | data = read_part_sector(state, 0, §); | ||
56 | if (!data) | ||
57 | return -1; | ||
58 | |||
59 | label = (struct disklabel *) (data+64); | ||
60 | partition = label->d_partitions; | ||
61 | if (le32_to_cpu(label->d_magic) != DISKLABELMAGIC) { | ||
62 | put_dev_sector(sect); | ||
63 | return 0; | ||
64 | } | ||
65 | if (le32_to_cpu(label->d_magic2) != DISKLABELMAGIC) { | ||
66 | put_dev_sector(sect); | ||
67 | return 0; | ||
68 | } | ||
69 | npartitions = le16_to_cpu(label->d_npartitions); | ||
70 | if (npartitions > MAX_OSF_PARTITIONS) { | ||
71 | put_dev_sector(sect); | ||
72 | return 0; | ||
73 | } | ||
74 | for (i = 0 ; i < npartitions; i++, partition++) { | ||
75 | if (slot == state->limit) | ||
76 | break; | ||
77 | if (le32_to_cpu(partition->p_size)) | ||
78 | put_partition(state, slot, | ||
79 | le32_to_cpu(partition->p_offset), | ||
80 | le32_to_cpu(partition->p_size)); | ||
81 | slot++; | ||
82 | } | ||
83 | strlcat(state->pp_buf, "\n", PAGE_SIZE); | ||
84 | put_dev_sector(sect); | ||
85 | return 1; | ||
86 | } | ||
diff --git a/block/partitions/osf.h b/block/partitions/osf.h deleted file mode 100644 index 20ed2315ec1..00000000000 --- a/block/partitions/osf.h +++ /dev/null | |||
@@ -1,7 +0,0 @@ | |||
1 | /* | ||
2 | * fs/partitions/osf.h | ||
3 | */ | ||
4 | |||
5 | #define DISKLABELMAGIC (0x82564557UL) | ||
6 | |||
7 | int osf_partition(struct parsed_partitions *state); | ||
diff --git a/block/partitions/sgi.c b/block/partitions/sgi.c deleted file mode 100644 index ea8a86dceaf..00000000000 --- a/block/partitions/sgi.c +++ /dev/null | |||
@@ -1,82 +0,0 @@ | |||
1 | /* | ||
2 | * fs/partitions/sgi.c | ||
3 | * | ||
4 | * Code extracted from drivers/block/genhd.c | ||
5 | */ | ||
6 | |||
7 | #include "check.h" | ||
8 | #include "sgi.h" | ||
9 | |||
10 | struct sgi_disklabel { | ||
11 | __be32 magic_mushroom; /* Big fat spliff... */ | ||
12 | __be16 root_part_num; /* Root partition number */ | ||
13 | __be16 swap_part_num; /* Swap partition number */ | ||
14 | s8 boot_file[16]; /* Name of boot file for ARCS */ | ||
15 | u8 _unused0[48]; /* Device parameter useless crapola.. */ | ||
16 | struct sgi_volume { | ||
17 | s8 name[8]; /* Name of volume */ | ||
18 | __be32 block_num; /* Logical block number */ | ||
19 | __be32 num_bytes; /* How big, in bytes */ | ||
20 | } volume[15]; | ||
21 | struct sgi_partition { | ||
22 | __be32 num_blocks; /* Size in logical blocks */ | ||
23 | __be32 first_block; /* First logical block */ | ||
24 | __be32 type; /* Type of this partition */ | ||
25 | } partitions[16]; | ||
26 | __be32 csum; /* Disk label checksum */ | ||
27 | __be32 _unused1; /* Padding */ | ||
28 | }; | ||
29 | |||
30 | int sgi_partition(struct parsed_partitions *state) | ||
31 | { | ||
32 | int i, csum; | ||
33 | __be32 magic; | ||
34 | int slot = 1; | ||
35 | unsigned int start, blocks; | ||
36 | __be32 *ui, cs; | ||
37 | Sector sect; | ||
38 | struct sgi_disklabel *label; | ||
39 | struct sgi_partition *p; | ||
40 | char b[BDEVNAME_SIZE]; | ||
41 | |||
42 | label = read_part_sector(state, 0, §); | ||
43 | if (!label) | ||
44 | return -1; | ||
45 | p = &label->partitions[0]; | ||
46 | magic = label->magic_mushroom; | ||
47 | if(be32_to_cpu(magic) != SGI_LABEL_MAGIC) { | ||
48 | /*printk("Dev %s SGI disklabel: bad magic %08x\n", | ||
49 | bdevname(bdev, b), be32_to_cpu(magic));*/ | ||
50 | put_dev_sector(sect); | ||
51 | return 0; | ||
52 | } | ||
53 | ui = ((__be32 *) (label + 1)) - 1; | ||
54 | for(csum = 0; ui >= ((__be32 *) label);) { | ||
55 | cs = *ui--; | ||
56 | csum += be32_to_cpu(cs); | ||
57 | } | ||
58 | if(csum) { | ||
59 | printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n", | ||
60 | bdevname(state->bdev, b)); | ||
61 | put_dev_sector(sect); | ||
62 | return 0; | ||
63 | } | ||
64 | /* All SGI disk labels have 16 partitions, but disks under Linux only | ||
65 | * have 15 minors. Luckily there are always a few zero-length | ||
66 | * partitions which we don't care about so we never overflow the | ||
67 | * current_minor. | ||
68 | */ | ||
69 | for(i = 0; i < 16; i++, p++) { | ||
70 | blocks = be32_to_cpu(p->num_blocks); | ||
71 | start = be32_to_cpu(p->first_block); | ||
72 | if (blocks) { | ||
73 | put_partition(state, slot, start, blocks); | ||
74 | if (be32_to_cpu(p->type) == LINUX_RAID_PARTITION) | ||
75 | state->parts[slot].flags = ADDPART_FLAG_RAID; | ||
76 | } | ||
77 | slot++; | ||
78 | } | ||
79 | strlcat(state->pp_buf, "\n", PAGE_SIZE); | ||
80 | put_dev_sector(sect); | ||
81 | return 1; | ||
82 | } | ||
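
The checksum loop in sgi_partition() above relies on the label being built so that the big-endian 32-bit words of the whole 512-byte block, checksum word included, sum to zero modulo 2^32. A standalone sketch over a synthetic buffer; the checksum offset of 504 is inferred from the struct layout shown and should be treated as an assumption:

#include <stdio.h>
#include <string.h>

/* Build a fake label whose words sum to zero, then re-check it, mirroring
 * the validation rule used by sgi_partition(). */
static unsigned long be32(const unsigned char *p)
{
        return ((unsigned long)p[0] << 24) | ((unsigned long)p[1] << 16) |
               ((unsigned long)p[2] << 8)  |  (unsigned long)p[3];
}

static unsigned long sum_label(const unsigned char *label)
{
        unsigned long sum = 0;
        int i;

        for (i = 0; i < 512; i += 4)
                sum = (sum + be32(label + i)) & 0xffffffffUL;
        return sum;
}

int main(void)
{
        unsigned char label[512];
        unsigned long csum;

        memset(label, 0, sizeof(label));
        label[3] = 0x41;                        /* some nonzero content */

        /* Choose the checksum so that re-summing the label wraps to zero. */
        csum = (0UL - sum_label(label)) & 0xffffffffUL;
        label[504] = (csum >> 24) & 0xff;
        label[505] = (csum >> 16) & 0xff;
        label[506] = (csum >> 8) & 0xff;
        label[507] = csum & 0xff;

        printf("label sums to %lu (0 means intact)\n", sum_label(label));
        return 0;
}
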
diff --git a/block/partitions/sgi.h b/block/partitions/sgi.h deleted file mode 100644 index b9553ebdd5a..00000000000 --- a/block/partitions/sgi.h +++ /dev/null | |||
@@ -1,8 +0,0 @@ | |||
1 | /* | ||
2 | * fs/partitions/sgi.h | ||
3 | */ | ||
4 | |||
5 | extern int sgi_partition(struct parsed_partitions *state); | ||
6 | |||
7 | #define SGI_LABEL_MAGIC 0x0be5a941 | ||
8 | |||
diff --git a/block/partitions/sun.c b/block/partitions/sun.c deleted file mode 100644 index b5b6fcfb3d3..00000000000 --- a/block/partitions/sun.c +++ /dev/null | |||
@@ -1,122 +0,0 @@ | |||
1 | /* | ||
2 | * fs/partitions/sun.c | ||
3 | * | ||
4 | * Code extracted from drivers/block/genhd.c | ||
5 | * | ||
6 | * Copyright (C) 1991-1998 Linus Torvalds | ||
7 | * Re-organised Feb 1998 Russell King | ||
8 | */ | ||
9 | |||
10 | #include "check.h" | ||
11 | #include "sun.h" | ||
12 | |||
13 | int sun_partition(struct parsed_partitions *state) | ||
14 | { | ||
15 | int i; | ||
16 | __be16 csum; | ||
17 | int slot = 1; | ||
18 | __be16 *ush; | ||
19 | Sector sect; | ||
20 | struct sun_disklabel { | ||
21 | unsigned char info[128]; /* Informative text string */ | ||
22 | struct sun_vtoc { | ||
23 | __be32 version; /* Layout version */ | ||
24 | char volume[8]; /* Volume name */ | ||
25 | __be16 nparts; /* Number of partitions */ | ||
26 | struct sun_info { /* Partition hdrs, sec 2 */ | ||
27 | __be16 id; | ||
28 | __be16 flags; | ||
29 | } infos[8]; | ||
30 | __be16 padding; /* Alignment padding */ | ||
31 | __be32 bootinfo[3]; /* Info needed by mboot */ | ||
32 | __be32 sanity; /* To verify vtoc sanity */ | ||
33 | __be32 reserved[10]; /* Free space */ | ||
34 | __be32 timestamp[8]; /* Partition timestamp */ | ||
35 | } vtoc; | ||
36 | __be32 write_reinstruct; /* sectors to skip, writes */ | ||
37 | __be32 read_reinstruct; /* sectors to skip, reads */ | ||
38 | unsigned char spare[148]; /* Padding */ | ||
39 | __be16 rspeed; /* Disk rotational speed */ | ||
40 | __be16 pcylcount; /* Physical cylinder count */ | ||
41 | __be16 sparecyl; /* extra sects per cylinder */ | ||
42 | __be16 obs1; /* gap1 */ | ||
43 | __be16 obs2; /* gap2 */ | ||
44 | __be16 ilfact; /* Interleave factor */ | ||
45 | __be16 ncyl; /* Data cylinder count */ | ||
46 | __be16 nacyl; /* Alt. cylinder count */ | ||
47 | __be16 ntrks; /* Tracks per cylinder */ | ||
48 | __be16 nsect; /* Sectors per track */ | ||
49 | __be16 obs3; /* bhead - Label head offset */ | ||
50 | __be16 obs4; /* ppart - Physical Partition */ | ||
51 | struct sun_partition { | ||
52 | __be32 start_cylinder; | ||
53 | __be32 num_sectors; | ||
54 | } partitions[8]; | ||
55 | __be16 magic; /* Magic number */ | ||
56 | __be16 csum; /* Label xor'd checksum */ | ||
57 | } * label; | ||
58 | struct sun_partition *p; | ||
59 | unsigned long spc; | ||
60 | char b[BDEVNAME_SIZE]; | ||
61 | int use_vtoc; | ||
62 | int nparts; | ||
63 | |||
64 | label = read_part_sector(state, 0, §); | ||
65 | if (!label) | ||
66 | return -1; | ||
67 | |||
68 | p = label->partitions; | ||
69 | if (be16_to_cpu(label->magic) != SUN_LABEL_MAGIC) { | ||
70 | /* printk(KERN_INFO "Dev %s Sun disklabel: bad magic %04x\n", | ||
71 | bdevname(bdev, b), be16_to_cpu(label->magic)); */ | ||
72 | put_dev_sector(sect); | ||
73 | return 0; | ||
74 | } | ||
75 | /* Look at the checksum */ | ||
76 | ush = ((__be16 *) (label+1)) - 1; | ||
77 | for (csum = 0; ush >= ((__be16 *) label);) | ||
78 | csum ^= *ush--; | ||
79 | if (csum) { | ||
80 | printk("Dev %s Sun disklabel: Csum bad, label corrupted\n", | ||
81 | bdevname(state->bdev, b)); | ||
82 | put_dev_sector(sect); | ||
83 | return 0; | ||
84 | } | ||
85 | |||
86 | /* Check to see if we can use the VTOC table */ | ||
87 | use_vtoc = ((be32_to_cpu(label->vtoc.sanity) == SUN_VTOC_SANITY) && | ||
88 | (be32_to_cpu(label->vtoc.version) == 1) && | ||
89 | (be16_to_cpu(label->vtoc.nparts) <= 8)); | ||
90 | |||
91 | /* Use 8 partition entries if not specified in validated VTOC */ | ||
92 | nparts = (use_vtoc) ? be16_to_cpu(label->vtoc.nparts) : 8; | ||
93 | |||
94 | /* | ||
95 | * So that old Linux-Sun partitions continue to work, | ||
96 | * allow the VTOC to be used under the additional condition ... | ||
97 | */ | ||
98 | use_vtoc = use_vtoc || !(label->vtoc.sanity || | ||
99 | label->vtoc.version || label->vtoc.nparts); | ||
100 | spc = be16_to_cpu(label->ntrks) * be16_to_cpu(label->nsect); | ||
101 | for (i = 0; i < nparts; i++, p++) { | ||
102 | unsigned long st_sector; | ||
103 | unsigned int num_sectors; | ||
104 | |||
105 | st_sector = be32_to_cpu(p->start_cylinder) * spc; | ||
106 | num_sectors = be32_to_cpu(p->num_sectors); | ||
107 | if (num_sectors) { | ||
108 | put_partition(state, slot, st_sector, num_sectors); | ||
109 | state->parts[slot].flags = 0; | ||
110 | if (use_vtoc) { | ||
111 | if (be16_to_cpu(label->vtoc.infos[i].id) == LINUX_RAID_PARTITION) | ||
112 | state->parts[slot].flags |= ADDPART_FLAG_RAID; | ||
113 | else if (be16_to_cpu(label->vtoc.infos[i].id) == SUN_WHOLE_DISK) | ||
114 | state->parts[slot].flags |= ADDPART_FLAG_WHOLEDISK; | ||
115 | } | ||
116 | } | ||
117 | slot++; | ||
118 | } | ||
119 | strlcat(state->pp_buf, "\n", PAGE_SIZE); | ||
120 | put_dev_sector(sect); | ||
121 | return 1; | ||
122 | } | ||
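
Sun labels record partition starts in cylinders, so sun_partition() above multiplies by sectors per cylinder (ntrks * nsect) to turn them into sector offsets. A trivial standalone sketch of that conversion with made-up geometry:

#include <stdio.h>

/* Sketch of the geometry conversion in sun_partition(); the numbers below
 * are made up, in a real run they come from the on-disk label. */
int main(void)
{
        unsigned ntrks = 16, nsect = 63;
        unsigned long spc = (unsigned long)ntrks * nsect;  /* sectors/cylinder */
        unsigned start_cylinder = 100, num_sectors = 204800;

        printf("partition starts at sector %lu and spans %u sectors\n",
               start_cylinder * spc, num_sectors);
        return 0;
}
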
diff --git a/block/partitions/sun.h b/block/partitions/sun.h deleted file mode 100644 index 2424baa8319..00000000000 --- a/block/partitions/sun.h +++ /dev/null | |||
@@ -1,8 +0,0 @@ | |||
1 | /* | ||
2 | * fs/partitions/sun.h | ||
3 | */ | ||
4 | |||
5 | #define SUN_LABEL_MAGIC 0xDABE | ||
6 | #define SUN_VTOC_SANITY 0x600DDEEE | ||
7 | |||
8 | int sun_partition(struct parsed_partitions *state); | ||
diff --git a/block/partitions/sysv68.c b/block/partitions/sysv68.c deleted file mode 100644 index 9627ccffc1c..00000000000 --- a/block/partitions/sysv68.c +++ /dev/null | |||
@@ -1,95 +0,0 @@ | |||
1 | /* | ||
2 | * fs/partitions/sysv68.c | ||
3 | * | ||
4 | * Copyright (C) 2007 Philippe De Muyter <phdm@macqel.be> | ||
5 | */ | ||
6 | |||
7 | #include "check.h" | ||
8 | #include "sysv68.h" | ||
9 | |||
10 | /* | ||
11 | * Volume ID structure: on the first 256-byte sector of the disk | ||
12 | */ | ||
13 | |||
14 | struct volumeid { | ||
15 | u8 vid_unused[248]; | ||
16 | u8 vid_mac[8]; /* ASCII string "MOTOROLA" */ | ||
17 | }; | ||
18 | |||
19 | /* | ||
20 | * config block: the second 256-byte sector of the disk | ||
21 | */ | ||
22 | |||
23 | struct dkconfig { | ||
24 | u8 ios_unused0[128]; | ||
25 | __be32 ios_slcblk; /* Slice table block number */ | ||
26 | __be16 ios_slccnt; /* Number of entries in slice table */ | ||
27 | u8 ios_unused1[122]; | ||
28 | }; | ||
29 | |||
30 | /* | ||
31 | * combined volumeid and dkconfig block | ||
32 | */ | ||
33 | |||
34 | struct dkblk0 { | ||
35 | struct volumeid dk_vid; | ||
36 | struct dkconfig dk_ios; | ||
37 | }; | ||
38 | |||
39 | /* | ||
40 | * Slice Table Structure | ||
41 | */ | ||
42 | |||
43 | struct slice { | ||
44 | __be32 nblocks; /* slice size (in blocks) */ | ||
45 | __be32 blkoff; /* block offset of slice */ | ||
46 | }; | ||
47 | |||
48 | |||
49 | int sysv68_partition(struct parsed_partitions *state) | ||
50 | { | ||
51 | int i, slices; | ||
52 | int slot = 1; | ||
53 | Sector sect; | ||
54 | unsigned char *data; | ||
55 | struct dkblk0 *b; | ||
56 | struct slice *slice; | ||
57 | char tmp[64]; | ||
58 | |||
59 | data = read_part_sector(state, 0, §); | ||
60 | if (!data) | ||
61 | return -1; | ||
62 | |||
63 | b = (struct dkblk0 *)data; | ||
64 | if (memcmp(b->dk_vid.vid_mac, "MOTOROLA", sizeof(b->dk_vid.vid_mac))) { | ||
65 | put_dev_sector(sect); | ||
66 | return 0; | ||
67 | } | ||
68 | slices = be16_to_cpu(b->dk_ios.ios_slccnt); | ||
69 | i = be32_to_cpu(b->dk_ios.ios_slcblk); | ||
70 | put_dev_sector(sect); | ||
71 | |||
72 | data = read_part_sector(state, i, §); | ||
73 | if (!data) | ||
74 | return -1; | ||
75 | |||
76 | slices -= 1; /* last slice is the whole disk */ | ||
77 | snprintf(tmp, sizeof(tmp), "sysV68: %s(s%u)", state->name, slices); | ||
78 | strlcat(state->pp_buf, tmp, PAGE_SIZE); | ||
79 | slice = (struct slice *)data; | ||
80 | for (i = 0; i < slices; i++, slice++) { | ||
81 | if (slot == state->limit) | ||
82 | break; | ||
83 | if (be32_to_cpu(slice->nblocks)) { | ||
84 | put_partition(state, slot, | ||
85 | be32_to_cpu(slice->blkoff), | ||
86 | be32_to_cpu(slice->nblocks)); | ||
87 | snprintf(tmp, sizeof(tmp), "(s%u)", i); | ||
88 | strlcat(state->pp_buf, tmp, PAGE_SIZE); | ||
89 | } | ||
90 | slot++; | ||
91 | } | ||
92 | strlcat(state->pp_buf, "\n", PAGE_SIZE); | ||
93 | put_dev_sector(sect); | ||
94 | return 1; | ||
95 | } | ||
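
The flow in sysv68_partition() above is: verify the "MOTOROLA" volume ID in sector 0, read the slice-table block named by the config block, then report every non-empty slice except the last one, which covers the whole disk. A standalone sketch of that last step over a synthetic in-memory slice table, with values already in CPU byte order:

#include <stdio.h>

/* Sketch of the slice-table walk: ios_slccnt entries of {size, offset},
 * with the final whole-disk slice dropped. All values are synthetic. */
struct slice {
        unsigned nblocks;       /* slice size (in blocks) */
        unsigned blkoff;        /* block offset of slice */
};

int main(void)
{
        struct slice table[] = {
                {  4096,   32 },
                {  8192, 4128 },
                {     0,    0 },        /* empty slot: skipped */
                { 20480,    0 },        /* whole-disk slice: dropped below */
        };
        int slices = 4;                 /* would come from ios_slccnt */
        int i;

        slices -= 1;                    /* last slice is the whole disk */
        for (i = 0; i < slices; i++)
                if (table[i].nblocks)
                        printf("s%d: offset %u, %u blocks\n",
                               i, table[i].blkoff, table[i].nblocks);
        return 0;
}
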
diff --git a/block/partitions/sysv68.h b/block/partitions/sysv68.h deleted file mode 100644 index bf2f5ffa97a..00000000000 --- a/block/partitions/sysv68.h +++ /dev/null | |||
@@ -1 +0,0 @@ | |||
1 | extern int sysv68_partition(struct parsed_partitions *state); | ||
diff --git a/block/partitions/ultrix.c b/block/partitions/ultrix.c deleted file mode 100644 index 8dbaf9f77a9..00000000000 --- a/block/partitions/ultrix.c +++ /dev/null | |||
@@ -1,48 +0,0 @@ | |||
1 | /* | ||
2 | * fs/partitions/ultrix.c | ||
3 | * | ||
4 | * Code extracted from drivers/block/genhd.c | ||
5 | * | ||
6 | * Re-organised Jul 1999 Russell King | ||
7 | */ | ||
8 | |||
9 | #include "check.h" | ||
10 | #include "ultrix.h" | ||
11 | |||
12 | int ultrix_partition(struct parsed_partitions *state) | ||
13 | { | ||
14 | int i; | ||
15 | Sector sect; | ||
16 | unsigned char *data; | ||
17 | struct ultrix_disklabel { | ||
18 | s32 pt_magic; /* magic no. indicating part. info exists */ | ||
19 | s32 pt_valid; /* set by driver if pt is current */ | ||
20 | struct pt_info { | ||
21 | s32 pi_nblocks; /* no. of sectors */ | ||
22 | u32 pi_blkoff; /* block offset for start */ | ||
23 | } pt_part[8]; | ||
24 | } *label; | ||
25 | |||
26 | #define PT_MAGIC 0x032957 /* Partition magic number */ | ||
27 | #define PT_VALID 1 /* Indicates if struct is valid */ | ||
28 | |||
29 | data = read_part_sector(state, (16384 - sizeof(*label))/512, §); | ||
30 | if (!data) | ||
31 | return -1; | ||
32 | |||
33 | label = (struct ultrix_disklabel *)(data + 512 - sizeof(*label)); | ||
34 | |||
35 | if (label->pt_magic == PT_MAGIC && label->pt_valid == PT_VALID) { | ||
36 | for (i=0; i<8; i++) | ||
37 | if (label->pt_part[i].pi_nblocks) | ||
38 | put_partition(state, i+1, | ||
39 | label->pt_part[i].pi_blkoff, | ||
40 | label->pt_part[i].pi_nblocks); | ||
41 | put_dev_sector(sect); | ||
42 | strlcat(state->pp_buf, "\n", PAGE_SIZE); | ||
43 | return 1; | ||
44 | } else { | ||
45 | put_dev_sector(sect); | ||
46 | return 0; | ||
47 | } | ||
48 | } | ||
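
The odd-looking read offset in ultrix_partition() above places the label at the very end of the 16 KiB boot area: it occupies the last sizeof(label) bytes before byte 16384, so the parser reads sector (16384 - sizeof(label))/512 and finds the label sizeof(label) bytes before the end of that sector. A standalone sketch of the arithmetic, assuming 32-bit int to match the kernel's s32/u32 fields:

#include <stdio.h>

/* Mirror of the on-disk label layout, using plain int/unsigned on the
 * assumption that they are 32 bits wide, like the kernel's s32/u32. */
struct pt_info {
        int pi_nblocks;         /* no. of sectors */
        unsigned pi_blkoff;     /* block offset for start */
};

struct ultrix_disklabel {
        int pt_magic;
        int pt_valid;
        struct pt_info pt_part[8];
};

int main(void)
{
        unsigned long lbl = sizeof(struct ultrix_disklabel);    /* 72 bytes */

        printf("label is %lu bytes: read sector %lu, start at offset %lu\n",
               lbl, (16384 - lbl) / 512, 512 - lbl);
        return 0;
}
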
diff --git a/block/partitions/ultrix.h b/block/partitions/ultrix.h deleted file mode 100644 index a3cc00b2bde..00000000000 --- a/block/partitions/ultrix.h +++ /dev/null | |||
@@ -1,5 +0,0 @@ | |||
1 | /* | ||
2 | * fs/partitions/ultrix.h | ||
3 | */ | ||
4 | |||
5 | int ultrix_partition(struct parsed_partitions *state); | ||
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 9a87daa6f4f..4f4230b79bb 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c | |||
@@ -24,7 +24,6 @@ | |||
24 | #include <linux/capability.h> | 24 | #include <linux/capability.h> |
25 | #include <linux/completion.h> | 25 | #include <linux/completion.h> |
26 | #include <linux/cdrom.h> | 26 | #include <linux/cdrom.h> |
27 | #include <linux/ratelimit.h> | ||
28 | #include <linux/slab.h> | 27 | #include <linux/slab.h> |
29 | #include <linux/times.h> | 28 | #include <linux/times.h> |
30 | #include <asm/uaccess.h> | 29 | #include <asm/uaccess.h> |
@@ -566,7 +565,7 @@ int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mod | |||
566 | { | 565 | { |
567 | int err; | 566 | int err; |
568 | 567 | ||
569 | if (!q) | 568 | if (!q || blk_get_queue(q)) |
570 | return -ENXIO; | 569 | return -ENXIO; |
571 | 570 | ||
572 | switch (cmd) { | 571 | switch (cmd) { |
@@ -687,64 +686,11 @@ int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mod | |||
687 | err = -ENOTTY; | 686 | err = -ENOTTY; |
688 | } | 687 | } |
689 | 688 | ||
689 | blk_put_queue(q); | ||
690 | return err; | 690 | return err; |
691 | } | 691 | } |
692 | EXPORT_SYMBOL(scsi_cmd_ioctl); | 692 | EXPORT_SYMBOL(scsi_cmd_ioctl); |
693 | 693 | ||
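
The hunk above turns the entry check into `if (!q || blk_get_queue(q))` and pairs it with `blk_put_queue(q)` on the way out, so the ioctl now holds a queue reference for its whole duration. A generic standalone sketch of that get/put pattern with a toy reference count, not the kernel's queue API:

#include <stdio.h>

/* Fail fast if the object is absent or already going away, otherwise hold
 * a reference for the whole operation. Toy refcount, not blk_get_queue()/
 * blk_put_queue(). */
struct queue {
        int refs;
        int dying;
};

static int queue_get(struct queue *q)
{
        if (q->dying)
                return 1;       /* nonzero: could not take a reference */
        q->refs++;
        return 0;
}

static void queue_put(struct queue *q)
{
        q->refs--;
}

static int do_ioctl(struct queue *q)
{
        int err;

        if (!q || queue_get(q))
                return -6;      /* stand-in for -ENXIO */

        err = 0;                /* ... handle the command here ... */

        queue_put(q);
        return err;
}

int main(void)
{
        struct queue q = { 1, 0 };

        printf("ioctl returned %d, refs back to %d\n", do_ioctl(&q), q.refs);
        return 0;
}
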
694 | int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd) | ||
695 | { | ||
696 | if (bd && bd == bd->bd_contains) | ||
697 | return 0; | ||
698 | |||
699 | /* Actually none of these is particularly useful on a partition, | ||
700 | * but they are safe. | ||
701 | */ | ||
702 | switch (cmd) { | ||
703 | case SCSI_IOCTL_GET_IDLUN: | ||
704 | case SCSI_IOCTL_GET_BUS_NUMBER: | ||
705 | case SCSI_IOCTL_GET_PCI: | ||
706 | case SCSI_IOCTL_PROBE_HOST: | ||
707 | case SG_GET_VERSION_NUM: | ||
708 | case SG_SET_TIMEOUT: | ||
709 | case SG_GET_TIMEOUT: | ||
710 | case SG_GET_RESERVED_SIZE: | ||
711 | case SG_SET_RESERVED_SIZE: | ||
712 | case SG_EMULATED_HOST: | ||
713 | return 0; | ||
714 | case CDROM_GET_CAPABILITY: | ||
715 | /* Keep this until we remove the printk below. udev sends it | ||
716 | * and we do not want to spam dmesg about it. CD-ROMs do | ||
717 | * not have partitions, so we get here only for disks. | ||
718 | */ | ||
719 | return -ENOIOCTLCMD; | ||
720 | default: | ||
721 | break; | ||
722 | } | ||
723 | |||
724 | if (capable(CAP_SYS_RAWIO)) | ||
725 | return 0; | ||
726 | |||
727 | /* In particular, rule out all resets and host-specific ioctls. */ | ||
728 | printk_ratelimited(KERN_WARNING | ||
729 | "%s: sending ioctl %x to a partition!\n", current->comm, cmd); | ||
730 | |||
731 | return -ENOIOCTLCMD; | ||
732 | } | ||
733 | EXPORT_SYMBOL(scsi_verify_blk_ioctl); | ||
734 | |||
735 | int scsi_cmd_blk_ioctl(struct block_device *bd, fmode_t mode, | ||
736 | unsigned int cmd, void __user *arg) | ||
737 | { | ||
738 | int ret; | ||
739 | |||
740 | ret = scsi_verify_blk_ioctl(bd, cmd); | ||
741 | if (ret < 0) | ||
742 | return ret; | ||
743 | |||
744 | return scsi_cmd_ioctl(bd->bd_disk->queue, bd->bd_disk, mode, cmd, arg); | ||
745 | } | ||
746 | EXPORT_SYMBOL(scsi_cmd_blk_ioctl); | ||
747 | |||
748 | static int __init blk_scsi_ioctl_init(void) | 694 | static int __init blk_scsi_ioctl_init(void) |
749 | { | 695 | { |
750 | blk_set_cmd_filter_defaults(&blk_default_cmd_filter); | 696 | blk_set_cmd_filter_defaults(&blk_default_cmd_filter); |