author    Jonathan Herman <hermanjl@cs.unc.edu>  2013-01-17 16:15:55 -0500
committer Jonathan Herman <hermanjl@cs.unc.edu>  2013-01-17 16:15:55 -0500
commit    8dea78da5cee153b8af9c07a2745f6c55057fe12 (patch)
tree      a8f4d49d63b1ecc92f2fddceba0655b2472c5bd9 /block
parent    406089d01562f1e2bf9f089fd7637009ebaad589 (diff)
Patched in Tegra support.
Diffstat (limited to 'block')
-rw-r--r--  block/Kconfig | 9
-rw-r--r--  block/Kconfig.iosched | 4
-rw-r--r--  block/Makefile | 3
-rw-r--r--  block/blk-cgroup.c | 2131
-rw-r--r--  block/blk-cgroup.h | 765
-rw-r--r--  block/blk-core.c | 1139
-rw-r--r--  block/blk-exec.c | 27
-rw-r--r--  block/blk-integrity.c | 1
-rw-r--r--  block/blk-ioc.c | 410
-rw-r--r--  block/blk-lib.c | 151
-rw-r--r--  block/blk-map.c | 2
-rw-r--r--  block/blk-merge.c | 193
-rw-r--r--  block/blk-settings.c | 53
-rw-r--r--  block/blk-softirq.c | 16
-rw-r--r--  block/blk-sysfs.c | 101
-rw-r--r--  block/blk-tag.c | 17
-rw-r--r--  block/blk-throttle.c | 770
-rw-r--r--  block/blk-timeout.c | 41
-rw-r--r--  block/blk.h | 107
-rw-r--r--  block/bsg-lib.c | 68
-rw-r--r--  block/bsg.c | 9
-rw-r--r--  block/cfq-iosched.c | 1655
-rw-r--r--  block/compat_ioctl.c | 3
-rw-r--r--  block/deadline-iosched.c | 14
-rw-r--r--  block/elevator.c | 366
-rw-r--r--  block/genhd.c | 104
-rw-r--r--  block/ioctl.c | 121
-rw-r--r--  block/noop-iosched.c | 12
-rw-r--r--  block/partition-generic.c | 571
-rw-r--r--  block/partitions/Kconfig | 251
-rw-r--r--  block/partitions/Makefile | 20
-rw-r--r--  block/partitions/acorn.c | 556
-rw-r--r--  block/partitions/acorn.h | 14
-rw-r--r--  block/partitions/amiga.c | 139
-rw-r--r--  block/partitions/amiga.h | 6
-rw-r--r--  block/partitions/atari.c | 149
-rw-r--r--  block/partitions/atari.h | 34
-rw-r--r--  block/partitions/check.c | 166
-rw-r--r--  block/partitions/check.h | 52
-rw-r--r--  block/partitions/efi.c | 670
-rw-r--r--  block/partitions/efi.h | 134
-rw-r--r--  block/partitions/ibm.c | 364
-rw-r--r--  block/partitions/ibm.h | 1
-rw-r--r--  block/partitions/karma.c | 57
-rw-r--r--  block/partitions/karma.h | 8
-rw-r--r--  block/partitions/ldm.c | 1567
-rw-r--r--  block/partitions/ldm.h | 215
-rw-r--r--  block/partitions/mac.c | 134
-rw-r--r--  block/partitions/mac.h | 44
-rw-r--r--  block/partitions/msdos.c | 569
-rw-r--r--  block/partitions/msdos.h | 8
-rw-r--r--  block/partitions/osf.c | 86
-rw-r--r--  block/partitions/osf.h | 7
-rw-r--r--  block/partitions/sgi.c | 82
-rw-r--r--  block/partitions/sgi.h | 8
-rw-r--r--  block/partitions/sun.c | 122
-rw-r--r--  block/partitions/sun.h | 8
-rw-r--r--  block/partitions/sysv68.c | 95
-rw-r--r--  block/partitions/sysv68.h | 1
-rw-r--r--  block/partitions/ultrix.c | 48
-rw-r--r--  block/partitions/ultrix.h | 5
-rw-r--r--  block/scsi_ioctl.c | 58
62 files changed, 4105 insertions, 10436 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 4a85ccf8d4c..e97934eecec 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -4,7 +4,6 @@
 menuconfig BLOCK
        bool "Enable the block layer" if EXPERT
        default y
-       select PERCPU_RWSEM
        help
         Provide block layer support for the kernel.
 
@@ -90,7 +89,7 @@ config BLK_DEV_INTEGRITY
 
 config BLK_DEV_THROTTLING
        bool "Block layer bio throttling support"
-       depends on BLK_CGROUP=y
+       depends on BLK_CGROUP=y && EXPERIMENTAL
        default n
        ---help---
        Block layer bio throttling support. It can be used to limit
@@ -100,12 +99,6 @@ config BLK_DEV_THROTTLING
 
        See Documentation/cgroups/blkio-controller.txt for more information.
 
-menu "Partition Types"
-
-source "block/partitions/Kconfig"
-
-endmenu
-
 endif # BLOCK
 
 config BLOCK_COMPAT
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 421bef9c4c4..3199b76f795 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -23,6 +23,8 @@ config IOSCHED_DEADLINE
 
 config IOSCHED_CFQ
        tristate "CFQ I/O scheduler"
+       # If BLK_CGROUP is a module, CFQ has to be built as module.
+       depends on (BLK_CGROUP=m && m) || !BLK_CGROUP || BLK_CGROUP=y
        default y
        ---help---
          The CFQ I/O scheduler tries to distribute bandwidth equally
@@ -32,6 +34,8 @@ config IOSCHED_CFQ
 
          This is the default I/O scheduler.
 
+         Note: If BLK_CGROUP=m, then CFQ can be built only as module.
+
 config CFQ_GROUP_IOSCHED
        bool "CFQ Group Scheduling support"
        depends on IOSCHED_CFQ && BLK_CGROUP
diff --git a/block/Makefile b/block/Makefile
index 39b76ba66ff..514c6e4f427 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,8 +5,7 @@
 obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
                        blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
                        blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
-                       blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o \
-                       partition-generic.o partitions/
+                       blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o
 
 obj-$(CONFIG_BLK_DEV_BSG)      += bsg.o
 obj-$(CONFIG_BLK_DEV_BSGLIB)   += bsg-lib.o
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index b8858fb0caf..b596e54ddd7 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -11,959 +11,1656 @@
11 * Nauman Rafique <nauman@google.com> 11 * Nauman Rafique <nauman@google.com>
12 */ 12 */
13#include <linux/ioprio.h> 13#include <linux/ioprio.h>
14#include <linux/seq_file.h>
14#include <linux/kdev_t.h> 15#include <linux/kdev_t.h>
15#include <linux/module.h> 16#include <linux/module.h>
16#include <linux/err.h> 17#include <linux/err.h>
17#include <linux/blkdev.h> 18#include <linux/blkdev.h>
18#include <linux/slab.h> 19#include <linux/slab.h>
19#include <linux/genhd.h>
20#include <linux/delay.h>
21#include <linux/atomic.h>
22#include "blk-cgroup.h" 20#include "blk-cgroup.h"
23#include "blk.h" 21#include <linux/genhd.h>
24 22
25#define MAX_KEY_LEN 100 23#define MAX_KEY_LEN 100
26 24
27static DEFINE_MUTEX(blkcg_pol_mutex); 25static DEFINE_SPINLOCK(blkio_list_lock);
26static LIST_HEAD(blkio_list);
28 27
29struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT }; 28struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
30EXPORT_SYMBOL_GPL(blkcg_root); 29EXPORT_SYMBOL_GPL(blkio_root_cgroup);
31 30
32static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; 31static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
32 struct cgroup *);
33static int blkiocg_can_attach_task(struct cgroup *, struct task_struct *);
34static void blkiocg_attach_task(struct cgroup *, struct task_struct *);
35static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
36static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
33 37
34static bool blkcg_policy_enabled(struct request_queue *q, 38/* for encoding cft->private value on file */
35 const struct blkcg_policy *pol) 39#define BLKIOFILE_PRIVATE(x, val) (((x) << 16) | (val))
40/* What policy owns the file, proportional or throttle */
41#define BLKIOFILE_POLICY(val) (((val) >> 16) & 0xffff)
42#define BLKIOFILE_ATTR(val) ((val) & 0xffff)
43
44struct cgroup_subsys blkio_subsys = {
45 .name = "blkio",
46 .create = blkiocg_create,
47 .can_attach_task = blkiocg_can_attach_task,
48 .attach_task = blkiocg_attach_task,
49 .destroy = blkiocg_destroy,
50 .populate = blkiocg_populate,
51#ifdef CONFIG_BLK_CGROUP
52 /* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */
53 .subsys_id = blkio_subsys_id,
54#endif
55 .use_id = 1,
56 .module = THIS_MODULE,
57};
58EXPORT_SYMBOL_GPL(blkio_subsys);
59
60static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
61 struct blkio_policy_node *pn)
36{ 62{
37 return pol && test_bit(pol->plid, q->blkcg_pols); 63 list_add(&pn->node, &blkcg->policy_list);
38} 64}
39 65
40/** 66static inline bool cftype_blkg_same_policy(struct cftype *cft,
41 * blkg_free - free a blkg 67 struct blkio_group *blkg)
42 * @blkg: blkg to free
43 *
44 * Free @blkg which may be partially allocated.
45 */
46static void blkg_free(struct blkcg_gq *blkg)
47{ 68{
48 int i; 69 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
49 70
50 if (!blkg) 71 if (blkg->plid == plid)
51 return; 72 return 1;
73
74 return 0;
75}
52 76
53 for (i = 0; i < BLKCG_MAX_POLS; i++) { 77/* Determines if policy node matches cgroup file being accessed */
54 struct blkcg_policy *pol = blkcg_policy[i]; 78static inline bool pn_matches_cftype(struct cftype *cft,
55 struct blkg_policy_data *pd = blkg->pd[i]; 79 struct blkio_policy_node *pn)
80{
81 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
82 int fileid = BLKIOFILE_ATTR(cft->private);
56 83
57 if (!pd) 84 return (plid == pn->plid && fileid == pn->fileid);
58 continue; 85}
59 86
60 if (pol && pol->pd_exit_fn) 87/* Must be called with blkcg->lock held */
61 pol->pd_exit_fn(blkg); 88static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
89{
90 list_del(&pn->node);
91}
62 92
63 kfree(pd); 93/* Must be called with blkcg->lock held */
94static struct blkio_policy_node *
95blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev,
96 enum blkio_policy_id plid, int fileid)
97{
98 struct blkio_policy_node *pn;
99
100 list_for_each_entry(pn, &blkcg->policy_list, node) {
101 if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid)
102 return pn;
64 } 103 }
65 104
66 blk_exit_rl(&blkg->rl); 105 return NULL;
67 kfree(blkg);
68} 106}
69 107
70/** 108struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
71 * blkg_alloc - allocate a blkg
72 * @blkcg: block cgroup the new blkg is associated with
73 * @q: request_queue the new blkg is associated with
74 * @gfp_mask: allocation mask to use
75 *
76 * Allocate a new blkg assocating @blkcg and @q.
77 */
78static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
79 gfp_t gfp_mask)
80{ 109{
81 struct blkcg_gq *blkg; 110 return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
82 int i; 111 struct blkio_cgroup, css);
112}
113EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
83 114
84 /* alloc and init base part */ 115struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk)
85 blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node); 116{
86 if (!blkg) 117 return container_of(task_subsys_state(tsk, blkio_subsys_id),
87 return NULL; 118 struct blkio_cgroup, css);
119}
120EXPORT_SYMBOL_GPL(task_blkio_cgroup);
88 121
89 blkg->q = q; 122static inline void
90 INIT_LIST_HEAD(&blkg->q_node); 123blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight)
91 blkg->blkcg = blkcg; 124{
92 blkg->refcnt = 1; 125 struct blkio_policy_type *blkiop;
93 126
94 /* root blkg uses @q->root_rl, init rl only for !root blkgs */ 127 list_for_each_entry(blkiop, &blkio_list, list) {
95 if (blkcg != &blkcg_root) { 128 /* If this policy does not own the blkg, do not send updates */
96 if (blk_init_rl(&blkg->rl, q, gfp_mask)) 129 if (blkiop->plid != blkg->plid)
97 goto err_free; 130 continue;
98 blkg->rl.blkg = blkg; 131 if (blkiop->ops.blkio_update_group_weight_fn)
132 blkiop->ops.blkio_update_group_weight_fn(blkg->key,
133 blkg, weight);
99 } 134 }
135}
100 136
101 for (i = 0; i < BLKCG_MAX_POLS; i++) { 137static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps,
102 struct blkcg_policy *pol = blkcg_policy[i]; 138 int fileid)
103 struct blkg_policy_data *pd; 139{
140 struct blkio_policy_type *blkiop;
104 141
105 if (!blkcg_policy_enabled(q, pol)) 142 list_for_each_entry(blkiop, &blkio_list, list) {
106 continue;
107 143
108 /* alloc per-policy data and attach it to blkg */ 144 /* If this policy does not own the blkg, do not send updates */
109 pd = kzalloc_node(pol->pd_size, gfp_mask, q->node); 145 if (blkiop->plid != blkg->plid)
110 if (!pd) 146 continue;
111 goto err_free;
112 147
113 blkg->pd[i] = pd; 148 if (fileid == BLKIO_THROTL_read_bps_device
114 pd->blkg = blkg; 149 && blkiop->ops.blkio_update_group_read_bps_fn)
150 blkiop->ops.blkio_update_group_read_bps_fn(blkg->key,
151 blkg, bps);
115 152
116 /* invoke per-policy init */ 153 if (fileid == BLKIO_THROTL_write_bps_device
117 if (blkcg_policy_enabled(blkg->q, pol)) 154 && blkiop->ops.blkio_update_group_write_bps_fn)
118 pol->pd_init_fn(blkg); 155 blkiop->ops.blkio_update_group_write_bps_fn(blkg->key,
156 blkg, bps);
119 } 157 }
120
121 return blkg;
122
123err_free:
124 blkg_free(blkg);
125 return NULL;
126} 158}
127 159
128static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, 160static inline void blkio_update_group_iops(struct blkio_group *blkg,
129 struct request_queue *q) 161 unsigned int iops, int fileid)
130{ 162{
131 struct blkcg_gq *blkg; 163 struct blkio_policy_type *blkiop;
132 164
133 blkg = rcu_dereference(blkcg->blkg_hint); 165 list_for_each_entry(blkiop, &blkio_list, list) {
134 if (blkg && blkg->q == q)
135 return blkg;
136 166
137 /* 167 /* If this policy does not own the blkg, do not send updates */
138 * Hint didn't match. Look up from the radix tree. Note that we 168 if (blkiop->plid != blkg->plid)
139 * may not be holding queue_lock and thus are not sure whether 169 continue;
140 * @blkg from blkg_tree has already been removed or not, so we
141 * can't update hint to the lookup result. Leave it to the caller.
142 */
143 blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
144 if (blkg && blkg->q == q)
145 return blkg;
146 170
147 return NULL; 171 if (fileid == BLKIO_THROTL_read_iops_device
172 && blkiop->ops.blkio_update_group_read_iops_fn)
173 blkiop->ops.blkio_update_group_read_iops_fn(blkg->key,
174 blkg, iops);
175
176 if (fileid == BLKIO_THROTL_write_iops_device
177 && blkiop->ops.blkio_update_group_write_iops_fn)
178 blkiop->ops.blkio_update_group_write_iops_fn(blkg->key,
179 blkg,iops);
180 }
148} 181}
149 182
150/** 183/*
151 * blkg_lookup - lookup blkg for the specified blkcg - q pair 184 * Add to the appropriate stat variable depending on the request type.
152 * @blkcg: blkcg of interest 185 * This should be called with the blkg->stats_lock held.
153 * @q: request_queue of interest
154 *
155 * Lookup blkg for the @blkcg - @q pair. This function should be called
156 * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
157 * - see blk_queue_bypass_start() for details.
158 */ 186 */
159struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) 187static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
188 bool sync)
160{ 189{
161 WARN_ON_ONCE(!rcu_read_lock_held()); 190 if (direction)
162 191 stat[BLKIO_STAT_WRITE] += add;
163 if (unlikely(blk_queue_bypass(q))) 192 else
164 return NULL; 193 stat[BLKIO_STAT_READ] += add;
165 return __blkg_lookup(blkcg, q); 194 if (sync)
195 stat[BLKIO_STAT_SYNC] += add;
196 else
197 stat[BLKIO_STAT_ASYNC] += add;
166} 198}
167EXPORT_SYMBOL_GPL(blkg_lookup);
168 199
169/* 200/*
170 * If @new_blkg is %NULL, this function tries to allocate a new one as 201 * Decrements the appropriate stat variable if non-zero depending on the
171 * necessary using %GFP_ATOMIC. @new_blkg is always consumed on return. 202 * request type. Panics on value being zero.
203 * This should be called with the blkg->stats_lock held.
172 */ 204 */
173static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, 205static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
174 struct request_queue *q,
175 struct blkcg_gq *new_blkg)
176{ 206{
177 struct blkcg_gq *blkg; 207 if (direction) {
178 int ret; 208 BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
209 stat[BLKIO_STAT_WRITE]--;
210 } else {
211 BUG_ON(stat[BLKIO_STAT_READ] == 0);
212 stat[BLKIO_STAT_READ]--;
213 }
214 if (sync) {
215 BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
216 stat[BLKIO_STAT_SYNC]--;
217 } else {
218 BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
219 stat[BLKIO_STAT_ASYNC]--;
220 }
221}
179 222
180 WARN_ON_ONCE(!rcu_read_lock_held()); 223#ifdef CONFIG_DEBUG_BLK_CGROUP
181 lockdep_assert_held(q->queue_lock); 224/* This should be called with the blkg->stats_lock held. */
225static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
226 struct blkio_group *curr_blkg)
227{
228 if (blkio_blkg_waiting(&blkg->stats))
229 return;
230 if (blkg == curr_blkg)
231 return;
232 blkg->stats.start_group_wait_time = sched_clock();
233 blkio_mark_blkg_waiting(&blkg->stats);
234}
182 235
183 /* lookup and update hint on success, see __blkg_lookup() for details */ 236/* This should be called with the blkg->stats_lock held. */
184 blkg = __blkg_lookup(blkcg, q); 237static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
185 if (blkg) { 238{
186 rcu_assign_pointer(blkcg->blkg_hint, blkg); 239 unsigned long long now;
187 goto out_free;
188 }
189 240
190 /* blkg holds a reference to blkcg */ 241 if (!blkio_blkg_waiting(stats))
191 if (!css_tryget(&blkcg->css)) { 242 return;
192 blkg = ERR_PTR(-EINVAL);
193 goto out_free;
194 }
195 243
196 /* allocate */ 244 now = sched_clock();
197 if (!new_blkg) { 245 if (time_after64(now, stats->start_group_wait_time))
198 new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC); 246 stats->group_wait_time += now - stats->start_group_wait_time;
199 if (unlikely(!new_blkg)) { 247 blkio_clear_blkg_waiting(stats);
200 blkg = ERR_PTR(-ENOMEM); 248}
201 goto out_put;
202 }
203 }
204 blkg = new_blkg;
205 249
206 /* insert */ 250/* This should be called with the blkg->stats_lock held. */
207 spin_lock(&blkcg->lock); 251static void blkio_end_empty_time(struct blkio_group_stats *stats)
208 ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); 252{
209 if (likely(!ret)) { 253 unsigned long long now;
210 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
211 list_add(&blkg->q_node, &q->blkg_list);
212 }
213 spin_unlock(&blkcg->lock);
214 254
215 if (!ret) 255 if (!blkio_blkg_empty(stats))
216 return blkg; 256 return;
217 257
218 blkg = ERR_PTR(ret); 258 now = sched_clock();
219out_put: 259 if (time_after64(now, stats->start_empty_time))
220 css_put(&blkcg->css); 260 stats->empty_time += now - stats->start_empty_time;
221out_free: 261 blkio_clear_blkg_empty(stats);
222 blkg_free(new_blkg);
223 return blkg;
224} 262}
225 263
226struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, 264void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
227 struct request_queue *q)
228{ 265{
229 /* 266 unsigned long flags;
230 * This could be the first entry point of blkcg implementation and 267
231 * we shouldn't allow anything to go through for a bypassing queue. 268 spin_lock_irqsave(&blkg->stats_lock, flags);
232 */ 269 BUG_ON(blkio_blkg_idling(&blkg->stats));
233 if (unlikely(blk_queue_bypass(q))) 270 blkg->stats.start_idle_time = sched_clock();
234 return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY); 271 blkio_mark_blkg_idling(&blkg->stats);
235 return __blkg_lookup_create(blkcg, q, NULL); 272 spin_unlock_irqrestore(&blkg->stats_lock, flags);
236} 273}
237EXPORT_SYMBOL_GPL(blkg_lookup_create); 274EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);
238 275
239static void blkg_destroy(struct blkcg_gq *blkg) 276void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
240{ 277{
241 struct blkcg *blkcg = blkg->blkcg; 278 unsigned long flags;
279 unsigned long long now;
280 struct blkio_group_stats *stats;
281
282 spin_lock_irqsave(&blkg->stats_lock, flags);
283 stats = &blkg->stats;
284 if (blkio_blkg_idling(stats)) {
285 now = sched_clock();
286 if (time_after64(now, stats->start_idle_time))
287 stats->idle_time += now - stats->start_idle_time;
288 blkio_clear_blkg_idling(stats);
289 }
290 spin_unlock_irqrestore(&blkg->stats_lock, flags);
291}
292EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);
242 293
243 lockdep_assert_held(blkg->q->queue_lock); 294void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
244 lockdep_assert_held(&blkcg->lock); 295{
296 unsigned long flags;
297 struct blkio_group_stats *stats;
298
299 spin_lock_irqsave(&blkg->stats_lock, flags);
300 stats = &blkg->stats;
301 stats->avg_queue_size_sum +=
302 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
303 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
304 stats->avg_queue_size_samples++;
305 blkio_update_group_wait_time(stats);
306 spin_unlock_irqrestore(&blkg->stats_lock, flags);
307}
308EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);
245 309
246 /* Something wrong if we are trying to remove same group twice */ 310void blkiocg_set_start_empty_time(struct blkio_group *blkg)
247 WARN_ON_ONCE(list_empty(&blkg->q_node)); 311{
248 WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node)); 312 unsigned long flags;
313 struct blkio_group_stats *stats;
249 314
250 radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); 315 spin_lock_irqsave(&blkg->stats_lock, flags);
251 list_del_init(&blkg->q_node); 316 stats = &blkg->stats;
252 hlist_del_init_rcu(&blkg->blkcg_node);
253 317
254 /* 318 if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
255 * Both setting lookup hint to and clearing it from @blkg are done 319 stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
256 * under queue_lock. If it's not pointing to @blkg now, it never 320 spin_unlock_irqrestore(&blkg->stats_lock, flags);
257 * will. Hint assignment itself can race safely. 321 return;
258 */ 322 }
259 if (rcu_dereference_raw(blkcg->blkg_hint) == blkg)
260 rcu_assign_pointer(blkcg->blkg_hint, NULL);
261 323
262 /* 324 /*
263 * Put the reference taken at the time of creation so that when all 325 * group is already marked empty. This can happen if cfqq got new
264 * queues are gone, group can be destroyed. 326 * request in parent group and moved to this group while being added
327 * to service tree. Just ignore the event and move on.
265 */ 328 */
266 blkg_put(blkg); 329 if(blkio_blkg_empty(stats)) {
330 spin_unlock_irqrestore(&blkg->stats_lock, flags);
331 return;
332 }
333
334 stats->start_empty_time = sched_clock();
335 blkio_mark_blkg_empty(stats);
336 spin_unlock_irqrestore(&blkg->stats_lock, flags);
267} 337}
338EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);
268 339
269/** 340void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
270 * blkg_destroy_all - destroy all blkgs associated with a request_queue 341 unsigned long dequeue)
271 * @q: request_queue of interest 342{
272 * 343 blkg->stats.dequeue += dequeue;
273 * Destroy all blkgs associated with @q. 344}
274 */ 345EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
275static void blkg_destroy_all(struct request_queue *q) 346#else
347static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
348 struct blkio_group *curr_blkg) {}
349static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {}
350#endif
351
352void blkiocg_update_io_add_stats(struct blkio_group *blkg,
353 struct blkio_group *curr_blkg, bool direction,
354 bool sync)
276{ 355{
277 struct blkcg_gq *blkg, *n; 356 unsigned long flags;
357
358 spin_lock_irqsave(&blkg->stats_lock, flags);
359 blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
360 sync);
361 blkio_end_empty_time(&blkg->stats);
362 blkio_set_start_group_wait_time(blkg, curr_blkg);
363 spin_unlock_irqrestore(&blkg->stats_lock, flags);
364}
365EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);
278 366
279 lockdep_assert_held(q->queue_lock); 367void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
368 bool direction, bool sync)
369{
370 unsigned long flags;
280 371
281 list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { 372 spin_lock_irqsave(&blkg->stats_lock, flags);
282 struct blkcg *blkcg = blkg->blkcg; 373 blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED],
374 direction, sync);
375 spin_unlock_irqrestore(&blkg->stats_lock, flags);
376}
377EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
283 378
284 spin_lock(&blkcg->lock); 379void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time,
285 blkg_destroy(blkg); 380 unsigned long unaccounted_time)
286 spin_unlock(&blkcg->lock); 381{
287 } 382 unsigned long flags;
383
384 spin_lock_irqsave(&blkg->stats_lock, flags);
385 blkg->stats.time += time;
386#ifdef CONFIG_DEBUG_BLK_CGROUP
387 blkg->stats.unaccounted_time += unaccounted_time;
388#endif
389 spin_unlock_irqrestore(&blkg->stats_lock, flags);
390}
391EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
392
393/*
394 * should be called under rcu read lock or queue lock to make sure blkg pointer
395 * is valid.
396 */
397void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
398 uint64_t bytes, bool direction, bool sync)
399{
400 struct blkio_group_stats_cpu *stats_cpu;
401 unsigned long flags;
288 402
289 /* 403 /*
290 * root blkg is destroyed. Just clear the pointer since 404 * Disabling interrupts to provide mutual exclusion between two
291 * root_rl does not take reference on root blkg. 405 * writes on same cpu. It probably is not needed for 64bit. Not
406 * optimizing that case yet.
292 */ 407 */
293 q->root_blkg = NULL; 408 local_irq_save(flags);
294 q->root_rl.blkg = NULL; 409
410 stats_cpu = this_cpu_ptr(blkg->stats_cpu);
411
412 u64_stats_update_begin(&stats_cpu->syncp);
413 stats_cpu->sectors += bytes >> 9;
414 blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED],
415 1, direction, sync);
416 blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES],
417 bytes, direction, sync);
418 u64_stats_update_end(&stats_cpu->syncp);
419 local_irq_restore(flags);
295} 420}
421EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
296 422
297static void blkg_rcu_free(struct rcu_head *rcu_head) 423void blkiocg_update_completion_stats(struct blkio_group *blkg,
424 uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
298{ 425{
299 blkg_free(container_of(rcu_head, struct blkcg_gq, rcu_head)); 426 struct blkio_group_stats *stats;
427 unsigned long flags;
428 unsigned long long now = sched_clock();
429
430 spin_lock_irqsave(&blkg->stats_lock, flags);
431 stats = &blkg->stats;
432 if (time_after64(now, io_start_time))
433 blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
434 now - io_start_time, direction, sync);
435 if (time_after64(io_start_time, start_time))
436 blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
437 io_start_time - start_time, direction, sync);
438 spin_unlock_irqrestore(&blkg->stats_lock, flags);
300} 439}
440EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
301 441
302void __blkg_release(struct blkcg_gq *blkg) 442/* Merged stats are per cpu. */
443void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
444 bool sync)
303{ 445{
304 /* release the extra blkcg reference this blkg has been holding */ 446 struct blkio_group_stats_cpu *stats_cpu;
305 css_put(&blkg->blkcg->css); 447 unsigned long flags;
306 448
307 /* 449 /*
308 * A group is freed in rcu manner. But having an rcu lock does not 450 * Disabling interrupts to provide mutual exclusion between two
309 * mean that one can access all the fields of blkg and assume these 451 * writes on same cpu. It probably is not needed for 64bit. Not
310 * are valid. For example, don't try to follow throtl_data and 452 * optimizing that case yet.
311 * request queue links.
312 *
313 * Having a reference to blkg under an rcu allows acess to only
314 * values local to groups like group stats and group rate limits
315 */ 453 */
316 call_rcu(&blkg->rcu_head, blkg_rcu_free); 454 local_irq_save(flags);
455
456 stats_cpu = this_cpu_ptr(blkg->stats_cpu);
457
458 u64_stats_update_begin(&stats_cpu->syncp);
459 blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_MERGED], 1,
460 direction, sync);
461 u64_stats_update_end(&stats_cpu->syncp);
462 local_irq_restore(flags);
317} 463}
318EXPORT_SYMBOL_GPL(__blkg_release); 464EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
319 465
320/* 466/*
321 * The next function used by blk_queue_for_each_rl(). It's a bit tricky 467 * This function allocates the per cpu stats for blkio_group. Should be called
322 * because the root blkg uses @q->root_rl instead of its own rl. 468 * from sleepable context as alloc_per_cpu() requires that.
323 */ 469 */
324struct request_list *__blk_queue_next_rl(struct request_list *rl, 470int blkio_alloc_blkg_stats(struct blkio_group *blkg)
325 struct request_queue *q)
326{ 471{
327 struct list_head *ent; 472 /* Allocate memory for per cpu stats */
328 struct blkcg_gq *blkg; 473 blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
474 if (!blkg->stats_cpu)
475 return -ENOMEM;
476 return 0;
477}
478EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats);
329 479
330 /* 480void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
331 * Determine the current blkg list_head. The first entry is 481 struct blkio_group *blkg, void *key, dev_t dev,
332 * root_rl which is off @q->blkg_list and mapped to the head. 482 enum blkio_policy_id plid)
333 */ 483{
334 if (rl == &q->root_rl) { 484 unsigned long flags;
335 ent = &q->blkg_list; 485
336 /* There are no more block groups, hence no request lists */ 486 spin_lock_irqsave(&blkcg->lock, flags);
337 if (list_empty(ent)) 487 spin_lock_init(&blkg->stats_lock);
338 return NULL; 488 rcu_assign_pointer(blkg->key, key);
339 } else { 489 blkg->blkcg_id = css_id(&blkcg->css);
340 blkg = container_of(rl, struct blkcg_gq, rl); 490 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
341 ent = &blkg->q_node; 491 blkg->plid = plid;
342 } 492 spin_unlock_irqrestore(&blkcg->lock, flags);
493 /* Need to take css reference ? */
494 cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
495 blkg->dev = dev;
496}
497EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);
498
499static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
500{
501 hlist_del_init_rcu(&blkg->blkcg_node);
502 blkg->blkcg_id = 0;
503}
343 504
344 /* walk to the next list_head, skip root blkcg */ 505/*
345 ent = ent->next; 506 * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1
346 if (ent == &q->root_blkg->q_node) 507 * indicating that blk_group was unhashed by the time we got to it.
347 ent = ent->next; 508 */
348 if (ent == &q->blkg_list) 509int blkiocg_del_blkio_group(struct blkio_group *blkg)
349 return NULL; 510{
511 struct blkio_cgroup *blkcg;
512 unsigned long flags;
513 struct cgroup_subsys_state *css;
514 int ret = 1;
515
516 rcu_read_lock();
517 css = css_lookup(&blkio_subsys, blkg->blkcg_id);
518 if (css) {
519 blkcg = container_of(css, struct blkio_cgroup, css);
520 spin_lock_irqsave(&blkcg->lock, flags);
521 if (!hlist_unhashed(&blkg->blkcg_node)) {
522 __blkiocg_del_blkio_group(blkg);
523 ret = 0;
524 }
525 spin_unlock_irqrestore(&blkcg->lock, flags);
526 }
350 527
351 blkg = container_of(ent, struct blkcg_gq, q_node); 528 rcu_read_unlock();
352 return &blkg->rl; 529 return ret;
353} 530}
531EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);
354 532
355static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, 533/* called under rcu_read_lock(). */
356 u64 val) 534struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
357{ 535{
358 struct blkcg *blkcg = cgroup_to_blkcg(cgroup); 536 struct blkio_group *blkg;
359 struct blkcg_gq *blkg;
360 struct hlist_node *n; 537 struct hlist_node *n;
361 int i; 538 void *__key;
362 539
363 mutex_lock(&blkcg_pol_mutex); 540 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
364 spin_lock_irq(&blkcg->lock); 541 __key = blkg->key;
542 if (__key == key)
543 return blkg;
544 }
545
546 return NULL;
547}
548EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
365 549
550static void blkio_reset_stats_cpu(struct blkio_group *blkg)
551{
552 struct blkio_group_stats_cpu *stats_cpu;
553 int i, j, k;
366 /* 554 /*
367 * Note that stat reset is racy - it doesn't synchronize against 555 * Note: On 64 bit arch this should not be an issue. This has the
368 * stat updates. This is a debug feature which shouldn't exist 556 * possibility of returning some inconsistent value on 32bit arch
369 * anyway. If you get hit by a race, retry. 557 * as 64bit update on 32bit is non atomic. Taking care of this
558 * corner case makes code very complicated, like sending IPIs to
559 * cpus, taking care of stats of offline cpus etc.
560 *
561 * reset stats is anyway more of a debug feature and this sounds a
562 * corner case. So I am not complicating the code yet until and
563 * unless this becomes a real issue.
370 */ 564 */
371 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { 565 for_each_possible_cpu(i) {
372 for (i = 0; i < BLKCG_MAX_POLS; i++) { 566 stats_cpu = per_cpu_ptr(blkg->stats_cpu, i);
373 struct blkcg_policy *pol = blkcg_policy[i]; 567 stats_cpu->sectors = 0;
568 for(j = 0; j < BLKIO_STAT_CPU_NR; j++)
569 for (k = 0; k < BLKIO_STAT_TOTAL; k++)
570 stats_cpu->stat_arr_cpu[j][k] = 0;
571 }
572}
374 573
375 if (blkcg_policy_enabled(blkg->q, pol) && 574static int
376 pol->pd_reset_stats_fn) 575blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
377 pol->pd_reset_stats_fn(blkg); 576{
577 struct blkio_cgroup *blkcg;
578 struct blkio_group *blkg;
579 struct blkio_group_stats *stats;
580 struct hlist_node *n;
581 uint64_t queued[BLKIO_STAT_TOTAL];
582 int i;
583#ifdef CONFIG_DEBUG_BLK_CGROUP
584 bool idling, waiting, empty;
585 unsigned long long now = sched_clock();
586#endif
587
588 blkcg = cgroup_to_blkio_cgroup(cgroup);
589 spin_lock_irq(&blkcg->lock);
590 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
591 spin_lock(&blkg->stats_lock);
592 stats = &blkg->stats;
593#ifdef CONFIG_DEBUG_BLK_CGROUP
594 idling = blkio_blkg_idling(stats);
595 waiting = blkio_blkg_waiting(stats);
596 empty = blkio_blkg_empty(stats);
597#endif
598 for (i = 0; i < BLKIO_STAT_TOTAL; i++)
599 queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
600 memset(stats, 0, sizeof(struct blkio_group_stats));
601 for (i = 0; i < BLKIO_STAT_TOTAL; i++)
602 stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
603#ifdef CONFIG_DEBUG_BLK_CGROUP
604 if (idling) {
605 blkio_mark_blkg_idling(stats);
606 stats->start_idle_time = now;
607 }
608 if (waiting) {
609 blkio_mark_blkg_waiting(stats);
610 stats->start_group_wait_time = now;
378 } 611 }
612 if (empty) {
613 blkio_mark_blkg_empty(stats);
614 stats->start_empty_time = now;
615 }
616#endif
617 spin_unlock(&blkg->stats_lock);
618
619 /* Reset Per cpu stats which don't take blkg->stats_lock */
620 blkio_reset_stats_cpu(blkg);
379 } 621 }
380 622
381 spin_unlock_irq(&blkcg->lock); 623 spin_unlock_irq(&blkcg->lock);
382 mutex_unlock(&blkcg_pol_mutex);
383 return 0; 624 return 0;
384} 625}
385 626
386static const char *blkg_dev_name(struct blkcg_gq *blkg) 627static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str,
628 int chars_left, bool diskname_only)
387{ 629{
388 /* some drivers (floppy) instantiate a queue w/o disk registered */ 630 snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev));
389 if (blkg->q->backing_dev_info.dev) 631 chars_left -= strlen(str);
390 return dev_name(blkg->q->backing_dev_info.dev); 632 if (chars_left <= 0) {
391 return NULL; 633 printk(KERN_WARNING
634 "Possibly incorrect cgroup stat display format");
635 return;
636 }
637 if (diskname_only)
638 return;
639 switch (type) {
640 case BLKIO_STAT_READ:
641 strlcat(str, " Read", chars_left);
642 break;
643 case BLKIO_STAT_WRITE:
644 strlcat(str, " Write", chars_left);
645 break;
646 case BLKIO_STAT_SYNC:
647 strlcat(str, " Sync", chars_left);
648 break;
649 case BLKIO_STAT_ASYNC:
650 strlcat(str, " Async", chars_left);
651 break;
652 case BLKIO_STAT_TOTAL:
653 strlcat(str, " Total", chars_left);
654 break;
655 default:
656 strlcat(str, " Invalid", chars_left);
657 }
392} 658}
393 659
394/** 660static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
395 * blkcg_print_blkgs - helper for printing per-blkg data 661 struct cgroup_map_cb *cb, dev_t dev)
396 * @sf: seq_file to print to
397 * @blkcg: blkcg of interest
398 * @prfill: fill function to print out a blkg
399 * @pol: policy in question
400 * @data: data to be passed to @prfill
401 * @show_total: to print out sum of prfill return values or not
402 *
403 * This function invokes @prfill on each blkg of @blkcg if pd for the
404 * policy specified by @pol exists. @prfill is invoked with @sf, the
405 * policy data and @data. If @show_total is %true, the sum of the return
406 * values from @prfill is printed with "Total" label at the end.
407 *
408 * This is to be used to construct print functions for
409 * cftype->read_seq_string method.
410 */
411void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
412 u64 (*prfill)(struct seq_file *,
413 struct blkg_policy_data *, int),
414 const struct blkcg_policy *pol, int data,
415 bool show_total)
416{ 662{
417 struct blkcg_gq *blkg; 663 blkio_get_key_name(0, dev, str, chars_left, true);
418 struct hlist_node *n; 664 cb->fill(cb, str, val);
419 u64 total = 0; 665 return val;
666}
420 667
421 spin_lock_irq(&blkcg->lock);
422 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node)
423 if (blkcg_policy_enabled(blkg->q, pol))
424 total += prfill(sf, blkg->pd[pol->plid], data);
425 spin_unlock_irq(&blkcg->lock);
426 668
427 if (show_total) 669static uint64_t blkio_read_stat_cpu(struct blkio_group *blkg,
428 seq_printf(sf, "Total %llu\n", (unsigned long long)total); 670 enum stat_type_cpu type, enum stat_sub_type sub_type)
671{
672 int cpu;
673 struct blkio_group_stats_cpu *stats_cpu;
674 u64 val = 0, tval;
675
676 for_each_possible_cpu(cpu) {
677 unsigned int start;
678 stats_cpu = per_cpu_ptr(blkg->stats_cpu, cpu);
679
680 do {
681 start = u64_stats_fetch_begin(&stats_cpu->syncp);
682 if (type == BLKIO_STAT_CPU_SECTORS)
683 tval = stats_cpu->sectors;
684 else
685 tval = stats_cpu->stat_arr_cpu[type][sub_type];
686 } while(u64_stats_fetch_retry(&stats_cpu->syncp, start));
687
688 val += tval;
689 }
690
691 return val;
429} 692}
430EXPORT_SYMBOL_GPL(blkcg_print_blkgs);
431 693
432/** 694static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg,
433 * __blkg_prfill_u64 - prfill helper for a single u64 value 695 struct cgroup_map_cb *cb, dev_t dev, enum stat_type_cpu type)
434 * @sf: seq_file to print to
435 * @pd: policy private data of interest
436 * @v: value to print
437 *
438 * Print @v to @sf for the device assocaited with @pd.
439 */
440u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
441{ 696{
442 const char *dname = blkg_dev_name(pd->blkg); 697 uint64_t disk_total, val;
698 char key_str[MAX_KEY_LEN];
699 enum stat_sub_type sub_type;
443 700
444 if (!dname) 701 if (type == BLKIO_STAT_CPU_SECTORS) {
445 return 0; 702 val = blkio_read_stat_cpu(blkg, type, 0);
703 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb, dev);
704 }
446 705
447 seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v); 706 for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
448 return v; 707 sub_type++) {
708 blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
709 val = blkio_read_stat_cpu(blkg, type, sub_type);
710 cb->fill(cb, key_str, val);
711 }
712
713 disk_total = blkio_read_stat_cpu(blkg, type, BLKIO_STAT_READ) +
714 blkio_read_stat_cpu(blkg, type, BLKIO_STAT_WRITE);
715
716 blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
717 cb->fill(cb, key_str, disk_total);
718 return disk_total;
449} 719}
450EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
451 720
452/** 721/* This should be called with blkg->stats_lock held */
453 * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat 722static uint64_t blkio_get_stat(struct blkio_group *blkg,
454 * @sf: seq_file to print to 723 struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
455 * @pd: policy private data of interest 724{
456 * @rwstat: rwstat to print 725 uint64_t disk_total;
457 * 726 char key_str[MAX_KEY_LEN];
458 * Print @rwstat to @sf for the device assocaited with @pd. 727 enum stat_sub_type sub_type;
459 */ 728
460u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, 729 if (type == BLKIO_STAT_TIME)
461 const struct blkg_rwstat *rwstat) 730 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
462{ 731 blkg->stats.time, cb, dev);
463 static const char *rwstr[] = { 732#ifdef CONFIG_DEBUG_BLK_CGROUP
464 [BLKG_RWSTAT_READ] = "Read", 733 if (type == BLKIO_STAT_UNACCOUNTED_TIME)
465 [BLKG_RWSTAT_WRITE] = "Write", 734 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
466 [BLKG_RWSTAT_SYNC] = "Sync", 735 blkg->stats.unaccounted_time, cb, dev);
467 [BLKG_RWSTAT_ASYNC] = "Async", 736 if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
468 }; 737 uint64_t sum = blkg->stats.avg_queue_size_sum;
469 const char *dname = blkg_dev_name(pd->blkg); 738 uint64_t samples = blkg->stats.avg_queue_size_samples;
470 u64 v; 739 if (samples)
471 int i; 740 do_div(sum, samples);
741 else
742 sum = 0;
743 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev);
744 }
745 if (type == BLKIO_STAT_GROUP_WAIT_TIME)
746 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
747 blkg->stats.group_wait_time, cb, dev);
748 if (type == BLKIO_STAT_IDLE_TIME)
749 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
750 blkg->stats.idle_time, cb, dev);
751 if (type == BLKIO_STAT_EMPTY_TIME)
752 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
753 blkg->stats.empty_time, cb, dev);
754 if (type == BLKIO_STAT_DEQUEUE)
755 return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
756 blkg->stats.dequeue, cb, dev);
757#endif
758
759 for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
760 sub_type++) {
761 blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
762 cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
763 }
764 disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
765 blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
766 blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
767 cb->fill(cb, key_str, disk_total);
768 return disk_total;
769}
472 770
473 if (!dname) 771static int blkio_check_dev_num(dev_t dev)
474 return 0; 772{
773 int part = 0;
774 struct gendisk *disk;
475 775
476 for (i = 0; i < BLKG_RWSTAT_NR; i++) 776 disk = get_gendisk(dev, &part);
477 seq_printf(sf, "%s %s %llu\n", dname, rwstr[i], 777 if (!disk || part)
478 (unsigned long long)rwstat->cnt[i]); 778 return -ENODEV;
479 779
480 v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE]; 780 return 0;
481 seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
482 return v;
483} 781}
484 782
485/** 783static int blkio_policy_parse_and_set(char *buf,
486 * blkg_prfill_stat - prfill callback for blkg_stat 784 struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid)
487 * @sf: seq_file to print to
488 * @pd: policy private data of interest
489 * @off: offset to the blkg_stat in @pd
490 *
491 * prfill callback for printing a blkg_stat.
492 */
493u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off)
494{ 785{
495 return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off)); 786 char *s[4], *p, *major_s = NULL, *minor_s = NULL;
787 int ret;
788 unsigned long major, minor;
789 int i = 0;
790 dev_t dev;
791 u64 temp;
792
793 memset(s, 0, sizeof(s));
794
795 while ((p = strsep(&buf, " ")) != NULL) {
796 if (!*p)
797 continue;
798
799 s[i++] = p;
800
801 /* Prevent from inputing too many things */
802 if (i == 3)
803 break;
804 }
805
806 if (i != 2)
807 return -EINVAL;
808
809 p = strsep(&s[0], ":");
810 if (p != NULL)
811 major_s = p;
812 else
813 return -EINVAL;
814
815 minor_s = s[0];
816 if (!minor_s)
817 return -EINVAL;
818
819 ret = strict_strtoul(major_s, 10, &major);
820 if (ret)
821 return -EINVAL;
822
823 ret = strict_strtoul(minor_s, 10, &minor);
824 if (ret)
825 return -EINVAL;
826
827 dev = MKDEV(major, minor);
828
829 ret = strict_strtoull(s[1], 10, &temp);
830 if (ret)
831 return -EINVAL;
832
833 /* For rule removal, do not check for device presence. */
834 if (temp) {
835 ret = blkio_check_dev_num(dev);
836 if (ret)
837 return ret;
838 }
839
840 newpn->dev = dev;
841
842 switch (plid) {
843 case BLKIO_POLICY_PROP:
844 if ((temp < BLKIO_WEIGHT_MIN && temp > 0) ||
845 temp > BLKIO_WEIGHT_MAX)
846 return -EINVAL;
847
848 newpn->plid = plid;
849 newpn->fileid = fileid;
850 newpn->val.weight = temp;
851 break;
852 case BLKIO_POLICY_THROTL:
853 switch(fileid) {
854 case BLKIO_THROTL_read_bps_device:
855 case BLKIO_THROTL_write_bps_device:
856 newpn->plid = plid;
857 newpn->fileid = fileid;
858 newpn->val.bps = temp;
859 break;
860 case BLKIO_THROTL_read_iops_device:
861 case BLKIO_THROTL_write_iops_device:
862 if (temp > THROTL_IOPS_MAX)
863 return -EINVAL;
864
865 newpn->plid = plid;
866 newpn->fileid = fileid;
867 newpn->val.iops = (unsigned int)temp;
868 break;
869 }
870 break;
871 default:
872 BUG();
873 }
874
875 return 0;
496} 876}
497EXPORT_SYMBOL_GPL(blkg_prfill_stat);
498 877
499/** 878unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
500 * blkg_prfill_rwstat - prfill callback for blkg_rwstat 879 dev_t dev)
501 * @sf: seq_file to print to
502 * @pd: policy private data of interest
503 * @off: offset to the blkg_rwstat in @pd
504 *
505 * prfill callback for printing a blkg_rwstat.
506 */
507u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
508 int off)
509{ 880{
510 struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off); 881 struct blkio_policy_node *pn;
511 882
512 return __blkg_prfill_rwstat(sf, pd, &rwstat); 883 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP,
884 BLKIO_PROP_weight_device);
885 if (pn)
886 return pn->val.weight;
887 else
888 return blkcg->weight;
513} 889}
514EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); 890EXPORT_SYMBOL_GPL(blkcg_get_weight);
515 891
516/** 892uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
517 * blkg_conf_prep - parse and prepare for per-blkg config update
518 * @blkcg: target block cgroup
519 * @pol: target policy
520 * @input: input string
521 * @ctx: blkg_conf_ctx to be filled
522 *
523 * Parse per-blkg config update from @input and initialize @ctx with the
524 * result. @ctx->blkg points to the blkg to be updated and @ctx->v the new
525 * value. This function returns with RCU read lock and queue lock held and
526 * must be paired with blkg_conf_finish().
527 */
528int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
529 const char *input, struct blkg_conf_ctx *ctx)
530 __acquires(rcu) __acquires(disk->queue->queue_lock)
531{ 893{
532 struct gendisk *disk; 894 struct blkio_policy_node *pn;
533 struct blkcg_gq *blkg;
534 unsigned int major, minor;
535 unsigned long long v;
536 int part, ret;
537 895
538 if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3) 896 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
539 return -EINVAL; 897 BLKIO_THROTL_read_bps_device);
898 if (pn)
899 return pn->val.bps;
900 else
901 return -1;
902}
540 903
541 disk = get_gendisk(MKDEV(major, minor), &part); 904uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev)
542 if (!disk || part) 905{
543 return -EINVAL; 906 struct blkio_policy_node *pn;
907 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
908 BLKIO_THROTL_write_bps_device);
909 if (pn)
910 return pn->val.bps;
911 else
912 return -1;
913}
544 914
545 rcu_read_lock(); 915unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev)
546 spin_lock_irq(disk->queue->queue_lock); 916{
917 struct blkio_policy_node *pn;
547 918
548 if (blkcg_policy_enabled(disk->queue, pol)) 919 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
549 blkg = blkg_lookup_create(blkcg, disk->queue); 920 BLKIO_THROTL_read_iops_device);
921 if (pn)
922 return pn->val.iops;
550 else 923 else
551 blkg = ERR_PTR(-EINVAL); 924 return -1;
925}
552 926
553 if (IS_ERR(blkg)) { 927unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev)
554 ret = PTR_ERR(blkg); 928{
555 rcu_read_unlock(); 929 struct blkio_policy_node *pn;
556 spin_unlock_irq(disk->queue->queue_lock); 930 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
557 put_disk(disk); 931 BLKIO_THROTL_write_iops_device);
558 /* 932 if (pn)
559 * If queue was bypassing, we should retry. Do so after a 933 return pn->val.iops;
560 * short msleep(). It isn't strictly necessary but queue 934 else
561 * can be bypassing for some time and it's always nice to 935 return -1;
562 * avoid busy looping. 936}
563 */ 937
564 if (ret == -EBUSY) { 938/* Checks whether user asked for deleting a policy rule */
565 msleep(10); 939static bool blkio_delete_rule_command(struct blkio_policy_node *pn)
566 ret = restart_syscall(); 940{
941 switch(pn->plid) {
942 case BLKIO_POLICY_PROP:
943 if (pn->val.weight == 0)
944 return 1;
945 break;
946 case BLKIO_POLICY_THROTL:
947 switch(pn->fileid) {
948 case BLKIO_THROTL_read_bps_device:
949 case BLKIO_THROTL_write_bps_device:
950 if (pn->val.bps == 0)
951 return 1;
952 break;
953 case BLKIO_THROTL_read_iops_device:
954 case BLKIO_THROTL_write_iops_device:
955 if (pn->val.iops == 0)
956 return 1;
567 } 957 }
568 return ret; 958 break;
959 default:
960 BUG();
569 } 961 }
570 962
571 ctx->disk = disk;
572 ctx->blkg = blkg;
573 ctx->v = v;
574 return 0; 963 return 0;
575} 964}
576EXPORT_SYMBOL_GPL(blkg_conf_prep);
577 965
578/** 966static void blkio_update_policy_rule(struct blkio_policy_node *oldpn,
579 * blkg_conf_finish - finish up per-blkg config update 967 struct blkio_policy_node *newpn)
580 * @ctx: blkg_conf_ctx intiailized by blkg_conf_prep()
581 *
582 * Finish up after per-blkg config update. This function must be paired
583 * with blkg_conf_prep().
584 */
585void blkg_conf_finish(struct blkg_conf_ctx *ctx)
586 __releases(ctx->disk->queue->queue_lock) __releases(rcu)
587{ 968{
588 spin_unlock_irq(ctx->disk->queue->queue_lock); 969 switch(oldpn->plid) {
589 rcu_read_unlock(); 970 case BLKIO_POLICY_PROP:
590 put_disk(ctx->disk); 971 oldpn->val.weight = newpn->val.weight;
972 break;
973 case BLKIO_POLICY_THROTL:
974 switch(newpn->fileid) {
975 case BLKIO_THROTL_read_bps_device:
976 case BLKIO_THROTL_write_bps_device:
977 oldpn->val.bps = newpn->val.bps;
978 break;
979 case BLKIO_THROTL_read_iops_device:
980 case BLKIO_THROTL_write_iops_device:
981 oldpn->val.iops = newpn->val.iops;
982 }
983 break;
984 default:
985 BUG();
986 }
591} 987}
592EXPORT_SYMBOL_GPL(blkg_conf_finish);
593 988
594struct cftype blkcg_files[] = { 989/*
595 { 990 * Some rules/values in blkg have changed. Propagate those to respective
596 .name = "reset_stats", 991 * policies.
597 .write_u64 = blkcg_reset_stats, 992 */
598 }, 993static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg,
599 { } /* terminate */ 994 struct blkio_group *blkg, struct blkio_policy_node *pn)
600}; 995{
996 unsigned int weight, iops;
997 u64 bps;
998
999 switch(pn->plid) {
1000 case BLKIO_POLICY_PROP:
1001 weight = pn->val.weight ? pn->val.weight :
1002 blkcg->weight;
1003 blkio_update_group_weight(blkg, weight);
1004 break;
1005 case BLKIO_POLICY_THROTL:
1006 switch(pn->fileid) {
1007 case BLKIO_THROTL_read_bps_device:
1008 case BLKIO_THROTL_write_bps_device:
1009 bps = pn->val.bps ? pn->val.bps : (-1);
1010 blkio_update_group_bps(blkg, bps, pn->fileid);
1011 break;
1012 case BLKIO_THROTL_read_iops_device:
1013 case BLKIO_THROTL_write_iops_device:
1014 iops = pn->val.iops ? pn->val.iops : (-1);
1015 blkio_update_group_iops(blkg, iops, pn->fileid);
1016 break;
1017 }
1018 break;
1019 default:
1020 BUG();
1021 }
1022}
601 1023
602/** 1024/*
603 * blkcg_css_offline - cgroup css_offline callback 1025 * A policy node rule has been updated. Propagate this update to all the
604 * @cgroup: cgroup of interest 1026 * block groups which might be affected by this update.
605 *
606 * This function is called when @cgroup is about to go away and responsible
607 * for shooting down all blkgs associated with @cgroup. blkgs should be
608 * removed while holding both q and blkcg locks. As blkcg lock is nested
609 * inside q lock, this function performs reverse double lock dancing.
610 *
611 * This is the blkcg counterpart of ioc_release_fn().
612 */ 1027 */
613static void blkcg_css_offline(struct cgroup *cgroup) 1028static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg,
1029 struct blkio_policy_node *pn)
614{ 1030{
615 struct blkcg *blkcg = cgroup_to_blkcg(cgroup); 1031 struct blkio_group *blkg;
1032 struct hlist_node *n;
616 1033
1034 spin_lock(&blkio_list_lock);
617 spin_lock_irq(&blkcg->lock); 1035 spin_lock_irq(&blkcg->lock);
618 1036
619 while (!hlist_empty(&blkcg->blkg_list)) { 1037 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
620 struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first, 1038 if (pn->dev != blkg->dev || pn->plid != blkg->plid)
621 struct blkcg_gq, blkcg_node); 1039 continue;
622 struct request_queue *q = blkg->q; 1040 blkio_update_blkg_policy(blkcg, blkg, pn);
623
624 if (spin_trylock(q->queue_lock)) {
625 blkg_destroy(blkg);
626 spin_unlock(q->queue_lock);
627 } else {
628 spin_unlock_irq(&blkcg->lock);
629 cpu_relax();
630 spin_lock_irq(&blkcg->lock);
631 }
632 } 1041 }
633 1042
634 spin_unlock_irq(&blkcg->lock); 1043 spin_unlock_irq(&blkcg->lock);
1044 spin_unlock(&blkio_list_lock);
635} 1045}
636 1046
637static void blkcg_css_free(struct cgroup *cgroup) 1047static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
1048 const char *buffer)
638{ 1049{
639 struct blkcg *blkcg = cgroup_to_blkcg(cgroup); 1050 int ret = 0;
1051 char *buf;
1052 struct blkio_policy_node *newpn, *pn;
1053 struct blkio_cgroup *blkcg;
1054 int keep_newpn = 0;
1055 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1056 int fileid = BLKIOFILE_ATTR(cft->private);
1057
1058 buf = kstrdup(buffer, GFP_KERNEL);
1059 if (!buf)
1060 return -ENOMEM;
640 1061
641 if (blkcg != &blkcg_root) 1062 newpn = kzalloc(sizeof(*newpn), GFP_KERNEL);
642 kfree(blkcg); 1063 if (!newpn) {
643} 1064 ret = -ENOMEM;
1065 goto free_buf;
1066 }
644 1067
645static struct cgroup_subsys_state *blkcg_css_alloc(struct cgroup *cgroup) 1068 ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid);
646{ 1069 if (ret)
647 static atomic64_t id_seq = ATOMIC64_INIT(0); 1070 goto free_newpn;
648 struct blkcg *blkcg;
649 struct cgroup *parent = cgroup->parent;
650 1071
651 if (!parent) { 1072 blkcg = cgroup_to_blkio_cgroup(cgrp);
652 blkcg = &blkcg_root; 1073
653 goto done; 1074 spin_lock_irq(&blkcg->lock);
1075
1076 pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid);
1077 if (!pn) {
1078 if (!blkio_delete_rule_command(newpn)) {
1079 blkio_policy_insert_node(blkcg, newpn);
1080 keep_newpn = 1;
1081 }
1082 spin_unlock_irq(&blkcg->lock);
1083 goto update_io_group;
654 } 1084 }
655 1085
656 blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); 1086 if (blkio_delete_rule_command(newpn)) {
657 if (!blkcg) 1087 blkio_policy_delete_node(pn);
658 return ERR_PTR(-ENOMEM); 1088 spin_unlock_irq(&blkcg->lock);
1089 goto update_io_group;
1090 }
1091 spin_unlock_irq(&blkcg->lock);
659 1092
660 blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT; 1093 blkio_update_policy_rule(pn, newpn);
661 blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
662done:
663 spin_lock_init(&blkcg->lock);
664 INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
665 INIT_HLIST_HEAD(&blkcg->blkg_list);
666 1094
667 return &blkcg->css; 1095update_io_group:
1096 blkio_update_policy_node_blkg(blkcg, newpn);
1097
1098free_newpn:
1099 if (!keep_newpn)
1100 kfree(newpn);
1101free_buf:
1102 kfree(buf);
1103 return ret;
668} 1104}
669 1105
670/** 1106static void
671 * blkcg_init_queue - initialize blkcg part of request queue 1107blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn)
672 * @q: request_queue to initialize
673 *
674 * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
675 * part of new request_queue @q.
676 *
677 * RETURNS:
678 * 0 on success, -errno on failure.
679 */
680int blkcg_init_queue(struct request_queue *q)
681{ 1108{
682 might_sleep(); 1109 switch(pn->plid) {
683 1110 case BLKIO_POLICY_PROP:
684 return blk_throtl_init(q); 1111 if (pn->fileid == BLKIO_PROP_weight_device)
1112 seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
1113 MINOR(pn->dev), pn->val.weight);
1114 break;
1115 case BLKIO_POLICY_THROTL:
1116 switch(pn->fileid) {
1117 case BLKIO_THROTL_read_bps_device:
1118 case BLKIO_THROTL_write_bps_device:
1119 seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev),
1120 MINOR(pn->dev), pn->val.bps);
1121 break;
1122 case BLKIO_THROTL_read_iops_device:
1123 case BLKIO_THROTL_write_iops_device:
1124 seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
1125 MINOR(pn->dev), pn->val.iops);
1126 break;
1127 }
1128 break;
1129 default:
1130 BUG();
1131 }
685} 1132}
686 1133
687/** 1134/* cgroup files which read their data from policy nodes end up here */
688 * blkcg_drain_queue - drain blkcg part of request_queue 1135static void blkio_read_policy_node_files(struct cftype *cft,
689 * @q: request_queue to drain 1136 struct blkio_cgroup *blkcg, struct seq_file *m)
690 *
691 * Called from blk_drain_queue(). Responsible for draining blkcg part.
692 */
693void blkcg_drain_queue(struct request_queue *q)
694{ 1137{
695 lockdep_assert_held(q->queue_lock); 1138 struct blkio_policy_node *pn;
696 1139
697 blk_throtl_drain(q); 1140 if (!list_empty(&blkcg->policy_list)) {
1141 spin_lock_irq(&blkcg->lock);
1142 list_for_each_entry(pn, &blkcg->policy_list, node) {
1143 if (!pn_matches_cftype(cft, pn))
1144 continue;
1145 blkio_print_policy_node(m, pn);
1146 }
1147 spin_unlock_irq(&blkcg->lock);
1148 }
698} 1149}
699 1150
700/** 1151static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
701 * blkcg_exit_queue - exit and release blkcg part of request_queue 1152 struct seq_file *m)
702 * @q: request_queue being released
703 *
704 * Called from blk_release_queue(). Responsible for exiting blkcg part.
705 */
706void blkcg_exit_queue(struct request_queue *q)
707{ 1153{
708 spin_lock_irq(q->queue_lock); 1154 struct blkio_cgroup *blkcg;
709 blkg_destroy_all(q); 1155 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
710 spin_unlock_irq(q->queue_lock); 1156 int name = BLKIOFILE_ATTR(cft->private);
1157
1158 blkcg = cgroup_to_blkio_cgroup(cgrp);
1159
1160 switch(plid) {
1161 case BLKIO_POLICY_PROP:
1162 switch(name) {
1163 case BLKIO_PROP_weight_device:
1164 blkio_read_policy_node_files(cft, blkcg, m);
1165 return 0;
1166 default:
1167 BUG();
1168 }
1169 break;
1170 case BLKIO_POLICY_THROTL:
1171 switch(name){
1172 case BLKIO_THROTL_read_bps_device:
1173 case BLKIO_THROTL_write_bps_device:
1174 case BLKIO_THROTL_read_iops_device:
1175 case BLKIO_THROTL_write_iops_device:
1176 blkio_read_policy_node_files(cft, blkcg, m);
1177 return 0;
1178 default:
1179 BUG();
1180 }
1181 break;
1182 default:
1183 BUG();
1184 }
711 1185
712 blk_throtl_exit(q); 1186 return 0;
713} 1187}
714 1188
715/* 1189static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
716 * We cannot support shared io contexts, as we have no means to support 1190 struct cftype *cft, struct cgroup_map_cb *cb,
717 * two tasks with the same ioc in two different groups without major rework 1191 enum stat_type type, bool show_total, bool pcpu)
718 * of the main cic data structures. For now we allow a task to change
719 * its cgroup only if it's the only owner of its ioc.
720 */
721static int blkcg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
722{ 1192{
723 struct task_struct *task; 1193 struct blkio_group *blkg;
724 struct io_context *ioc; 1194 struct hlist_node *n;
725 int ret = 0; 1195 uint64_t cgroup_total = 0;
726 1196
727 /* task_lock() is needed to avoid races with exit_io_context() */ 1197 rcu_read_lock();
728 cgroup_taskset_for_each(task, cgrp, tset) { 1198 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
729 task_lock(task); 1199 if (blkg->dev) {
730 ioc = task->io_context; 1200 if (!cftype_blkg_same_policy(cft, blkg))
731 if (ioc && atomic_read(&ioc->nr_tasks) > 1) 1201 continue;
732 ret = -EINVAL; 1202 if (pcpu)
733 task_unlock(task); 1203 cgroup_total += blkio_get_stat_cpu(blkg, cb,
734 if (ret) 1204 blkg->dev, type);
735 break; 1205 else {
1206 spin_lock_irq(&blkg->stats_lock);
1207 cgroup_total += blkio_get_stat(blkg, cb,
1208 blkg->dev, type);
1209 spin_unlock_irq(&blkg->stats_lock);
1210 }
1211 }
736 } 1212 }
737 return ret; 1213 if (show_total)
1214 cb->fill(cb, "Total", cgroup_total);
1215 rcu_read_unlock();
1216 return 0;
738} 1217}
739 1218
740struct cgroup_subsys blkio_subsys = { 1219/* All map kind of cgroup file get serviced by this function */
741 .name = "blkio", 1220static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
742 .css_alloc = blkcg_css_alloc, 1221 struct cgroup_map_cb *cb)
743 .css_offline = blkcg_css_offline, 1222{
744 .css_free = blkcg_css_free, 1223 struct blkio_cgroup *blkcg;
745 .can_attach = blkcg_can_attach, 1224 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
746 .subsys_id = blkio_subsys_id, 1225 int name = BLKIOFILE_ATTR(cft->private);
747 .base_cftypes = blkcg_files, 1226
748 .module = THIS_MODULE, 1227 blkcg = cgroup_to_blkio_cgroup(cgrp);
1228
1229 switch(plid) {
1230 case BLKIO_POLICY_PROP:
1231 switch(name) {
1232 case BLKIO_PROP_time:
1233 return blkio_read_blkg_stats(blkcg, cft, cb,
1234 BLKIO_STAT_TIME, 0, 0);
1235 case BLKIO_PROP_sectors:
1236 return blkio_read_blkg_stats(blkcg, cft, cb,
1237 BLKIO_STAT_CPU_SECTORS, 0, 1);
1238 case BLKIO_PROP_io_service_bytes:
1239 return blkio_read_blkg_stats(blkcg, cft, cb,
1240 BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
1241 case BLKIO_PROP_io_serviced:
1242 return blkio_read_blkg_stats(blkcg, cft, cb,
1243 BLKIO_STAT_CPU_SERVICED, 1, 1);
1244 case BLKIO_PROP_io_service_time:
1245 return blkio_read_blkg_stats(blkcg, cft, cb,
1246 BLKIO_STAT_SERVICE_TIME, 1, 0);
1247 case BLKIO_PROP_io_wait_time:
1248 return blkio_read_blkg_stats(blkcg, cft, cb,
1249 BLKIO_STAT_WAIT_TIME, 1, 0);
1250 case BLKIO_PROP_io_merged:
1251 return blkio_read_blkg_stats(blkcg, cft, cb,
1252 BLKIO_STAT_CPU_MERGED, 1, 1);
1253 case BLKIO_PROP_io_queued:
1254 return blkio_read_blkg_stats(blkcg, cft, cb,
1255 BLKIO_STAT_QUEUED, 1, 0);
1256#ifdef CONFIG_DEBUG_BLK_CGROUP
1257 case BLKIO_PROP_unaccounted_time:
1258 return blkio_read_blkg_stats(blkcg, cft, cb,
1259 BLKIO_STAT_UNACCOUNTED_TIME, 0, 0);
1260 case BLKIO_PROP_dequeue:
1261 return blkio_read_blkg_stats(blkcg, cft, cb,
1262 BLKIO_STAT_DEQUEUE, 0, 0);
1263 case BLKIO_PROP_avg_queue_size:
1264 return blkio_read_blkg_stats(blkcg, cft, cb,
1265 BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0);
1266 case BLKIO_PROP_group_wait_time:
1267 return blkio_read_blkg_stats(blkcg, cft, cb,
1268 BLKIO_STAT_GROUP_WAIT_TIME, 0, 0);
1269 case BLKIO_PROP_idle_time:
1270 return blkio_read_blkg_stats(blkcg, cft, cb,
1271 BLKIO_STAT_IDLE_TIME, 0, 0);
1272 case BLKIO_PROP_empty_time:
1273 return blkio_read_blkg_stats(blkcg, cft, cb,
1274 BLKIO_STAT_EMPTY_TIME, 0, 0);
1275#endif
1276 default:
1277 BUG();
1278 }
1279 break;
1280 case BLKIO_POLICY_THROTL:
1281 switch(name){
1282 case BLKIO_THROTL_io_service_bytes:
1283 return blkio_read_blkg_stats(blkcg, cft, cb,
1284 BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1);
1285 case BLKIO_THROTL_io_serviced:
1286 return blkio_read_blkg_stats(blkcg, cft, cb,
1287 BLKIO_STAT_CPU_SERVICED, 1, 1);
1288 default:
1289 BUG();
1290 }
1291 break;
1292 default:
1293 BUG();
1294 }
749 1295
750 /* 1296 return 0;
751 * blkio subsystem is utterly broken in terms of hierarchy support. 1297}
752 * It treats all cgroups equally regardless of where they're
753 * located in the hierarchy - all cgroups are treated as if they're
754 * right below the root. Fix it and remove the following.
755 */
756 .broken_hierarchy = true,
757};
758EXPORT_SYMBOL_GPL(blkio_subsys);
759 1298
760/** 1299static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val)
761 * blkcg_activate_policy - activate a blkcg policy on a request_queue
762 * @q: request_queue of interest
763 * @pol: blkcg policy to activate
764 *
765 * Activate @pol on @q. Requires %GFP_KERNEL context. @q goes through
766 * bypass mode to populate its blkgs with policy_data for @pol.
767 *
768 * Activation happens with @q bypassed, so nobody would be accessing blkgs
769 * from IO path. Update of each blkg is protected by both queue and blkcg
770 * locks so that holding either lock and testing blkcg_policy_enabled() is
771 * always enough for dereferencing policy data.
772 *
773 * The caller is responsible for synchronizing [de]activations and policy
774 * [un]registerations. Returns 0 on success, -errno on failure.
775 */
776int blkcg_activate_policy(struct request_queue *q,
777 const struct blkcg_policy *pol)
778{ 1300{
779 LIST_HEAD(pds); 1301 struct blkio_group *blkg;
780 struct blkcg_gq *blkg; 1302 struct hlist_node *n;
781 struct blkg_policy_data *pd, *n; 1303 struct blkio_policy_node *pn;
782 int cnt = 0, ret;
783 bool preloaded;
784
785 if (blkcg_policy_enabled(q, pol))
786 return 0;
787 1304
788 /* preallocations for root blkg */ 1305 if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
789 blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); 1306 return -EINVAL;
790 if (!blkg)
791 return -ENOMEM;
792 1307
793 preloaded = !radix_tree_preload(GFP_KERNEL); 1308 spin_lock(&blkio_list_lock);
1309 spin_lock_irq(&blkcg->lock);
1310 blkcg->weight = (unsigned int)val;
794 1311
795 blk_queue_bypass_start(q); 1312 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
1313 pn = blkio_policy_search_node(blkcg, blkg->dev,
1314 BLKIO_POLICY_PROP, BLKIO_PROP_weight_device);
1315 if (pn)
1316 continue;
796 1317
797 /* make sure the root blkg exists and count the existing blkgs */ 1318 blkio_update_group_weight(blkg, blkcg->weight);
798 spin_lock_irq(q->queue_lock); 1319 }
1320 spin_unlock_irq(&blkcg->lock);
1321 spin_unlock(&blkio_list_lock);
1322 return 0;
1323}
799 1324
800 rcu_read_lock(); 1325static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
801 blkg = __blkg_lookup_create(&blkcg_root, q, blkg); 1326 struct blkio_cgroup *blkcg;
802 rcu_read_unlock(); 1327 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1328 int name = BLKIOFILE_ATTR(cft->private);
803 1329
804 if (preloaded) 1330 blkcg = cgroup_to_blkio_cgroup(cgrp);
805 radix_tree_preload_end();
806 1331
807 if (IS_ERR(blkg)) { 1332 switch(plid) {
808 ret = PTR_ERR(blkg); 1333 case BLKIO_POLICY_PROP:
809 goto out_unlock; 1334 switch(name) {
1335 case BLKIO_PROP_weight:
1336 return (u64)blkcg->weight;
1337 }
1338 break;
1339 default:
1340 BUG();
810 } 1341 }
811 q->root_blkg = blkg; 1342 return 0;
812 q->root_rl.blkg = blkg; 1343}
813 1344
814 list_for_each_entry(blkg, &q->blkg_list, q_node) 1345static int
815 cnt++; 1346blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1347{
1348 struct blkio_cgroup *blkcg;
1349 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1350 int name = BLKIOFILE_ATTR(cft->private);
816 1351
817 spin_unlock_irq(q->queue_lock); 1352 blkcg = cgroup_to_blkio_cgroup(cgrp);
818 1353
819 /* allocate policy_data for all existing blkgs */ 1354 switch(plid) {
820 while (cnt--) { 1355 case BLKIO_POLICY_PROP:
821 pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node); 1356 switch(name) {
822 if (!pd) { 1357 case BLKIO_PROP_weight:
823 ret = -ENOMEM; 1358 return blkio_weight_write(blkcg, val);
824 goto out_free;
825 } 1359 }
826 list_add_tail(&pd->alloc_node, &pds); 1360 break;
1361 default:
1362 BUG();
827 } 1363 }
828 1364
829 /* 1365 return 0;
830 * Install the allocated pds. With @q bypassing, no new blkg 1366}
831 * should have been created while the queue lock was dropped.
832 */
833 spin_lock_irq(q->queue_lock);
834 1367
835 list_for_each_entry(blkg, &q->blkg_list, q_node) { 1368struct cftype blkio_files[] = {
836 if (WARN_ON(list_empty(&pds))) { 1369 {
837 /* umm... this shouldn't happen, just abort */ 1370 .name = "weight_device",
838 ret = -ENOMEM; 1371 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
839 goto out_unlock; 1372 BLKIO_PROP_weight_device),
840 } 1373 .read_seq_string = blkiocg_file_read,
841 pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node); 1374 .write_string = blkiocg_file_write,
842 list_del_init(&pd->alloc_node); 1375 .max_write_len = 256,
1376 },
1377 {
1378 .name = "weight",
1379 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1380 BLKIO_PROP_weight),
1381 .read_u64 = blkiocg_file_read_u64,
1382 .write_u64 = blkiocg_file_write_u64,
1383 },
1384 {
1385 .name = "time",
1386 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1387 BLKIO_PROP_time),
1388 .read_map = blkiocg_file_read_map,
1389 },
1390 {
1391 .name = "sectors",
1392 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1393 BLKIO_PROP_sectors),
1394 .read_map = blkiocg_file_read_map,
1395 },
1396 {
1397 .name = "io_service_bytes",
1398 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1399 BLKIO_PROP_io_service_bytes),
1400 .read_map = blkiocg_file_read_map,
1401 },
1402 {
1403 .name = "io_serviced",
1404 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1405 BLKIO_PROP_io_serviced),
1406 .read_map = blkiocg_file_read_map,
1407 },
1408 {
1409 .name = "io_service_time",
1410 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1411 BLKIO_PROP_io_service_time),
1412 .read_map = blkiocg_file_read_map,
1413 },
1414 {
1415 .name = "io_wait_time",
1416 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1417 BLKIO_PROP_io_wait_time),
1418 .read_map = blkiocg_file_read_map,
1419 },
1420 {
1421 .name = "io_merged",
1422 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1423 BLKIO_PROP_io_merged),
1424 .read_map = blkiocg_file_read_map,
1425 },
1426 {
1427 .name = "io_queued",
1428 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1429 BLKIO_PROP_io_queued),
1430 .read_map = blkiocg_file_read_map,
1431 },
1432 {
1433 .name = "reset_stats",
1434 .write_u64 = blkiocg_reset_stats,
1435 },
1436#ifdef CONFIG_BLK_DEV_THROTTLING
1437 {
1438 .name = "throttle.read_bps_device",
1439 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1440 BLKIO_THROTL_read_bps_device),
1441 .read_seq_string = blkiocg_file_read,
1442 .write_string = blkiocg_file_write,
1443 .max_write_len = 256,
1444 },
843 1445
844 /* grab blkcg lock too while installing @pd on @blkg */ 1446 {
845 spin_lock(&blkg->blkcg->lock); 1447 .name = "throttle.write_bps_device",
1448 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1449 BLKIO_THROTL_write_bps_device),
1450 .read_seq_string = blkiocg_file_read,
1451 .write_string = blkiocg_file_write,
1452 .max_write_len = 256,
1453 },
846 1454
847 blkg->pd[pol->plid] = pd; 1455 {
848 pd->blkg = blkg; 1456 .name = "throttle.read_iops_device",
849 pol->pd_init_fn(blkg); 1457 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1458 BLKIO_THROTL_read_iops_device),
1459 .read_seq_string = blkiocg_file_read,
1460 .write_string = blkiocg_file_write,
1461 .max_write_len = 256,
1462 },
850 1463
851 spin_unlock(&blkg->blkcg->lock); 1464 {
852 } 1465 .name = "throttle.write_iops_device",
1466 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1467 BLKIO_THROTL_write_iops_device),
1468 .read_seq_string = blkiocg_file_read,
1469 .write_string = blkiocg_file_write,
1470 .max_write_len = 256,
1471 },
1472 {
1473 .name = "throttle.io_service_bytes",
1474 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1475 BLKIO_THROTL_io_service_bytes),
1476 .read_map = blkiocg_file_read_map,
1477 },
1478 {
1479 .name = "throttle.io_serviced",
1480 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1481 BLKIO_THROTL_io_serviced),
1482 .read_map = blkiocg_file_read_map,
1483 },
1484#endif /* CONFIG_BLK_DEV_THROTTLING */
853 1485
854 __set_bit(pol->plid, q->blkcg_pols); 1486#ifdef CONFIG_DEBUG_BLK_CGROUP
855 ret = 0; 1487 {
856out_unlock: 1488 .name = "avg_queue_size",
857 spin_unlock_irq(q->queue_lock); 1489 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
858out_free: 1490 BLKIO_PROP_avg_queue_size),
859 blk_queue_bypass_end(q); 1491 .read_map = blkiocg_file_read_map,
860 list_for_each_entry_safe(pd, n, &pds, alloc_node) 1492 },
861 kfree(pd); 1493 {
862 return ret; 1494 .name = "group_wait_time",
1495 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1496 BLKIO_PROP_group_wait_time),
1497 .read_map = blkiocg_file_read_map,
1498 },
1499 {
1500 .name = "idle_time",
1501 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1502 BLKIO_PROP_idle_time),
1503 .read_map = blkiocg_file_read_map,
1504 },
1505 {
1506 .name = "empty_time",
1507 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1508 BLKIO_PROP_empty_time),
1509 .read_map = blkiocg_file_read_map,
1510 },
1511 {
1512 .name = "dequeue",
1513 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1514 BLKIO_PROP_dequeue),
1515 .read_map = blkiocg_file_read_map,
1516 },
1517 {
1518 .name = "unaccounted_time",
1519 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1520 BLKIO_PROP_unaccounted_time),
1521 .read_map = blkiocg_file_read_map,
1522 },
1523#endif
1524};
1525
1526static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
1527{
1528 return cgroup_add_files(cgroup, subsys, blkio_files,
1529 ARRAY_SIZE(blkio_files));
863} 1530}
864EXPORT_SYMBOL_GPL(blkcg_activate_policy);
865 1531
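For comparison, the interface being removed in the left-hand column expects a policy to describe itself with a struct blkcg_policy and to enable itself per-queue via blkcg_activate_policy(); in mainline, cfq-iosched and blk-throttle fill these hooks the same way. The fragment below is only a hedged sketch of that calling convention: the "example_" names are hypothetical and the snippet assumes the surrounding kernel and blk-cgroup.h context rather than being a standalone module.

/* Hedged sketch of the removed API's calling convention; all "example_"
 * identifiers are hypothetical. */
static struct blkcg_policy example_policy;

struct example_pd {
	struct blkg_policy_data pd;	/* must be first; pd_size >= sizeof(pd) */
	u64 nr_dispatched;
};

static void example_pd_init(struct blkcg_gq *blkg)
{
	struct example_pd *epd = container_of(blkg_to_pd(blkg, &example_policy),
					      struct example_pd, pd);

	epd->nr_dispatched = 0;
}

static struct blkcg_policy example_policy = {
	.pd_size	= sizeof(struct example_pd),
	.pd_init_fn	= example_pd_init,
};

/* e.g. called from the policy's module init with a queue from the driver */
static int example_enable(struct request_queue *q)
{
	int ret = blkcg_policy_register(&example_policy);

	if (ret)
		return ret;
	return blkcg_activate_policy(q, &example_policy);
}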
866/** 1532static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
867 * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue
868 * @q: request_queue of interest
869 * @pol: blkcg policy to deactivate
870 *
871 * Deactivate @pol on @q. Follows the same synchronization rules as
872 * blkcg_activate_policy().
873 */
874void blkcg_deactivate_policy(struct request_queue *q,
875 const struct blkcg_policy *pol)
876{ 1533{
877 struct blkcg_gq *blkg; 1534 struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
1535 unsigned long flags;
1536 struct blkio_group *blkg;
1537 void *key;
1538 struct blkio_policy_type *blkiop;
1539 struct blkio_policy_node *pn, *pntmp;
878 1540
879 if (!blkcg_policy_enabled(q, pol)) 1541 rcu_read_lock();
880 return; 1542 do {
1543 spin_lock_irqsave(&blkcg->lock, flags);
881 1544
882 blk_queue_bypass_start(q); 1545 if (hlist_empty(&blkcg->blkg_list)) {
883 spin_lock_irq(q->queue_lock); 1546 spin_unlock_irqrestore(&blkcg->lock, flags);
1547 break;
1548 }
884 1549
885 __clear_bit(pol->plid, q->blkcg_pols); 1550 blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
1551 blkcg_node);
1552 key = rcu_dereference(blkg->key);
1553 __blkiocg_del_blkio_group(blkg);
886 1554
887 /* if no policy is left, no need for blkgs - shoot them down */ 1555 spin_unlock_irqrestore(&blkcg->lock, flags);
888 if (bitmap_empty(q->blkcg_pols, BLKCG_MAX_POLS))
889 blkg_destroy_all(q);
890 1556
891 list_for_each_entry(blkg, &q->blkg_list, q_node) { 1557 /*
892 /* grab blkcg lock too while removing @pd from @blkg */ 1558 * This blkio_group is being unlinked as associated cgroup is
893 spin_lock(&blkg->blkcg->lock); 1559 * going away. Let all the IO controlling policies know about
1560 * this event.
1561 */
1562 spin_lock(&blkio_list_lock);
1563 list_for_each_entry(blkiop, &blkio_list, list) {
1564 if (blkiop->plid != blkg->plid)
1565 continue;
1566 blkiop->ops.blkio_unlink_group_fn(key, blkg);
1567 }
1568 spin_unlock(&blkio_list_lock);
1569 } while (1);
894 1570
895 if (pol->pd_exit_fn) 1571 list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) {
896 pol->pd_exit_fn(blkg); 1572 blkio_policy_delete_node(pn);
1573 kfree(pn);
1574 }
897 1575
898 kfree(blkg->pd[pol->plid]); 1576 free_css_id(&blkio_subsys, &blkcg->css);
899 blkg->pd[pol->plid] = NULL; 1577 rcu_read_unlock();
1578 if (blkcg != &blkio_root_cgroup)
1579 kfree(blkcg);
1580}
900 1581
901 spin_unlock(&blkg->blkcg->lock); 1582static struct cgroup_subsys_state *
1583blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
1584{
1585 struct blkio_cgroup *blkcg;
1586 struct cgroup *parent = cgroup->parent;
1587
1588 if (!parent) {
1589 blkcg = &blkio_root_cgroup;
1590 goto done;
902 } 1591 }
903 1592
904 spin_unlock_irq(q->queue_lock); 1593 blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
905 blk_queue_bypass_end(q); 1594 if (!blkcg)
1595 return ERR_PTR(-ENOMEM);
1596
1597 blkcg->weight = BLKIO_WEIGHT_DEFAULT;
1598done:
1599 spin_lock_init(&blkcg->lock);
1600 INIT_HLIST_HEAD(&blkcg->blkg_list);
1601
1602 INIT_LIST_HEAD(&blkcg->policy_list);
1603 return &blkcg->css;
906} 1604}
907EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
908 1605
909/** 1606/*
910 * blkcg_policy_register - register a blkcg policy 1607 * We cannot support shared io contexts, as we have no means to support
911 * @pol: blkcg policy to register 1608 * two tasks with the same ioc in two different groups without major rework
912 * 1609 * of the main cic data structures. For now we allow a task to change
913 * Register @pol with blkcg core. Might sleep and @pol may be modified on 1610 * its cgroup only if it's the only owner of its ioc.
914 * successful registration. Returns 0 on success and -errno on failure.
915 */ 1611 */
916int blkcg_policy_register(struct blkcg_policy *pol) 1612static int blkiocg_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
917{ 1613{
918 int i, ret; 1614 struct io_context *ioc;
919 1615 int ret = 0;
920 if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data)))
921 return -EINVAL;
922 1616
923 mutex_lock(&blkcg_pol_mutex); 1617 /* task_lock() is needed to avoid races with exit_io_context() */
1618 task_lock(tsk);
1619 ioc = tsk->io_context;
1620 if (ioc && atomic_read(&ioc->nr_tasks) > 1)
1621 ret = -EINVAL;
1622 task_unlock(tsk);
924 1623
925 /* find an empty slot */
926 ret = -ENOSPC;
927 for (i = 0; i < BLKCG_MAX_POLS; i++)
928 if (!blkcg_policy[i])
929 break;
930 if (i >= BLKCG_MAX_POLS)
931 goto out_unlock;
932
933 /* register and update blkgs */
934 pol->plid = i;
935 blkcg_policy[i] = pol;
936
937 /* everything is in place, add intf files for the new policy */
938 if (pol->cftypes)
939 WARN_ON(cgroup_add_cftypes(&blkio_subsys, pol->cftypes));
940 ret = 0;
941out_unlock:
942 mutex_unlock(&blkcg_pol_mutex);
943 return ret; 1624 return ret;
944} 1625}
945EXPORT_SYMBOL_GPL(blkcg_policy_register);
946 1626
947/** 1627static void blkiocg_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
948 * blkcg_policy_unregister - unregister a blkcg policy 1628{
949 * @pol: blkcg policy to unregister 1629 struct io_context *ioc;
950 * 1630
951 * Undo blkcg_policy_register(@pol). Might sleep. 1631 task_lock(tsk);
952 */ 1632 ioc = tsk->io_context;
953void blkcg_policy_unregister(struct blkcg_policy *pol) 1633 if (ioc)
1634 ioc->cgroup_changed = 1;
1635 task_unlock(tsk);
1636}
1637
1638void blkio_policy_register(struct blkio_policy_type *blkiop)
954{ 1639{
955 mutex_lock(&blkcg_pol_mutex); 1640 spin_lock(&blkio_list_lock);
1641 list_add_tail(&blkiop->list, &blkio_list);
1642 spin_unlock(&blkio_list_lock);
1643}
1644EXPORT_SYMBOL_GPL(blkio_policy_register);
956 1645
957 if (WARN_ON(blkcg_policy[pol->plid] != pol)) 1646void blkio_policy_unregister(struct blkio_policy_type *blkiop)
958 goto out_unlock; 1647{
1648 spin_lock(&blkio_list_lock);
1649 list_del_init(&blkiop->list);
1650 spin_unlock(&blkio_list_lock);
1651}
1652EXPORT_SYMBOL_GPL(blkio_policy_unregister);
959 1653
960 /* kill the intf files first */ 1654static int __init init_cgroup_blkio(void)
961 if (pol->cftypes) 1655{
962 cgroup_rm_cftypes(&blkio_subsys, pol->cftypes); 1656 return cgroup_load_subsys(&blkio_subsys);
1657}
963 1658
964 /* unregister and update blkgs */ 1659static void __exit exit_cgroup_blkio(void)
965 blkcg_policy[pol->plid] = NULL; 1660{
966out_unlock: 1661 cgroup_unload_subsys(&blkio_subsys);
967 mutex_unlock(&blkcg_pol_mutex);
968} 1662}
969EXPORT_SYMBOL_GPL(blkcg_policy_unregister); 1663
1664module_init(init_cgroup_blkio);
1665module_exit(exit_cgroup_blkio);
1666MODULE_LICENSE("GPL");
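The interface restored by this patch instead has each policy hand a struct blkio_policy_type with a blkio_policy_ops vtable to blkio_policy_register(). A hedged sketch of that registration shape follows; the callback bodies and the "example_" names are placeholders, not code from this patch, and the module_init()/module_exit() wiring would mirror init_cgroup_blkio()/exit_cgroup_blkio() above.

/* Hedged sketch of blkio_policy_register() usage (see blk-cgroup.h). */
static void example_unlink_group(void *key, struct blkio_group *blkg)
{
	/* the cgroup is going away: drop this policy's use of @blkg;
	 * @key is the opaque queue identifier the policy registered with */
}

static void example_update_weight(void *key, struct blkio_group *blkg,
				  unsigned int weight)
{
	/* react to a new proportional weight for @blkg */
}

static struct blkio_policy_type blkio_policy_example = {
	.ops = {
		.blkio_unlink_group_fn		= example_unlink_group,
		.blkio_update_group_weight_fn	= example_update_weight,
	},
	.plid = BLKIO_POLICY_PROP,
};

static int __init example_init(void)
{
	blkio_policy_register(&blkio_policy_example);
	return 0;
}

static void __exit example_exit(void)
{
	blkio_policy_unregister(&blkio_policy_example);
}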
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 24597309e23..a71d2904ffb 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -15,491 +15,350 @@
15 15
16#include <linux/cgroup.h> 16#include <linux/cgroup.h>
17#include <linux/u64_stats_sync.h> 17#include <linux/u64_stats_sync.h>
18#include <linux/seq_file.h> 18
19#include <linux/radix-tree.h> 19enum blkio_policy_id {
20#include <linux/blkdev.h> 20 BLKIO_POLICY_PROP = 0, /* Proportional Bandwidth division */
21 BLKIO_POLICY_THROTL, /* Throttling */
22};
21 23
22/* Max limits for throttle policy */ 24/* Max limits for throttle policy */
23#define THROTL_IOPS_MAX UINT_MAX 25#define THROTL_IOPS_MAX UINT_MAX
24 26
25/* CFQ specific, out here for blkcg->cfq_weight */ 27#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
26#define CFQ_WEIGHT_MIN 10 28
27#define CFQ_WEIGHT_MAX 1000 29#ifndef CONFIG_BLK_CGROUP
28#define CFQ_WEIGHT_DEFAULT 500 30/* When blk-cgroup is a module, its subsys_id isn't a compile-time constant */
29 31extern struct cgroup_subsys blkio_subsys;
30#ifdef CONFIG_BLK_CGROUP 32#define blkio_subsys_id blkio_subsys.subsys_id
31 33#endif
32enum blkg_rwstat_type { 34
33 BLKG_RWSTAT_READ, 35enum stat_type {
34 BLKG_RWSTAT_WRITE, 36 /* Total time spent (in ns) between request dispatch to the driver and
35 BLKG_RWSTAT_SYNC, 37 * request completion for IOs doen by this cgroup. This may not be
36 BLKG_RWSTAT_ASYNC, 38 * accurate when NCQ is turned on. */
37 39 BLKIO_STAT_SERVICE_TIME = 0,
38 BLKG_RWSTAT_NR, 40 /* Total time spent waiting in scheduler queue in ns */
39 BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR, 41 BLKIO_STAT_WAIT_TIME,
42 /* Number of IOs queued up */
43 BLKIO_STAT_QUEUED,
44 /* All the single valued stats go below this */
45 BLKIO_STAT_TIME,
46#ifdef CONFIG_DEBUG_BLK_CGROUP
47 /* Time not charged to this cgroup */
48 BLKIO_STAT_UNACCOUNTED_TIME,
49 BLKIO_STAT_AVG_QUEUE_SIZE,
50 BLKIO_STAT_IDLE_TIME,
51 BLKIO_STAT_EMPTY_TIME,
52 BLKIO_STAT_GROUP_WAIT_TIME,
53 BLKIO_STAT_DEQUEUE
54#endif
40}; 55};
41 56
42struct blkcg_gq; 57/* Per cpu stats */
43 58enum stat_type_cpu {
44struct blkcg { 59 BLKIO_STAT_CPU_SECTORS,
45 struct cgroup_subsys_state css; 60 /* Total bytes transferred */
46 spinlock_t lock; 61 BLKIO_STAT_CPU_SERVICE_BYTES,
47 62 /* Total IOs serviced, post merge */
48 struct radix_tree_root blkg_tree; 63 BLKIO_STAT_CPU_SERVICED,
49 struct blkcg_gq *blkg_hint; 64 /* Number of IOs merged */
50 struct hlist_head blkg_list; 65 BLKIO_STAT_CPU_MERGED,
51 66 BLKIO_STAT_CPU_NR
52 /* for policies to test whether associated blkcg has changed */
53 uint64_t id;
54
55 /* TODO: per-policy storage in blkcg */
56 unsigned int cfq_weight; /* belongs to cfq */
57}; 67};
58 68
59struct blkg_stat { 69enum stat_sub_type {
60 struct u64_stats_sync syncp; 70 BLKIO_STAT_READ = 0,
61 uint64_t cnt; 71 BLKIO_STAT_WRITE,
72 BLKIO_STAT_SYNC,
73 BLKIO_STAT_ASYNC,
74 BLKIO_STAT_TOTAL
62}; 75};
63 76
64struct blkg_rwstat { 77/* blkg state flags */
65 struct u64_stats_sync syncp; 78enum blkg_state_flags {
66 uint64_t cnt[BLKG_RWSTAT_NR]; 79 BLKG_waiting = 0,
80 BLKG_idling,
81 BLKG_empty,
67}; 82};
68 83
69/* 84/* cgroup files owned by proportional weight policy */
70 * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a 85enum blkcg_file_name_prop {
71 * request_queue (q). This is used by blkcg policies which need to track 86 BLKIO_PROP_weight = 1,
72 * information per blkcg - q pair. 87 BLKIO_PROP_weight_device,
73 * 88 BLKIO_PROP_io_service_bytes,
74 * There can be multiple active blkcg policies and each has its private 89 BLKIO_PROP_io_serviced,
75 * data on each blkg, the size of which is determined by 90 BLKIO_PROP_time,
76 * blkcg_policy->pd_size. blkcg core allocates and frees such areas 91 BLKIO_PROP_sectors,
77 * together with blkg and invokes pd_init/exit_fn() methods. 92 BLKIO_PROP_unaccounted_time,
78 * 93 BLKIO_PROP_io_service_time,
79 * Such private data must embed struct blkg_policy_data (pd) at the 94 BLKIO_PROP_io_wait_time,
80 * beginning and pd_size can't be smaller than pd. 95 BLKIO_PROP_io_merged,
81 */ 96 BLKIO_PROP_io_queued,
82struct blkg_policy_data { 97 BLKIO_PROP_avg_queue_size,
83 /* the blkg this per-policy data belongs to */ 98 BLKIO_PROP_group_wait_time,
84 struct blkcg_gq *blkg; 99 BLKIO_PROP_idle_time,
85 100 BLKIO_PROP_empty_time,
86 /* used during policy activation */ 101 BLKIO_PROP_dequeue,
87 struct list_head alloc_node;
88}; 102};
89 103
90/* association between a blk cgroup and a request queue */ 104/* cgroup files owned by throttle policy */
91struct blkcg_gq { 105enum blkcg_file_name_throtl {
92 /* Pointer to the associated request_queue */ 106 BLKIO_THROTL_read_bps_device,
93 struct request_queue *q; 107 BLKIO_THROTL_write_bps_device,
94 struct list_head q_node; 108 BLKIO_THROTL_read_iops_device,
95 struct hlist_node blkcg_node; 109 BLKIO_THROTL_write_iops_device,
96 struct blkcg *blkcg; 110 BLKIO_THROTL_io_service_bytes,
97 /* request allocation list for this blkcg-q pair */ 111 BLKIO_THROTL_io_serviced,
98 struct request_list rl;
99 /* reference count */
100 int refcnt;
101
102 struct blkg_policy_data *pd[BLKCG_MAX_POLS];
103
104 struct rcu_head rcu_head;
105}; 112};
106 113
107typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); 114struct blkio_cgroup {
108typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg); 115 struct cgroup_subsys_state css;
109typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); 116 unsigned int weight;
110 117 spinlock_t lock;
111struct blkcg_policy { 118 struct hlist_head blkg_list;
112 int plid; 119 struct list_head policy_list; /* list of blkio_policy_node */
113 /* policy specific private data size */
114 size_t pd_size;
115 /* cgroup files for the policy */
116 struct cftype *cftypes;
117
118 /* operations */
119 blkcg_pol_init_pd_fn *pd_init_fn;
120 blkcg_pol_exit_pd_fn *pd_exit_fn;
121 blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn;
122}; 120};
123 121
124extern struct blkcg blkcg_root; 122struct blkio_group_stats {
125 123 /* total disk time and nr sectors dispatched by this group */
126struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q); 124 uint64_t time;
127struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, 125 uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL];
128 struct request_queue *q); 126#ifdef CONFIG_DEBUG_BLK_CGROUP
129int blkcg_init_queue(struct request_queue *q); 127 /* Time not charged to this cgroup */
130void blkcg_drain_queue(struct request_queue *q); 128 uint64_t unaccounted_time;
131void blkcg_exit_queue(struct request_queue *q); 129
132 130 /* Sum of number of IOs queued across all samples */
133/* Blkio controller policy registration */ 131 uint64_t avg_queue_size_sum;
134int blkcg_policy_register(struct blkcg_policy *pol); 132 /* Count of samples taken for average */
135void blkcg_policy_unregister(struct blkcg_policy *pol); 133 uint64_t avg_queue_size_samples;
136int blkcg_activate_policy(struct request_queue *q, 134 /* How many times this group has been removed from service tree */
137 const struct blkcg_policy *pol); 135 unsigned long dequeue;
138void blkcg_deactivate_policy(struct request_queue *q, 136
139 const struct blkcg_policy *pol); 137 /* Total time spent waiting for it to be assigned a timeslice. */
140 138 uint64_t group_wait_time;
141void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, 139 uint64_t start_group_wait_time;
142 u64 (*prfill)(struct seq_file *, 140
143 struct blkg_policy_data *, int), 141 /* Time spent idling for this blkio_group */
144 const struct blkcg_policy *pol, int data, 142 uint64_t idle_time;
145 bool show_total); 143 uint64_t start_idle_time;
146u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v);
147u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
148 const struct blkg_rwstat *rwstat);
149u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off);
150u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
151 int off);
152
153struct blkg_conf_ctx {
154 struct gendisk *disk;
155 struct blkcg_gq *blkg;
156 u64 v;
157};
158
159int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
160 const char *input, struct blkg_conf_ctx *ctx);
161void blkg_conf_finish(struct blkg_conf_ctx *ctx);
162
163
164static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup)
165{
166 return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
167 struct blkcg, css);
168}
169
170static inline struct blkcg *task_blkcg(struct task_struct *tsk)
171{
172 return container_of(task_subsys_state(tsk, blkio_subsys_id),
173 struct blkcg, css);
174}
175
176static inline struct blkcg *bio_blkcg(struct bio *bio)
177{
178 if (bio && bio->bi_css)
179 return container_of(bio->bi_css, struct blkcg, css);
180 return task_blkcg(current);
181}
182
183/**
184 * blkg_to_pdata - get policy private data
185 * @blkg: blkg of interest
186 * @pol: policy of interest
187 *
188 * Return pointer to private data associated with the @blkg-@pol pair.
189 */
190static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
191 struct blkcg_policy *pol)
192{
193 return blkg ? blkg->pd[pol->plid] : NULL;
194}
195
196/**
197 * pdata_to_blkg - get blkg associated with policy private data
198 * @pd: policy private data of interest
199 *
200 * @pd is policy private data. Determine the blkg it's associated with.
201 */
202static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
203{
204 return pd ? pd->blkg : NULL;
205}
206
207/**
208 * blkg_path - format cgroup path of blkg
209 * @blkg: blkg of interest
210 * @buf: target buffer
211 * @buflen: target buffer length
212 *
213 * Format the path of the cgroup of @blkg into @buf.
214 */
215static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
216{
217 int ret;
218
219 rcu_read_lock();
220 ret = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
221 rcu_read_unlock();
222 if (ret)
223 strncpy(buf, "<unavailable>", buflen);
224 return ret;
225}
226
227/**
228 * blkg_get - get a blkg reference
229 * @blkg: blkg to get
230 *
231 * The caller should be holding queue_lock and an existing reference.
232 */
233static inline void blkg_get(struct blkcg_gq *blkg)
234{
235 lockdep_assert_held(blkg->q->queue_lock);
236 WARN_ON_ONCE(!blkg->refcnt);
237 blkg->refcnt++;
238}
239
240void __blkg_release(struct blkcg_gq *blkg);
241
242/**
243 * blkg_put - put a blkg reference
244 * @blkg: blkg to put
245 *
246 * The caller should be holding queue_lock.
247 */
248static inline void blkg_put(struct blkcg_gq *blkg)
249{
250 lockdep_assert_held(blkg->q->queue_lock);
251 WARN_ON_ONCE(blkg->refcnt <= 0);
252 if (!--blkg->refcnt)
253 __blkg_release(blkg);
254}
255
256/**
257 * blk_get_rl - get request_list to use
258 * @q: request_queue of interest
259 * @bio: bio which will be attached to the allocated request (may be %NULL)
260 *
261 * The caller wants to allocate a request from @q to use for @bio. Find
262 * the request_list to use and obtain a reference on it. Should be called
263 * under queue_lock. This function is guaranteed to return non-%NULL
264 * request_list.
265 */
266static inline struct request_list *blk_get_rl(struct request_queue *q,
267 struct bio *bio)
268{
269 struct blkcg *blkcg;
270 struct blkcg_gq *blkg;
271
272 rcu_read_lock();
273
274 blkcg = bio_blkcg(bio);
275
276 /* bypass blkg lookup and use @q->root_rl directly for root */
277 if (blkcg == &blkcg_root)
278 goto root_rl;
279
280 /* 144 /*
281 * Try to use blkg->rl. blkg lookup may fail under memory pressure 145 * Total time when we have requests queued and do not contain the
282 * or if either the blkcg or queue is going away. Fall back to 146 * current active queue.
283 * root_rl in such cases.
284 */ 147 */
285 blkg = blkg_lookup_create(blkcg, q); 148 uint64_t empty_time;
286 if (unlikely(IS_ERR(blkg))) 149 uint64_t start_empty_time;
287 goto root_rl; 150 uint16_t flags;
288 151#endif
289 blkg_get(blkg); 152};
290 rcu_read_unlock();
291 return &blkg->rl;
292root_rl:
293 rcu_read_unlock();
294 return &q->root_rl;
295}
296
297/**
298 * blk_put_rl - put request_list
299 * @rl: request_list to put
300 *
301 * Put the reference acquired by blk_get_rl(). Should be called under
302 * queue_lock.
303 */
304static inline void blk_put_rl(struct request_list *rl)
305{
306 /* root_rl may not have blkg set */
307 if (rl->blkg && rl->blkg->blkcg != &blkcg_root)
308 blkg_put(rl->blkg);
309}
310
311/**
312 * blk_rq_set_rl - associate a request with a request_list
313 * @rq: request of interest
314 * @rl: target request_list
315 *
316 * Associate @rq with @rl so that accounting and freeing can know the
317 * request_list @rq came from.
318 */
319static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl)
320{
321 rq->rl = rl;
322}
323
324/**
325 * blk_rq_rl - return the request_list a request came from
326 * @rq: request of interest
327 *
328 * Return the request_list @rq is allocated from.
329 */
330static inline struct request_list *blk_rq_rl(struct request *rq)
331{
332 return rq->rl;
333}
334
335struct request_list *__blk_queue_next_rl(struct request_list *rl,
336 struct request_queue *q);
337/**
338 * blk_queue_for_each_rl - iterate through all request_lists of a request_queue
339 *
340 * Should be used under queue_lock.
341 */
342#define blk_queue_for_each_rl(rl, q) \
343 for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q)))
344
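blk_queue_for_each_rl() visits q->root_rl and then every per-blkg request_list via __blk_queue_next_rl(). As a hedged sketch of the pattern (the same loop appears verbatim in __blk_drain_queue() further down in this patch), a caller holding q->queue_lock can wake every sleeper like this; the function name is hypothetical.

/* Hedged sketch: wake all tasks waiting for requests on any request_list
 * of @q. Caller holds q->queue_lock, as the comment above requires. */
static void example_wake_all_rl_waiters(struct request_queue *q)
{
	struct request_list *rl;
	int i;

	blk_queue_for_each_rl(rl, q)
		for (i = 0; i < ARRAY_SIZE(rl->wait); i++)
			wake_up_all(&rl->wait[i]);
}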
345/**
346 * blkg_stat_add - add a value to a blkg_stat
347 * @stat: target blkg_stat
348 * @val: value to add
349 *
350 * Add @val to @stat. The caller is responsible for synchronizing calls to
351 * this function.
352 */
353static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val)
354{
355 u64_stats_update_begin(&stat->syncp);
356 stat->cnt += val;
357 u64_stats_update_end(&stat->syncp);
358}
359
360/**
361 * blkg_stat_read - read the current value of a blkg_stat
362 * @stat: blkg_stat to read
363 *
364 * Read the current value of @stat. This function can be called without
365 * synchroniztion and takes care of u64 atomicity.
366 */
367static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
368{
369 unsigned int start;
370 uint64_t v;
371
372 do {
373 start = u64_stats_fetch_begin(&stat->syncp);
374 v = stat->cnt;
375 } while (u64_stats_fetch_retry(&stat->syncp, start));
376
377 return v;
378}
379
380/**
381 * blkg_stat_reset - reset a blkg_stat
382 * @stat: blkg_stat to reset
383 */
384static inline void blkg_stat_reset(struct blkg_stat *stat)
385{
386 stat->cnt = 0;
387}
388 153
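blkg_stat is a plain 64-bit counter wrapped in u64_stats_sync: writers must be serialized by the caller, readers may run locklessly. A hedged sketch of typical use, with a hypothetical per-policy structure:

/* Hedged sketch; "example_pd" and its members are hypothetical. */
struct example_pd {
	struct blkg_policy_data pd;
	struct blkg_stat	dispatched;	/* requests sent to the driver */
};

/* writer side: the policy supplies its own serialization */
static void example_account_dispatch(struct example_pd *epd)
{
	blkg_stat_add(&epd->dispatched, 1);
}

/* reader side: no lock needed, u64 atomicity is handled internally */
static u64 example_read_dispatched(struct example_pd *epd)
{
	return blkg_stat_read(&epd->dispatched);
}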
389/** 154/* Per cpu blkio group stats */
390 * blkg_rwstat_add - add a value to a blkg_rwstat 155struct blkio_group_stats_cpu {
391 * @rwstat: target blkg_rwstat 156 uint64_t sectors;
392 * @rw: mask of REQ_{WRITE|SYNC} 157 uint64_t stat_arr_cpu[BLKIO_STAT_CPU_NR][BLKIO_STAT_TOTAL];
393 * @val: value to add 158 struct u64_stats_sync syncp;
394 * 159};
395 * Add @val to @rwstat. The counters are chosen according to @rw. The
396 * caller is responsible for synchronizing calls to this function.
397 */
398static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
399 int rw, uint64_t val)
400{
401 u64_stats_update_begin(&rwstat->syncp);
402
403 if (rw & REQ_WRITE)
404 rwstat->cnt[BLKG_RWSTAT_WRITE] += val;
405 else
406 rwstat->cnt[BLKG_RWSTAT_READ] += val;
407 if (rw & REQ_SYNC)
408 rwstat->cnt[BLKG_RWSTAT_SYNC] += val;
409 else
410 rwstat->cnt[BLKG_RWSTAT_ASYNC] += val;
411
412 u64_stats_update_end(&rwstat->syncp);
413}
414 160
415/** 161struct blkio_group {
416 * blkg_rwstat_read - read the current values of a blkg_rwstat 162 /* An rcu protected unique identifier for the group */
417 * @rwstat: blkg_rwstat to read 163 void *key;
418 * 164 struct hlist_node blkcg_node;
419 * Read the current snapshot of @rwstat and return it as the return value. 165 unsigned short blkcg_id;
420 * This function can be called without synchronization and takes care of 166 /* Store cgroup path */
421 * u64 atomicity. 167 char path[128];
422 */ 168 /* The device MKDEV(major, minor), this group has been created for */
423static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat) 169 dev_t dev;
424{ 170 /* policy which owns this blk group */
425 unsigned int start; 171 enum blkio_policy_id plid;
426 struct blkg_rwstat tmp; 172
173 /* Need to serialize the stats in the case of reset/update */
174 spinlock_t stats_lock;
175 struct blkio_group_stats stats;
176 /* Per cpu stats pointer */
177 struct blkio_group_stats_cpu __percpu *stats_cpu;
178};
427 179
428 do { 180struct blkio_policy_node {
429 start = u64_stats_fetch_begin(&rwstat->syncp); 181 struct list_head node;
430 tmp = *rwstat; 182 dev_t dev;
431 } while (u64_stats_fetch_retry(&rwstat->syncp, start)); 183 /* This node belongs to max bw policy or proportional weight policy */
184 enum blkio_policy_id plid;
185 /* cgroup file to which this rule belongs */
186 int fileid;
187
188 union {
189 unsigned int weight;
190 /*
191 * Rate read/write in terms of bytes per second
192 * Whether this rate represents read or write is determined
193 * by file type "fileid".
194 */
195 u64 bps;
196 unsigned int iops;
197 } val;
198};
432 199
433 return tmp; 200extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
434} 201 dev_t dev);
202extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg,
203 dev_t dev);
204extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg,
205 dev_t dev);
206extern unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg,
207 dev_t dev);
208extern unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg,
209 dev_t dev);
210
211typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg);
212
213typedef void (blkio_update_group_weight_fn) (void *key,
214 struct blkio_group *blkg, unsigned int weight);
215typedef void (blkio_update_group_read_bps_fn) (void * key,
216 struct blkio_group *blkg, u64 read_bps);
217typedef void (blkio_update_group_write_bps_fn) (void *key,
218 struct blkio_group *blkg, u64 write_bps);
219typedef void (blkio_update_group_read_iops_fn) (void *key,
220 struct blkio_group *blkg, unsigned int read_iops);
221typedef void (blkio_update_group_write_iops_fn) (void *key,
222 struct blkio_group *blkg, unsigned int write_iops);
223
224struct blkio_policy_ops {
225 blkio_unlink_group_fn *blkio_unlink_group_fn;
226 blkio_update_group_weight_fn *blkio_update_group_weight_fn;
227 blkio_update_group_read_bps_fn *blkio_update_group_read_bps_fn;
228 blkio_update_group_write_bps_fn *blkio_update_group_write_bps_fn;
229 blkio_update_group_read_iops_fn *blkio_update_group_read_iops_fn;
230 blkio_update_group_write_iops_fn *blkio_update_group_write_iops_fn;
231};
435 232
436/** 233struct blkio_policy_type {
437 * blkg_rwstat_sum - read the total count of a blkg_rwstat 234 struct list_head list;
438 * @rwstat: blkg_rwstat to read 235 struct blkio_policy_ops ops;
439 * 236 enum blkio_policy_id plid;
440 * Return the total count of @rwstat regardless of the IO direction. This 237};
441 * function can be called without synchronization and takes care of u64
442 * atomicity.
443 */
444static inline uint64_t blkg_rwstat_sum(struct blkg_rwstat *rwstat)
445{
446 struct blkg_rwstat tmp = blkg_rwstat_read(rwstat);
447 238
448 return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; 239/* Blkio controller policy registration */
449} 240extern void blkio_policy_register(struct blkio_policy_type *);
241extern void blkio_policy_unregister(struct blkio_policy_type *);
450 242
451/** 243static inline char *blkg_path(struct blkio_group *blkg)
452 * blkg_rwstat_reset - reset a blkg_rwstat
453 * @rwstat: blkg_rwstat to reset
454 */
455static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
456{ 244{
457 memset(rwstat->cnt, 0, sizeof(rwstat->cnt)); 245 return blkg->path;
458} 246}
459 247
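blkg_rwstat keeps four counters and picks the READ/WRITE and SYNC/ASYNC buckets from the REQ_* bits passed in, while blkg_rwstat_sum() folds the read and write buckets back into a single total. A hedged usage sketch with a hypothetical stats container:

/* Hedged sketch; "example_stats" is hypothetical. */
struct example_stats {
	struct blkg_rwstat serviced;	/* completed IOs, split by rw/sync */
};

/* @rw_flags carries REQ_WRITE/REQ_SYNC, e.g. a bio's or request's flags */
static void example_account_completion(struct example_stats *st, int rw_flags)
{
	blkg_rwstat_add(&st->serviced, rw_flags, 1);
}

static u64 example_total_serviced(struct example_stats *st)
{
	/* read + write, ignoring the sync/async split */
	return blkg_rwstat_sum(&st->serviced);
}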
460#else /* CONFIG_BLK_CGROUP */ 248#else
461
462struct cgroup;
463struct blkcg;
464 249
465struct blkg_policy_data { 250struct blkio_group {
466}; 251};
467 252
468struct blkcg_gq { 253struct blkio_policy_type {
469}; 254};
470 255
471struct blkcg_policy { 256static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { }
472}; 257static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { }
473 258
474static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } 259static inline char *blkg_path(struct blkio_group *blkg) { return NULL; }
475static inline int blkcg_init_queue(struct request_queue *q) { return 0; } 260
476static inline void blkcg_drain_queue(struct request_queue *q) { } 261#endif
477static inline void blkcg_exit_queue(struct request_queue *q) { } 262
478static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } 263#define BLKIO_WEIGHT_MIN 10
479static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } 264#define BLKIO_WEIGHT_MAX 1000
480static inline int blkcg_activate_policy(struct request_queue *q, 265#define BLKIO_WEIGHT_DEFAULT 500
481 const struct blkcg_policy *pol) { return 0; } 266
482static inline void blkcg_deactivate_policy(struct request_queue *q, 267#ifdef CONFIG_DEBUG_BLK_CGROUP
483 const struct blkcg_policy *pol) { } 268void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg);
484 269void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
485static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; } 270 unsigned long dequeue);
486static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } 271void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg);
487 272void blkiocg_update_idle_time_stats(struct blkio_group *blkg);
488static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, 273void blkiocg_set_start_empty_time(struct blkio_group *blkg);
489 struct blkcg_policy *pol) { return NULL; } 274
490static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } 275#define BLKG_FLAG_FNS(name) \
491static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } 276static inline void blkio_mark_blkg_##name( \
492static inline void blkg_get(struct blkcg_gq *blkg) { } 277 struct blkio_group_stats *stats) \
493static inline void blkg_put(struct blkcg_gq *blkg) { } 278{ \
494 279 stats->flags |= (1 << BLKG_##name); \
495static inline struct request_list *blk_get_rl(struct request_queue *q, 280} \
496 struct bio *bio) { return &q->root_rl; } 281static inline void blkio_clear_blkg_##name( \
497static inline void blk_put_rl(struct request_list *rl) { } 282 struct blkio_group_stats *stats) \
498static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { } 283{ \
499static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; } 284 stats->flags &= ~(1 << BLKG_##name); \
500 285} \
501#define blk_queue_for_each_rl(rl, q) \ 286static inline int blkio_blkg_##name(struct blkio_group_stats *stats) \
502 for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) 287{ \
503 288 return (stats->flags & (1 << BLKG_##name)) != 0; \
504#endif /* CONFIG_BLK_CGROUP */ 289} \
505#endif /* _BLK_CGROUP_H */ 290
291BLKG_FLAG_FNS(waiting)
292BLKG_FLAG_FNS(idling)
293BLKG_FLAG_FNS(empty)
294#undef BLKG_FLAG_FNS
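The BLKG_FLAG_FNS() invocations above expand into blkio_mark_blkg_*(), blkio_clear_blkg_*() and blkio_blkg_*() helpers over the flags word in struct blkio_group_stats (all under CONFIG_DEBUG_BLK_CGROUP). A hedged sketch of the waiting-state helpers in use; the timestamp handling is simplified relative to the real accounting:

/* Hedged sketch using the generated helpers; simplified bookkeeping. */
static void example_mark_group_waiting(struct blkio_group_stats *stats,
				       u64 now)
{
	if (blkio_blkg_waiting(stats))		/* already marked */
		return;
	stats->start_group_wait_time = now;
	blkio_mark_blkg_waiting(stats);
}

static void example_end_group_wait(struct blkio_group_stats *stats, u64 now)
{
	if (!blkio_blkg_waiting(stats))
		return;
	stats->group_wait_time += now - stats->start_group_wait_time;
	blkio_clear_blkg_waiting(stats);
}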
295#else
296static inline void blkiocg_update_avg_queue_size_stats(
297 struct blkio_group *blkg) {}
298static inline void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
299 unsigned long dequeue) {}
300static inline void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
301{}
302static inline void blkiocg_update_idle_time_stats(struct blkio_group *blkg) {}
303static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {}
304#endif
305
306#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
307extern struct blkio_cgroup blkio_root_cgroup;
308extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
309extern struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk);
310extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
311 struct blkio_group *blkg, void *key, dev_t dev,
312 enum blkio_policy_id plid);
313extern int blkio_alloc_blkg_stats(struct blkio_group *blkg);
314extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
315extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
316 void *key);
317void blkiocg_update_timeslice_used(struct blkio_group *blkg,
318 unsigned long time,
319 unsigned long unaccounted_time);
320void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes,
321 bool direction, bool sync);
322void blkiocg_update_completion_stats(struct blkio_group *blkg,
323 uint64_t start_time, uint64_t io_start_time, bool direction, bool sync);
324void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
325 bool sync);
326void blkiocg_update_io_add_stats(struct blkio_group *blkg,
327 struct blkio_group *curr_blkg, bool direction, bool sync);
328void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
329 bool direction, bool sync);
330#else
331struct cgroup;
332static inline struct blkio_cgroup *
333cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
334static inline struct blkio_cgroup *
335task_blkio_cgroup(struct task_struct *tsk) { return NULL; }
336
337static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
338 struct blkio_group *blkg, void *key, dev_t dev,
339 enum blkio_policy_id plid) {}
340
341static inline int blkio_alloc_blkg_stats(struct blkio_group *blkg) { return 0; }
342
343static inline int
344blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
345
346static inline struct blkio_group *
347blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; }
348static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg,
349 unsigned long time,
350 unsigned long unaccounted_time)
351{}
352static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
353 uint64_t bytes, bool direction, bool sync) {}
354static inline void blkiocg_update_completion_stats(struct blkio_group *blkg,
355 uint64_t start_time, uint64_t io_start_time, bool direction,
356 bool sync) {}
357static inline void blkiocg_update_io_merged_stats(struct blkio_group *blkg,
358 bool direction, bool sync) {}
359static inline void blkiocg_update_io_add_stats(struct blkio_group *blkg,
360 struct blkio_group *curr_blkg, bool direction, bool sync) {}
361static inline void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
362 bool direction, bool sync) {}
363#endif
364#endif /* _BLK_CGROUP_H */
diff --git a/block/blk-core.c b/block/blk-core.c
index c973249d68c..8fc4ae28a19 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -28,21 +28,17 @@
28#include <linux/task_io_accounting_ops.h> 28#include <linux/task_io_accounting_ops.h>
29#include <linux/fault-inject.h> 29#include <linux/fault-inject.h>
30#include <linux/list_sort.h> 30#include <linux/list_sort.h>
31#include <linux/delay.h>
32#include <linux/ratelimit.h>
33 31
34#define CREATE_TRACE_POINTS 32#define CREATE_TRACE_POINTS
35#include <trace/events/block.h> 33#include <trace/events/block.h>
36 34
37#include "blk.h" 35#include "blk.h"
38#include "blk-cgroup.h"
39 36
40EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); 37EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
41EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); 38EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
42EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); 39EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
43EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
44 40
45DEFINE_IDA(blk_queue_ida); 41static int __make_request(struct request_queue *q, struct bio *bio);
46 42
47/* 43/*
48 * For the allocated request tables 44 * For the allocated request tables
@@ -220,13 +216,12 @@ static void blk_delay_work(struct work_struct *work)
220 * Description: 216 * Description:
221 * Sometimes queueing needs to be postponed for a little while, to allow 217 * Sometimes queueing needs to be postponed for a little while, to allow
222 * resources to come back. This function will make sure that queueing is 218 * resources to come back. This function will make sure that queueing is
223 * restarted around the specified time. Queue lock must be held. 219 * restarted around the specified time.
224 */ 220 */
225void blk_delay_queue(struct request_queue *q, unsigned long msecs) 221void blk_delay_queue(struct request_queue *q, unsigned long msecs)
226{ 222{
227 if (likely(!blk_queue_dead(q))) 223 queue_delayed_work(kblockd_workqueue, &q->delay_work,
228 queue_delayed_work(kblockd_workqueue, &q->delay_work, 224 msecs_to_jiffies(msecs));
229 msecs_to_jiffies(msecs));
230} 225}
231EXPORT_SYMBOL(blk_delay_queue); 226EXPORT_SYMBOL(blk_delay_queue);
232 227
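A driver's request_fn can use blk_delay_queue() exactly as the comment describes: when a transient resource shortage is hit, leave the work on the queue and ask for a re-run a few milliseconds later. A hedged sketch follows; example_dev_busy() and the 3 ms delay are illustrative assumptions, not taken from this patch.

/* Hedged sketch of the back-off pattern described above. */
static bool example_dev_busy(void *driver_data)
{
	return false;	/* placeholder for a real resource check */
}

static void example_request_fn(struct request_queue *q)
{
	struct request *rq;

	while ((rq = blk_peek_request(q)) != NULL) {
		if (example_dev_busy(q->queuedata)) {
			/* leave @rq queued and retry in ~3ms */
			blk_delay_queue(q, 3);
			return;
		}
		blk_start_request(rq);
		/* ... hand @rq to the hardware; completed immediately here */
		__blk_end_request_all(rq, 0);
	}
}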
@@ -264,7 +259,7 @@ EXPORT_SYMBOL(blk_start_queue);
264 **/ 259 **/
265void blk_stop_queue(struct request_queue *q) 260void blk_stop_queue(struct request_queue *q)
266{ 261{
267 cancel_delayed_work(&q->delay_work); 262 __cancel_delayed_work(&q->delay_work);
268 queue_flag_set(QUEUE_FLAG_STOPPED, q); 263 queue_flag_set(QUEUE_FLAG_STOPPED, q);
269} 264}
270EXPORT_SYMBOL(blk_stop_queue); 265EXPORT_SYMBOL(blk_stop_queue);
@@ -284,7 +279,7 @@ EXPORT_SYMBOL(blk_stop_queue);
284 * 279 *
285 * This function does not cancel any asynchronous activity arising 280 * This function does not cancel any asynchronous activity arising
286 * out of elevator or throttling code. That would require elevaotor_exit() 281 * out of elevator or throttling code. That would require elevaotor_exit()
287 * and blkcg_exit_queue() to be called with queue lock initialized. 282 * and blk_throtl_exit() to be called with queue lock initialized.
288 * 283 *
289 */ 284 */
290void blk_sync_queue(struct request_queue *q) 285void blk_sync_queue(struct request_queue *q)
@@ -295,34 +290,6 @@ void blk_sync_queue(struct request_queue *q)
295EXPORT_SYMBOL(blk_sync_queue); 290EXPORT_SYMBOL(blk_sync_queue);
296 291
297/** 292/**
298 * __blk_run_queue_uncond - run a queue whether or not it has been stopped
299 * @q: The queue to run
300 *
301 * Description:
302 * Invoke request handling on a queue if there are any pending requests.
303 * May be used to restart request handling after a request has completed.
304 * This variant runs the queue whether or not the queue has been
305 * stopped. Must be called with the queue lock held and interrupts
306 * disabled. See also @blk_run_queue.
307 */
308inline void __blk_run_queue_uncond(struct request_queue *q)
309{
310 if (unlikely(blk_queue_dead(q)))
311 return;
312
313 /*
314 * Some request_fn implementations, e.g. scsi_request_fn(), unlock
315 * the queue lock internally. As a result multiple threads may be
316 * running such a request function concurrently. Keep track of the
317 * number of active request_fn invocations such that blk_drain_queue()
318 * can wait until all these request_fn calls have finished.
319 */
320 q->request_fn_active++;
321 q->request_fn(q);
322 q->request_fn_active--;
323}
324
325/**
326 * __blk_run_queue - run a single device queue 293 * __blk_run_queue - run a single device queue
327 * @q: The queue to run 294 * @q: The queue to run
328 * 295 *
@@ -335,7 +302,7 @@ void __blk_run_queue(struct request_queue *q)
335 if (unlikely(blk_queue_stopped(q))) 302 if (unlikely(blk_queue_stopped(q)))
336 return; 303 return;
337 304
338 __blk_run_queue_uncond(q); 305 q->request_fn(q);
339} 306}
340EXPORT_SYMBOL(__blk_run_queue); 307EXPORT_SYMBOL(__blk_run_queue);
341 308
@@ -345,12 +312,14 @@ EXPORT_SYMBOL(__blk_run_queue);
345 * 312 *
346 * Description: 313 * Description:
347 * Tells kblockd to perform the equivalent of @blk_run_queue on behalf 314 * Tells kblockd to perform the equivalent of @blk_run_queue on behalf
348 * of us. The caller must hold the queue lock. 315 * of us.
349 */ 316 */
350void blk_run_queue_async(struct request_queue *q) 317void blk_run_queue_async(struct request_queue *q)
351{ 318{
352 if (likely(!blk_queue_stopped(q) && !blk_queue_dead(q))) 319 if (likely(!blk_queue_stopped(q))) {
353 mod_delayed_work(kblockd_workqueue, &q->delay_work, 0); 320 __cancel_delayed_work(&q->delay_work);
321 queue_delayed_work(kblockd_workqueue, &q->delay_work, 0);
322 }
354} 323}
355EXPORT_SYMBOL(blk_run_queue_async); 324EXPORT_SYMBOL(blk_run_queue_async);
356 325
@@ -378,219 +347,59 @@ void blk_put_queue(struct request_queue *q)
378} 347}
379EXPORT_SYMBOL(blk_put_queue); 348EXPORT_SYMBOL(blk_put_queue);
380 349
381/** 350/*
382 * __blk_drain_queue - drain requests from request_queue 351 * Note: If a driver supplied the queue lock, it is disconnected
383 * @q: queue to drain 352 * by this function. The actual state of the lock doesn't matter
384 * @drain_all: whether to drain all requests or only the ones w/ ELVPRIV 353 * here as the request_queue isn't accessible after this point
385 * 354 * (QUEUE_FLAG_DEAD is set) and no other requests will be queued.
386 * Drain requests from @q. If @drain_all is set, all requests are drained.
387 * If not, only ELVPRIV requests are drained. The caller is responsible
388 * for ensuring that no new requests which need to be drained are queued.
389 */
390static void __blk_drain_queue(struct request_queue *q, bool drain_all)
391 __releases(q->queue_lock)
392 __acquires(q->queue_lock)
393{
394 int i;
395
396 lockdep_assert_held(q->queue_lock);
397
398 while (true) {
399 bool drain = false;
400
401 /*
402 * The caller might be trying to drain @q before its
403 * elevator is initialized.
404 */
405 if (q->elevator)
406 elv_drain_elevator(q);
407
408 blkcg_drain_queue(q);
409
410 /*
411 * This function might be called on a queue which failed
 412 * driver init after queue creation or is not fully
413 * active yet. Some drivers (e.g. fd and loop) get unhappy
414 * in such cases. Kick queue iff dispatch queue has
415 * something on it and @q has request_fn set.
416 */
417 if (!list_empty(&q->queue_head) && q->request_fn)
418 __blk_run_queue(q);
419
420 drain |= q->nr_rqs_elvpriv;
421 drain |= q->request_fn_active;
422
423 /*
424 * Unfortunately, requests are queued at and tracked from
425 * multiple places and there's no single counter which can
426 * be drained. Check all the queues and counters.
427 */
428 if (drain_all) {
429 drain |= !list_empty(&q->queue_head);
430 for (i = 0; i < 2; i++) {
431 drain |= q->nr_rqs[i];
432 drain |= q->in_flight[i];
433 drain |= !list_empty(&q->flush_queue[i]);
434 }
435 }
436
437 if (!drain)
438 break;
439
440 spin_unlock_irq(q->queue_lock);
441
442 msleep(10);
443
444 spin_lock_irq(q->queue_lock);
445 }
446
447 /*
448 * With queue marked dead, any woken up waiter will fail the
449 * allocation path, so the wakeup chaining is lost and we're
450 * left with hung waiters. We need to wake up those waiters.
451 */
452 if (q->request_fn) {
453 struct request_list *rl;
454
455 blk_queue_for_each_rl(rl, q)
456 for (i = 0; i < ARRAY_SIZE(rl->wait); i++)
457 wake_up_all(&rl->wait[i]);
458 }
459}
460
461/**
462 * blk_queue_bypass_start - enter queue bypass mode
463 * @q: queue of interest
464 *
465 * In bypass mode, only the dispatch FIFO queue of @q is used. This
466 * function makes @q enter bypass mode and drains all requests which were
467 * throttled or issued before. On return, it's guaranteed that no request
 468 * is being throttled or has ELVPRIV set, and blk_queue_bypass() returns %true
 469 * inside the queue lock or RCU read lock.
470 */
471void blk_queue_bypass_start(struct request_queue *q)
472{
473 bool drain;
474
475 spin_lock_irq(q->queue_lock);
476 drain = !q->bypass_depth++;
477 queue_flag_set(QUEUE_FLAG_BYPASS, q);
478 spin_unlock_irq(q->queue_lock);
479
480 if (drain) {
481 spin_lock_irq(q->queue_lock);
482 __blk_drain_queue(q, false);
483 spin_unlock_irq(q->queue_lock);
484
485 /* ensure blk_queue_bypass() is %true inside RCU read lock */
486 synchronize_rcu();
487 }
488}
489EXPORT_SYMBOL_GPL(blk_queue_bypass_start);
490
491/**
492 * blk_queue_bypass_end - leave queue bypass mode
493 * @q: queue of interest
494 *
495 * Leave bypass mode and restore the normal queueing behavior.
496 */
497void blk_queue_bypass_end(struct request_queue *q)
498{
499 spin_lock_irq(q->queue_lock);
500 if (!--q->bypass_depth)
501 queue_flag_clear(QUEUE_FLAG_BYPASS, q);
502 WARN_ON_ONCE(q->bypass_depth < 0);
503 spin_unlock_irq(q->queue_lock);
504}
505EXPORT_SYMBOL_GPL(blk_queue_bypass_end);
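A rough sketch of the intended pairing (an assumption, not code from this patch): callers such as elevator-switch or cgroup-policy code bracket their update with bypass start/end so no ELVPRIV or throttled request is in flight while they work.

static void my_reconfigure_queue(struct request_queue *q)
{
        blk_queue_bypass_start(q);      /* drain; only the dispatch FIFO is used */
        /* ... safe to tear down or replace per-elevator/per-blkcg state here ... */
        blk_queue_bypass_end(q);        /* restore normal queueing */
}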
506
507/**
508 * blk_cleanup_queue - shutdown a request queue
509 * @q: request queue to shutdown
510 *
511 * Mark @q DYING, drain all pending requests, mark @q DEAD, destroy and
512 * put it. All future requests will be failed immediately with -ENODEV.
513 */ 355 */
514void blk_cleanup_queue(struct request_queue *q) 356void blk_cleanup_queue(struct request_queue *q)
515{ 357{
516 spinlock_t *lock = q->queue_lock;
517
518 /* mark @q DYING, no new request or merges will be allowed afterwards */
519 mutex_lock(&q->sysfs_lock);
520 queue_flag_set_unlocked(QUEUE_FLAG_DYING, q);
521 spin_lock_irq(lock);
522
523 /*
524 * A dying queue is permanently in bypass mode till released. Note
525 * that, unlike blk_queue_bypass_start(), we aren't performing
526 * synchronize_rcu() after entering bypass mode to avoid the delay
527 * as some drivers create and destroy a lot of queues while
528 * probing. This is still safe because blk_release_queue() will be
529 * called only after the queue refcnt drops to zero and nothing,
530 * RCU or not, would be traversing the queue by then.
531 */
532 q->bypass_depth++;
533 queue_flag_set(QUEUE_FLAG_BYPASS, q);
534
535 queue_flag_set(QUEUE_FLAG_NOMERGES, q);
536 queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
537 queue_flag_set(QUEUE_FLAG_DYING, q);
538 spin_unlock_irq(lock);
539 mutex_unlock(&q->sysfs_lock);
540
541 /* 358 /*
542 * Drain all requests queued before DYING marking. Set DEAD flag to 359 * We know we have process context here, so we can be a little
543 * prevent that q->request_fn() gets invoked after draining finished. 360 * cautious and ensure that pending block actions on this device
361 * are done before moving on. Going into this function, we should
362 * not have processes doing IO to this device.
544 */ 363 */
545 spin_lock_irq(lock); 364 blk_sync_queue(q);
546 __blk_drain_queue(q, true);
547 queue_flag_set(QUEUE_FLAG_DEAD, q);
548 spin_unlock_irq(lock);
549 365
550 /* @q won't process any more request, flush async actions */
551 del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); 366 del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
552 blk_sync_queue(q); 367 mutex_lock(&q->sysfs_lock);
368 queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
369 mutex_unlock(&q->sysfs_lock);
553 370
554 spin_lock_irq(lock);
555 if (q->queue_lock != &q->__queue_lock) 371 if (q->queue_lock != &q->__queue_lock)
556 q->queue_lock = &q->__queue_lock; 372 q->queue_lock = &q->__queue_lock;
557 spin_unlock_irq(lock);
558 373
559 /* @q is and will stay empty, shutdown and put */
560 blk_put_queue(q); 374 blk_put_queue(q);
561} 375}
562EXPORT_SYMBOL(blk_cleanup_queue); 376EXPORT_SYMBOL(blk_cleanup_queue);
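As a hedged illustration of where this is called from (my_dev and its fields are hypothetical): a driver's removal path usually unregisters the disk first so no new bios arrive, then tears the queue down.

static void my_driver_remove(struct my_dev *dev)
{
        del_gendisk(dev->disk);         /* stop new I/O from userspace */
        blk_cleanup_queue(dev->queue);  /* drain and drop our queue reference */
        put_disk(dev->disk);
}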
563 377
564int blk_init_rl(struct request_list *rl, struct request_queue *q, 378static int blk_init_free_list(struct request_queue *q)
565 gfp_t gfp_mask)
566{ 379{
380 struct request_list *rl = &q->rq;
381
567 if (unlikely(rl->rq_pool)) 382 if (unlikely(rl->rq_pool))
568 return 0; 383 return 0;
569 384
570 rl->q = q;
571 rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0; 385 rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0;
572 rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0; 386 rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0;
387 rl->elvpriv = 0;
573 init_waitqueue_head(&rl->wait[BLK_RW_SYNC]); 388 init_waitqueue_head(&rl->wait[BLK_RW_SYNC]);
574 init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]); 389 init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]);
575 390
576 rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, 391 rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
577 mempool_free_slab, request_cachep, 392 mempool_free_slab, request_cachep, q->node);
578 gfp_mask, q->node); 393
579 if (!rl->rq_pool) 394 if (!rl->rq_pool)
580 return -ENOMEM; 395 return -ENOMEM;
581 396
582 return 0; 397 return 0;
583} 398}
584 399
585void blk_exit_rl(struct request_list *rl)
586{
587 if (rl->rq_pool)
588 mempool_destroy(rl->rq_pool);
589}
590
591struct request_queue *blk_alloc_queue(gfp_t gfp_mask) 400struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
592{ 401{
593 return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE); 402 return blk_alloc_queue_node(gfp_mask, -1);
594} 403}
595EXPORT_SYMBOL(blk_alloc_queue); 404EXPORT_SYMBOL(blk_alloc_queue);
596 405
@@ -604,10 +413,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
604 if (!q) 413 if (!q)
605 return NULL; 414 return NULL;
606 415
607 q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
608 if (q->id < 0)
609 goto fail_q;
610
611 q->backing_dev_info.ra_pages = 416 q->backing_dev_info.ra_pages =
612 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 417 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
613 q->backing_dev_info.state = 0; 418 q->backing_dev_info.state = 0;
@@ -616,18 +421,20 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
616 q->node = node_id; 421 q->node = node_id;
617 422
618 err = bdi_init(&q->backing_dev_info); 423 err = bdi_init(&q->backing_dev_info);
619 if (err) 424 if (err) {
620 goto fail_id; 425 kmem_cache_free(blk_requestq_cachep, q);
426 return NULL;
427 }
428
429 if (blk_throtl_init(q)) {
430 kmem_cache_free(blk_requestq_cachep, q);
431 return NULL;
432 }
621 433
622 setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, 434 setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
623 laptop_mode_timer_fn, (unsigned long) q); 435 laptop_mode_timer_fn, (unsigned long) q);
624 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); 436 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
625 INIT_LIST_HEAD(&q->queue_head);
626 INIT_LIST_HEAD(&q->timeout_list); 437 INIT_LIST_HEAD(&q->timeout_list);
627 INIT_LIST_HEAD(&q->icq_list);
628#ifdef CONFIG_BLK_CGROUP
629 INIT_LIST_HEAD(&q->blkg_list);
630#endif
631 INIT_LIST_HEAD(&q->flush_queue[0]); 438 INIT_LIST_HEAD(&q->flush_queue[0]);
632 INIT_LIST_HEAD(&q->flush_queue[1]); 439 INIT_LIST_HEAD(&q->flush_queue[1]);
633 INIT_LIST_HEAD(&q->flush_data_in_flight); 440 INIT_LIST_HEAD(&q->flush_data_in_flight);
@@ -644,25 +451,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
644 */ 451 */
645 q->queue_lock = &q->__queue_lock; 452 q->queue_lock = &q->__queue_lock;
646 453
647 /*
648 * A queue starts its life with bypass turned on to avoid
649 * unnecessary bypass on/off overhead and nasty surprises during
650 * init. The initial bypass will be finished when the queue is
651 * registered by blk_register_queue().
652 */
653 q->bypass_depth = 1;
654 __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
655
656 if (blkcg_init_queue(q))
657 goto fail_id;
658
659 return q; 454 return q;
660
661fail_id:
662 ida_simple_remove(&blk_queue_ida, q->id);
663fail_q:
664 kmem_cache_free(blk_requestq_cachep, q);
665 return NULL;
666} 455}
667EXPORT_SYMBOL(blk_alloc_queue_node); 456EXPORT_SYMBOL(blk_alloc_queue_node);
668 457
@@ -701,7 +490,7 @@ EXPORT_SYMBOL(blk_alloc_queue_node);
701 490
702struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) 491struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
703{ 492{
704 return blk_init_queue_node(rfn, lock, NUMA_NO_NODE); 493 return blk_init_queue_node(rfn, lock, -1);
705} 494}
706EXPORT_SYMBOL(blk_init_queue); 495EXPORT_SYMBOL(blk_init_queue);
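A minimal sketch of the classic request_fn-style setup this API supports (all my_* names are hypothetical; a real driver would program hardware instead of completing requests immediately).

static DEFINE_SPINLOCK(my_queue_lock);

/* called by the block layer with my_queue_lock held */
static void my_request_fn(struct request_queue *q)
{
        struct request *rq;

        while ((rq = blk_fetch_request(q)) != NULL)
                __blk_end_request_all(rq, 0);   /* complete immediately */
}

static struct request_queue *my_create_queue(void)
{
        return blk_init_queue(my_request_fn, &my_queue_lock);
}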
707 496
@@ -729,13 +518,13 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
729 if (!q) 518 if (!q)
730 return NULL; 519 return NULL;
731 520
732 if (blk_init_rl(&q->root_rl, q, GFP_KERNEL)) 521 if (blk_init_free_list(q))
733 return NULL; 522 return NULL;
734 523
735 q->request_fn = rfn; 524 q->request_fn = rfn;
736 q->prep_rq_fn = NULL; 525 q->prep_rq_fn = NULL;
737 q->unprep_rq_fn = NULL; 526 q->unprep_rq_fn = NULL;
738 q->queue_flags |= QUEUE_FLAG_DEFAULT; 527 q->queue_flags = QUEUE_FLAG_DEFAULT;
739 528
740 /* Override internal queue lock with supplied lock pointer */ 529 /* Override internal queue lock with supplied lock pointer */
741 if (lock) 530 if (lock)
@@ -744,37 +533,61 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
744 /* 533 /*
745 * This also sets hw/phys segments, boundary and size 534 * This also sets hw/phys segments, boundary and size
746 */ 535 */
747 blk_queue_make_request(q, blk_queue_bio); 536 blk_queue_make_request(q, __make_request);
748 537
749 q->sg_reserved_size = INT_MAX; 538 q->sg_reserved_size = INT_MAX;
750 539
751 /* init elevator */ 540 /*
752 if (elevator_init(q, NULL)) 541 * all done
753 return NULL; 542 */
754 return q; 543 if (!elevator_init(q, NULL)) {
544 blk_queue_congestion_threshold(q);
545 return q;
546 }
547
548 return NULL;
755} 549}
756EXPORT_SYMBOL(blk_init_allocated_queue); 550EXPORT_SYMBOL(blk_init_allocated_queue);
757 551
758bool blk_get_queue(struct request_queue *q) 552int blk_get_queue(struct request_queue *q)
759{ 553{
760 if (likely(!blk_queue_dying(q))) { 554 if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
761 __blk_get_queue(q); 555 kobject_get(&q->kobj);
762 return true; 556 return 0;
763 } 557 }
764 558
765 return false; 559 return 1;
766} 560}
767EXPORT_SYMBOL(blk_get_queue); 561EXPORT_SYMBOL(blk_get_queue);
768 562
769static inline void blk_free_request(struct request_list *rl, struct request *rq) 563static inline void blk_free_request(struct request_queue *q, struct request *rq)
770{ 564{
771 if (rq->cmd_flags & REQ_ELVPRIV) { 565 if (rq->cmd_flags & REQ_ELVPRIV)
772 elv_put_request(rl->q, rq); 566 elv_put_request(q, rq);
773 if (rq->elv.icq) 567 mempool_free(rq, q->rq.rq_pool);
774 put_io_context(rq->elv.icq->ioc); 568}
569
570static struct request *
571blk_alloc_request(struct request_queue *q, int flags, int priv, gfp_t gfp_mask)
572{
573 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
574
575 if (!rq)
576 return NULL;
577
578 blk_rq_init(q, rq);
579
580 rq->cmd_flags = flags | REQ_ALLOCED;
581
582 if (priv) {
583 if (unlikely(elv_set_request(q, rq, gfp_mask))) {
584 mempool_free(rq, q->rq.rq_pool);
585 return NULL;
586 }
587 rq->cmd_flags |= REQ_ELVPRIV;
775 } 588 }
776 589
777 mempool_free(rq, rl->rq_pool); 590 return rq;
778} 591}
779 592
780/* 593/*
@@ -811,23 +624,18 @@ static void ioc_set_batching(struct request_queue *q, struct io_context *ioc)
811 ioc->last_waited = jiffies; 624 ioc->last_waited = jiffies;
812} 625}
813 626
814static void __freed_request(struct request_list *rl, int sync) 627static void __freed_request(struct request_queue *q, int sync)
815{ 628{
816 struct request_queue *q = rl->q; 629 struct request_list *rl = &q->rq;
817 630
818 /* 631 if (rl->count[sync] < queue_congestion_off_threshold(q))
819 * bdi isn't aware of blkcg yet. As all async IOs end up root
820 * blkcg anyway, just use root blkcg state.
821 */
822 if (rl == &q->root_rl &&
823 rl->count[sync] < queue_congestion_off_threshold(q))
824 blk_clear_queue_congested(q, sync); 632 blk_clear_queue_congested(q, sync);
825 633
826 if (rl->count[sync] + 1 <= q->nr_requests) { 634 if (rl->count[sync] + 1 <= q->nr_requests) {
827 if (waitqueue_active(&rl->wait[sync])) 635 if (waitqueue_active(&rl->wait[sync]))
828 wake_up(&rl->wait[sync]); 636 wake_up(&rl->wait[sync]);
829 637
830 blk_clear_rl_full(rl, sync); 638 blk_clear_queue_full(q, sync);
831 } 639 }
832} 640}
833 641
@@ -835,20 +643,18 @@ static void __freed_request(struct request_list *rl, int sync)
835 * A request has just been released. Account for it, update the full and 643 * A request has just been released. Account for it, update the full and
836 * congestion status, wake up any waiters. Called under q->queue_lock. 644 * congestion status, wake up any waiters. Called under q->queue_lock.
837 */ 645 */
838static void freed_request(struct request_list *rl, unsigned int flags) 646static void freed_request(struct request_queue *q, int sync, int priv)
839{ 647{
840 struct request_queue *q = rl->q; 648 struct request_list *rl = &q->rq;
841 int sync = rw_is_sync(flags);
842 649
843 q->nr_rqs[sync]--;
844 rl->count[sync]--; 650 rl->count[sync]--;
845 if (flags & REQ_ELVPRIV) 651 if (priv)
846 q->nr_rqs_elvpriv--; 652 rl->elvpriv--;
847 653
848 __freed_request(rl, sync); 654 __freed_request(q, sync);
849 655
850 if (unlikely(rl->starved[sync ^ 1])) 656 if (unlikely(rl->starved[sync ^ 1]))
851 __freed_request(rl, sync ^ 1); 657 __freed_request(q, sync ^ 1);
852} 658}
853 659
854/* 660/*
@@ -870,49 +676,19 @@ static bool blk_rq_should_init_elevator(struct bio *bio)
870 return true; 676 return true;
871} 677}
872 678
873/** 679/*
874 * rq_ioc - determine io_context for request allocation 680 * Get a free request, queue_lock must be held.
875 * @bio: request being allocated is for this bio (can be %NULL) 681 * Returns NULL on failure, with queue_lock held.
876 * 682 * Returns !NULL on success, with queue_lock *not held*.
877 * Determine io_context to use for request allocation for @bio. May return
878 * %NULL if %current->io_context doesn't exist.
879 */
880static struct io_context *rq_ioc(struct bio *bio)
881{
882#ifdef CONFIG_BLK_CGROUP
883 if (bio && bio->bi_ioc)
884 return bio->bi_ioc;
885#endif
886 return current->io_context;
887}
888
889/**
890 * __get_request - get a free request
891 * @rl: request list to allocate from
892 * @rw_flags: RW and SYNC flags
893 * @bio: bio to allocate request for (can be %NULL)
894 * @gfp_mask: allocation mask
895 *
896 * Get a free request from @q. This function may fail under memory
897 * pressure or if @q is dead.
898 *
 900 * Must be called with @q->queue_lock held and,
900 * Returns %NULL on failure, with @q->queue_lock held.
901 * Returns !%NULL on success, with @q->queue_lock *not held*.
902 */ 683 */
903static struct request *__get_request(struct request_list *rl, int rw_flags, 684static struct request *get_request(struct request_queue *q, int rw_flags,
904 struct bio *bio, gfp_t gfp_mask) 685 struct bio *bio, gfp_t gfp_mask)
905{ 686{
906 struct request_queue *q = rl->q; 687 struct request *rq = NULL;
907 struct request *rq; 688 struct request_list *rl = &q->rq;
908 struct elevator_type *et = q->elevator->type; 689 struct io_context *ioc = NULL;
909 struct io_context *ioc = rq_ioc(bio);
910 struct io_cq *icq = NULL;
911 const bool is_sync = rw_is_sync(rw_flags) != 0; 690 const bool is_sync = rw_is_sync(rw_flags) != 0;
912 int may_queue; 691 int may_queue, priv = 0;
913
914 if (unlikely(blk_queue_dying(q)))
915 return NULL;
916 692
917 may_queue = elv_may_queue(q, rw_flags); 693 may_queue = elv_may_queue(q, rw_flags);
918 if (may_queue == ELV_MQUEUE_NO) 694 if (may_queue == ELV_MQUEUE_NO)
@@ -920,15 +696,16 @@ static struct request *__get_request(struct request_list *rl, int rw_flags,
920 696
921 if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) { 697 if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) {
922 if (rl->count[is_sync]+1 >= q->nr_requests) { 698 if (rl->count[is_sync]+1 >= q->nr_requests) {
699 ioc = current_io_context(GFP_ATOMIC, q->node);
923 /* 700 /*
924 * The queue will fill after this allocation, so set 701 * The queue will fill after this allocation, so set
925 * it as full, and mark this process as "batching". 702 * it as full, and mark this process as "batching".
926 * This process will be allowed to complete a batch of 703 * This process will be allowed to complete a batch of
927 * requests, others will be blocked. 704 * requests, others will be blocked.
928 */ 705 */
929 if (!blk_rl_full(rl, is_sync)) { 706 if (!blk_queue_full(q, is_sync)) {
930 ioc_set_batching(q, ioc); 707 ioc_set_batching(q, ioc);
931 blk_set_rl_full(rl, is_sync); 708 blk_set_queue_full(q, is_sync);
932 } else { 709 } else {
933 if (may_queue != ELV_MQUEUE_MUST 710 if (may_queue != ELV_MQUEUE_MUST
934 && !ioc_batching(q, ioc)) { 711 && !ioc_batching(q, ioc)) {
@@ -937,16 +714,11 @@ static struct request *__get_request(struct request_list *rl, int rw_flags,
937 * process is not a "batcher", and not 714 * process is not a "batcher", and not
938 * exempted by the IO scheduler 715 * exempted by the IO scheduler
939 */ 716 */
940 return NULL; 717 goto out;
941 } 718 }
942 } 719 }
943 } 720 }
944 /* 721 blk_set_queue_congested(q, is_sync);
945 * bdi isn't aware of blkcg yet. As all async IOs end up
946 * root blkcg anyway, just use root blkcg state.
947 */
948 if (rl == &q->root_rl)
949 blk_set_queue_congested(q, is_sync);
950 } 722 }
951 723
952 /* 724 /*
@@ -955,60 +727,47 @@ static struct request *__get_request(struct request_list *rl, int rw_flags,
955 * allocated with any setting of ->nr_requests 727 * allocated with any setting of ->nr_requests
956 */ 728 */
957 if (rl->count[is_sync] >= (3 * q->nr_requests / 2)) 729 if (rl->count[is_sync] >= (3 * q->nr_requests / 2))
958 return NULL; 730 goto out;
959 731
960 q->nr_rqs[is_sync]++;
961 rl->count[is_sync]++; 732 rl->count[is_sync]++;
962 rl->starved[is_sync] = 0; 733 rl->starved[is_sync] = 0;
963 734
964 /* 735 if (blk_rq_should_init_elevator(bio)) {
965 * Decide whether the new request will be managed by elevator. If 736 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
966 * so, mark @rw_flags and increment elvpriv. Non-zero elvpriv will 737 if (priv)
967 * prevent the current elevator from being destroyed until the new 738 rl->elvpriv++;
968 * request is freed. This guarantees icq's won't be destroyed and
969 * makes creating new ones safe.
970 *
971 * Also, lookup icq while holding queue_lock. If it doesn't exist,
972 * it will be created after releasing queue_lock.
973 */
974 if (blk_rq_should_init_elevator(bio) && !blk_queue_bypass(q)) {
975 rw_flags |= REQ_ELVPRIV;
976 q->nr_rqs_elvpriv++;
977 if (et->icq_cache && ioc)
978 icq = ioc_lookup_icq(ioc, q);
979 } 739 }
980 740
981 if (blk_queue_io_stat(q)) 741 if (blk_queue_io_stat(q))
982 rw_flags |= REQ_IO_STAT; 742 rw_flags |= REQ_IO_STAT;
983 spin_unlock_irq(q->queue_lock); 743 spin_unlock_irq(q->queue_lock);
984 744
985 /* allocate and init request */ 745 rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);
986 rq = mempool_alloc(rl->rq_pool, gfp_mask); 746 if (unlikely(!rq)) {
987 if (!rq) 747 /*
988 goto fail_alloc; 748 * Allocation failed presumably due to memory. Undo anything
989 749 * we might have messed up.
990 blk_rq_init(q, rq); 750 *
991 blk_rq_set_rl(rq, rl); 751 * Allocating task should really be put onto the front of the
992 rq->cmd_flags = rw_flags | REQ_ALLOCED; 752 * wait queue, but this is pretty rare.
993 753 */
994 /* init elvpriv */ 754 spin_lock_irq(q->queue_lock);
995 if (rw_flags & REQ_ELVPRIV) { 755 freed_request(q, is_sync, priv);
996 if (unlikely(et->icq_cache && !icq)) {
997 if (ioc)
998 icq = ioc_create_icq(ioc, q, gfp_mask);
999 if (!icq)
1000 goto fail_elvpriv;
1001 }
1002 756
1003 rq->elv.icq = icq; 757 /*
1004 if (unlikely(elv_set_request(q, rq, bio, gfp_mask))) 758 * in the very unlikely event that allocation failed and no
1005 goto fail_elvpriv; 759 * requests for this direction was pending, mark us starved
760 * so that freeing of a request in the other direction will
 761 * notice us. Another possible fix would be to split the
762 * rq mempool into READ and WRITE
763 */
764rq_starved:
765 if (unlikely(rl->count[is_sync] == 0))
766 rl->starved[is_sync] = 1;
1006 767
1007 /* @rq->elv.icq holds io_context until @rq is freed */ 768 goto out;
1008 if (icq)
1009 get_io_context(icq->ioc);
1010 } 769 }
1011out: 770
1012 /* 771 /*
1013 * ioc may be NULL here, and ioc_batching will be false. That's 772 * ioc may be NULL here, and ioc_batching will be false. That's
1014 * OK, if the queue is under the request limit then requests need 773 * OK, if the queue is under the request limit then requests need
@@ -1019,118 +778,71 @@ out:
1019 ioc->nr_batch_requests--; 778 ioc->nr_batch_requests--;
1020 779
1021 trace_block_getrq(q, bio, rw_flags & 1); 780 trace_block_getrq(q, bio, rw_flags & 1);
781out:
1022 return rq; 782 return rq;
1023
1024fail_elvpriv:
1025 /*
1026 * elvpriv init failed. ioc, icq and elvpriv aren't mempool backed
1027 * and may fail indefinitely under memory pressure and thus
1028 * shouldn't stall IO. Treat this request as !elvpriv. This will
1029 * disturb iosched and blkcg but weird is bettern than dead.
1030 */
1031 printk_ratelimited(KERN_WARNING "%s: request aux data allocation failed, iosched may be disturbed\n",
1032 dev_name(q->backing_dev_info.dev));
1033
1034 rq->cmd_flags &= ~REQ_ELVPRIV;
1035 rq->elv.icq = NULL;
1036
1037 spin_lock_irq(q->queue_lock);
1038 q->nr_rqs_elvpriv--;
1039 spin_unlock_irq(q->queue_lock);
1040 goto out;
1041
1042fail_alloc:
1043 /*
1044 * Allocation failed presumably due to memory. Undo anything we
1045 * might have messed up.
1046 *
1047 * Allocating task should really be put onto the front of the wait
1048 * queue, but this is pretty rare.
1049 */
1050 spin_lock_irq(q->queue_lock);
1051 freed_request(rl, rw_flags);
1052
1053 /*
1054 * in the very unlikely event that allocation failed and no
1055 * requests for this direction were pending, mark us starved so that
1056 * freeing of a request in the other direction will notice
1057 * us. Another possible fix would be to split the rq mempool into
1058 * READ and WRITE
1059 */
1060rq_starved:
1061 if (unlikely(rl->count[is_sync] == 0))
1062 rl->starved[is_sync] = 1;
1063 return NULL;
1064} 783}
1065 784
1066/** 785/*
1067 * get_request - get a free request 786 * No available requests for this queue, wait for some requests to become
1068 * @q: request_queue to allocate request from 787 * available.
1069 * @rw_flags: RW and SYNC flags 788 *
1070 * @bio: bio to allocate request for (can be %NULL) 789 * Called with q->queue_lock held, and returns with it unlocked.
1071 * @gfp_mask: allocation mask
1072 *
1073 * Get a free request from @q. If %__GFP_WAIT is set in @gfp_mask, this
1074 * function keeps retrying under memory pressure and fails iff @q is dead.
1075 *
1076 * Must be called with @q->queue_lock held and,
1077 * Returns %NULL on failure, with @q->queue_lock held.
1078 * Returns !%NULL on success, with @q->queue_lock *not held*.
1079 */ 790 */
1080static struct request *get_request(struct request_queue *q, int rw_flags, 791static struct request *get_request_wait(struct request_queue *q, int rw_flags,
1081 struct bio *bio, gfp_t gfp_mask) 792 struct bio *bio)
1082{ 793{
1083 const bool is_sync = rw_is_sync(rw_flags) != 0; 794 const bool is_sync = rw_is_sync(rw_flags) != 0;
1084 DEFINE_WAIT(wait);
1085 struct request_list *rl;
1086 struct request *rq; 795 struct request *rq;
1087 796
1088 rl = blk_get_rl(q, bio); /* transferred to @rq on success */ 797 rq = get_request(q, rw_flags, bio, GFP_NOIO);
1089retry: 798 while (!rq) {
1090 rq = __get_request(rl, rw_flags, bio, gfp_mask); 799 DEFINE_WAIT(wait);
1091 if (rq) 800 struct io_context *ioc;
1092 return rq; 801 struct request_list *rl = &q->rq;
1093 802
1094 if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dying(q))) { 803 prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
1095 blk_put_rl(rl); 804 TASK_UNINTERRUPTIBLE);
1096 return NULL;
1097 }
1098 805
1099 /* wait on @rl and retry */ 806 trace_block_sleeprq(q, bio, rw_flags & 1);
1100 prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
1101 TASK_UNINTERRUPTIBLE);
1102 807
1103 trace_block_sleeprq(q, bio, rw_flags & 1); 808 spin_unlock_irq(q->queue_lock);
809 io_schedule();
1104 810
1105 spin_unlock_irq(q->queue_lock); 811 /*
1106 io_schedule(); 812 * After sleeping, we become a "batching" process and
813 * will be able to allocate at least one request, and
814 * up to a big batch of them for a small period time.
815 * See ioc_batching, ioc_set_batching
816 */
817 ioc = current_io_context(GFP_NOIO, q->node);
818 ioc_set_batching(q, ioc);
1107 819
1108 /* 820 spin_lock_irq(q->queue_lock);
1109 * After sleeping, we become a "batching" process and will be able 821 finish_wait(&rl->wait[is_sync], &wait);
1110 * to allocate at least one request, and up to a big batch of them
1111 * for a small period of time. See ioc_batching, ioc_set_batching
1112 */
1113 ioc_set_batching(q, current->io_context);
1114 822
1115 spin_lock_irq(q->queue_lock); 823 rq = get_request(q, rw_flags, bio, GFP_NOIO);
1116 finish_wait(&rl->wait[is_sync], &wait); 824 };
1117 825
1118 goto retry; 826 return rq;
1119} 827}
1120 828
1121struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) 829struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
1122{ 830{
1123 struct request *rq; 831 struct request *rq;
1124 832
1125 BUG_ON(rw != READ && rw != WRITE); 833 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
834 return NULL;
1126 835
1127 /* create ioc upfront */ 836 BUG_ON(rw != READ && rw != WRITE);
1128 create_io_context(gfp_mask, q->node);
1129 837
1130 spin_lock_irq(q->queue_lock); 838 spin_lock_irq(q->queue_lock);
1131 rq = get_request(q, rw, NULL, gfp_mask); 839 if (gfp_mask & __GFP_WAIT) {
1132 if (!rq) 840 rq = get_request_wait(q, rw, NULL);
1133 spin_unlock_irq(q->queue_lock); 841 } else {
842 rq = get_request(q, rw, NULL, gfp_mask);
843 if (!rq)
844 spin_unlock_irq(q->queue_lock);
845 }
1134 /* q->queue_lock is unlocked at this point */ 846 /* q->queue_lock is unlocked at this point */
1135 847
1136 return rq; 848 return rq;
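A hedged usage sketch for blk_get_request() (my_send_special() is hypothetical, and a real caller would also map a data buffer before executing the request): allocate a pass-through request, mark it special, and fire it synchronously.

static int my_send_special(struct request_queue *q)
{
        struct request *rq = blk_get_request(q, WRITE, GFP_KERNEL);
        int err;

        if (!rq)
                return -ENOMEM;
        rq->cmd_type = REQ_TYPE_SPECIAL;
        err = blk_execute_rq(q, NULL, rq, 0);   /* insert, run the queue, wait */
        blk_put_request(rq);
        return err;
}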
@@ -1224,6 +936,54 @@ static void add_acct_request(struct request_queue *q, struct request *rq,
1224 __elv_add_request(q, rq, where); 936 __elv_add_request(q, rq, where);
1225} 937}
1226 938
939/**
940 * blk_insert_request - insert a special request into a request queue
941 * @q: request queue where request should be inserted
942 * @rq: request to be inserted
943 * @at_head: insert request at head or tail of queue
944 * @data: private data
945 *
946 * Description:
947 * Many block devices need to execute commands asynchronously, so they don't
948 * block the whole kernel from preemption during request execution. This is
 949 * accomplished normally by inserting artificial requests tagged as
 950 * REQ_TYPE_SPECIAL into the corresponding request queue, and letting them
951 * be scheduled for actual execution by the request queue.
952 *
 953 * We have the option of inserting at the head or the tail of the queue.
954 * Typically we use the tail for new ioctls and so forth. We use the head
955 * of the queue for things like a QUEUE_FULL message from a device, or a
956 * host that is unable to accept a particular command.
957 */
958void blk_insert_request(struct request_queue *q, struct request *rq,
959 int at_head, void *data)
960{
961 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
962 unsigned long flags;
963
964 /*
965 * tell I/O scheduler that this isn't a regular read/write (ie it
966 * must not attempt merges on this) and that it acts as a soft
967 * barrier
968 */
969 rq->cmd_type = REQ_TYPE_SPECIAL;
970
971 rq->special = data;
972
973 spin_lock_irqsave(q->queue_lock, flags);
974
975 /*
976 * If command is tagged, release the tag
977 */
978 if (blk_rq_tagged(rq))
979 blk_queue_end_tag(q, rq);
980
981 add_acct_request(q, rq, where);
982 __blk_run_queue(q);
983 spin_unlock_irqrestore(q->queue_lock, flags);
984}
985EXPORT_SYMBOL(blk_insert_request);
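For illustration only (rq is assumed to have been prepared earlier and my_dev is hypothetical driver data): pushing such a special request to the head of the queue, e.g. after a QUEUE_FULL-style condition as the comment above describes.

static void my_requeue_at_head(struct request_queue *q, struct request *rq,
                               struct my_dev *dev)
{
        blk_insert_request(q, rq, 1 /* at_head */, dev);
}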
986
1227static void part_round_stats_single(int cpu, struct hd_struct *part, 987static void part_round_stats_single(int cpu, struct hd_struct *part,
1228 unsigned long now) 988 unsigned long now)
1229{ 989{
@@ -1284,15 +1044,14 @@ void __blk_put_request(struct request_queue *q, struct request *req)
1284 * it didn't come out of our reserved rq pools 1044 * it didn't come out of our reserved rq pools
1285 */ 1045 */
1286 if (req->cmd_flags & REQ_ALLOCED) { 1046 if (req->cmd_flags & REQ_ALLOCED) {
1287 unsigned int flags = req->cmd_flags; 1047 int is_sync = rq_is_sync(req) != 0;
1288 struct request_list *rl = blk_rq_rl(req); 1048 int priv = req->cmd_flags & REQ_ELVPRIV;
1289 1049
1290 BUG_ON(!list_empty(&req->queuelist)); 1050 BUG_ON(!list_empty(&req->queuelist));
1291 BUG_ON(!hlist_unhashed(&req->hash)); 1051 BUG_ON(!hlist_unhashed(&req->hash));
1292 1052
1293 blk_free_request(rl, req); 1053 blk_free_request(q, req);
1294 freed_request(rl, flags); 1054 freed_request(q, is_sync, priv);
1295 blk_put_rl(rl);
1296 } 1055 }
1297} 1056}
1298EXPORT_SYMBOL_GPL(__blk_put_request); 1057EXPORT_SYMBOL_GPL(__blk_put_request);
@@ -1359,6 +1118,7 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
1359 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); 1118 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
1360 1119
1361 drive_stat_acct(req, 0); 1120 drive_stat_acct(req, 0);
1121 elv_bio_merged(q, req, bio);
1362 return true; 1122 return true;
1363} 1123}
1364 1124
@@ -1389,34 +1149,22 @@ static bool bio_attempt_front_merge(struct request_queue *q,
1389 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); 1149 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
1390 1150
1391 drive_stat_acct(req, 0); 1151 drive_stat_acct(req, 0);
1152 elv_bio_merged(q, req, bio);
1392 return true; 1153 return true;
1393} 1154}
1394 1155
1395/** 1156/*
1396 * attempt_plug_merge - try to merge with %current's plugged list 1157 * Attempts to merge with the plugged list in the current process. Returns
1397 * @q: request_queue new bio is being queued at 1158 * true if merge was successful, otherwise false.
1398 * @bio: new bio being queued
1399 * @request_count: out parameter for number of traversed plugged requests
1400 *
1401 * Determine whether @bio being queued on @q can be merged with a request
1402 * on %current's plugged list. Returns %true if merge was successful,
1403 * otherwise %false.
1404 *
1405 * Plugging coalesces IOs from the same issuer for the same purpose without
1406 * going through @q->queue_lock. As such it's more of an issuing mechanism
1407 * than scheduling, and the request, while it may have elvpriv data, is not
1408 * added on the elevator at this point. In addition, we don't have
1409 * reliable access to the elevator outside queue lock. Only check basic
1410 * merging parameters without querying the elevator.
1411 */ 1159 */
1412static bool attempt_plug_merge(struct request_queue *q, struct bio *bio, 1160static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q,
1413 unsigned int *request_count) 1161 struct bio *bio, unsigned int *request_count)
1414{ 1162{
1415 struct blk_plug *plug; 1163 struct blk_plug *plug;
1416 struct request *rq; 1164 struct request *rq;
1417 bool ret = false; 1165 bool ret = false;
1418 1166
1419 plug = current->plug; 1167 plug = tsk->plug;
1420 if (!plug) 1168 if (!plug)
1421 goto out; 1169 goto out;
1422 *request_count = 0; 1170 *request_count = 0;
@@ -1424,13 +1172,12 @@ static bool attempt_plug_merge(struct request_queue *q, struct bio *bio,
1424 list_for_each_entry_reverse(rq, &plug->list, queuelist) { 1172 list_for_each_entry_reverse(rq, &plug->list, queuelist) {
1425 int el_ret; 1173 int el_ret;
1426 1174
1427 if (rq->q == q) 1175 (*request_count)++;
1428 (*request_count)++;
1429 1176
1430 if (rq->q != q || !blk_rq_merge_ok(rq, bio)) 1177 if (rq->q != q)
1431 continue; 1178 continue;
1432 1179
1433 el_ret = blk_try_merge(rq, bio); 1180 el_ret = elv_try_merge(rq, bio);
1434 if (el_ret == ELEVATOR_BACK_MERGE) { 1181 if (el_ret == ELEVATOR_BACK_MERGE) {
1435 ret = bio_attempt_back_merge(q, rq, bio); 1182 ret = bio_attempt_back_merge(q, rq, bio);
1436 if (ret) 1183 if (ret)
@@ -1447,6 +1194,7 @@ out:
1447 1194
1448void init_request_from_bio(struct request *req, struct bio *bio) 1195void init_request_from_bio(struct request *req, struct bio *bio)
1449{ 1196{
1197 req->cpu = bio->bi_comp_cpu;
1450 req->cmd_type = REQ_TYPE_FS; 1198 req->cmd_type = REQ_TYPE_FS;
1451 1199
1452 req->cmd_flags |= bio->bi_rw & REQ_COMMON_MASK; 1200 req->cmd_flags |= bio->bi_rw & REQ_COMMON_MASK;
@@ -1459,7 +1207,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
1459 blk_rq_bio_prep(req->q, req, bio); 1207 blk_rq_bio_prep(req->q, req, bio);
1460} 1208}
1461 1209
1462void blk_queue_bio(struct request_queue *q, struct bio *bio) 1210static int __make_request(struct request_queue *q, struct bio *bio)
1463{ 1211{
1464 const bool sync = !!(bio->bi_rw & REQ_SYNC); 1212 const bool sync = !!(bio->bi_rw & REQ_SYNC);
1465 struct blk_plug *plug; 1213 struct blk_plug *plug;
@@ -1484,22 +1232,20 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio)
1484 * Check if we can merge with the plugged list before grabbing 1232 * Check if we can merge with the plugged list before grabbing
1485 * any locks. 1233 * any locks.
1486 */ 1234 */
1487 if (attempt_plug_merge(q, bio, &request_count)) 1235 if (attempt_plug_merge(current, q, bio, &request_count))
1488 return; 1236 goto out;
1489 1237
1490 spin_lock_irq(q->queue_lock); 1238 spin_lock_irq(q->queue_lock);
1491 1239
1492 el_ret = elv_merge(q, &req, bio); 1240 el_ret = elv_merge(q, &req, bio);
1493 if (el_ret == ELEVATOR_BACK_MERGE) { 1241 if (el_ret == ELEVATOR_BACK_MERGE) {
1494 if (bio_attempt_back_merge(q, req, bio)) { 1242 if (bio_attempt_back_merge(q, req, bio)) {
1495 elv_bio_merged(q, req, bio);
1496 if (!attempt_back_merge(q, req)) 1243 if (!attempt_back_merge(q, req))
1497 elv_merged_request(q, req, el_ret); 1244 elv_merged_request(q, req, el_ret);
1498 goto out_unlock; 1245 goto out_unlock;
1499 } 1246 }
1500 } else if (el_ret == ELEVATOR_FRONT_MERGE) { 1247 } else if (el_ret == ELEVATOR_FRONT_MERGE) {
1501 if (bio_attempt_front_merge(q, req, bio)) { 1248 if (bio_attempt_front_merge(q, req, bio)) {
1502 elv_bio_merged(q, req, bio);
1503 if (!attempt_front_merge(q, req)) 1249 if (!attempt_front_merge(q, req))
1504 elv_merged_request(q, req, el_ret); 1250 elv_merged_request(q, req, el_ret);
1505 goto out_unlock; 1251 goto out_unlock;
@@ -1520,11 +1266,7 @@ get_rq:
1520 * Grab a free request. This might sleep but cannot fail. 1266 * Grab a free request. This might sleep but cannot fail.
1521 * Returns with the queue unlocked. 1267 * Returns with the queue unlocked.
1522 */ 1268 */
1523 req = get_request(q, rw_flags, bio, GFP_NOIO); 1269 req = get_request_wait(q, rw_flags, bio);
1524 if (unlikely(!req)) {
1525 bio_endio(bio, -ENODEV); /* @q is dead */
1526 goto out_unlock;
1527 }
1528 1270
1529 /* 1271 /*
1530 * After dropping the lock and possibly sleeping here, our request 1272 * After dropping the lock and possibly sleeping here, our request
@@ -1534,7 +1276,8 @@ get_rq:
1534 */ 1276 */
1535 init_request_from_bio(req, bio); 1277 init_request_from_bio(req, bio);
1536 1278
1537 if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) 1279 if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
1280 bio_flagged(bio, BIO_CPU_AFFINE))
1538 req->cpu = raw_smp_processor_id(); 1281 req->cpu = raw_smp_processor_id();
1539 1282
1540 plug = current->plug; 1283 plug = current->plug;
@@ -1547,19 +1290,15 @@ get_rq:
1547 */ 1290 */
1548 if (list_empty(&plug->list)) 1291 if (list_empty(&plug->list))
1549 trace_block_plug(q); 1292 trace_block_plug(q);
1550 else { 1293 else if (!plug->should_sort) {
1551 if (!plug->should_sort) { 1294 struct request *__rq;
1552 struct request *__rq;
1553 1295
1554 __rq = list_entry_rq(plug->list.prev); 1296 __rq = list_entry_rq(plug->list.prev);
1555 if (__rq->q != q) 1297 if (__rq->q != q)
1556 plug->should_sort = 1; 1298 plug->should_sort = 1;
1557 }
1558 if (request_count >= BLK_MAX_REQUEST_COUNT) {
1559 blk_flush_plug_list(plug, false);
1560 trace_block_plug(q);
1561 }
1562 } 1299 }
1300 if (request_count >= BLK_MAX_REQUEST_COUNT)
1301 blk_flush_plug_list(plug, false);
1563 list_add_tail(&req->queuelist, &plug->list); 1302 list_add_tail(&req->queuelist, &plug->list);
1564 drive_stat_acct(req, 1); 1303 drive_stat_acct(req, 1);
1565 } else { 1304 } else {
@@ -1569,8 +1308,9 @@ get_rq:
1569out_unlock: 1308out_unlock:
1570 spin_unlock_irq(q->queue_lock); 1309 spin_unlock_irq(q->queue_lock);
1571 } 1310 }
1311out:
1312 return 0;
1572} 1313}
1573EXPORT_SYMBOL_GPL(blk_queue_bio); /* for device mapper only */
1574 1314
1575/* 1315/*
1576 * If bio->bi_dev is a partition, remap the location 1316 * If bio->bi_dev is a partition, remap the location
@@ -1669,147 +1409,165 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
1669 return 0; 1409 return 0;
1670} 1410}
1671 1411
1672static noinline_for_stack bool 1412/**
1673generic_make_request_checks(struct bio *bio) 1413 * generic_make_request - hand a buffer to its device driver for I/O
1414 * @bio: The bio describing the location in memory and on the device.
1415 *
1416 * generic_make_request() is used to make I/O requests of block
1417 * devices. It is passed a &struct bio, which describes the I/O that needs
1418 * to be done.
1419 *
1420 * generic_make_request() does not return any status. The
1421 * success/failure status of the request, along with notification of
1422 * completion, is delivered asynchronously through the bio->bi_end_io
1423 * function described (one day) elsewhere.
1424 *
1425 * The caller of generic_make_request must make sure that bi_io_vec
1426 * are set to describe the memory buffer, and that bi_dev and bi_sector are
1427 * set to describe the device address, and the
1428 * bi_end_io and optionally bi_private are set to describe how
1429 * completion notification should be signaled.
1430 *
1431 * generic_make_request and the drivers it calls may use bi_next if this
1432 * bio happens to be merged with someone else, and may change bi_dev and
1433 * bi_sector for remaps as it sees fit. So the values of these fields
1434 * should NOT be depended on after the call to generic_make_request.
1435 */
1436static inline void __generic_make_request(struct bio *bio)
1674{ 1437{
1675 struct request_queue *q; 1438 struct request_queue *q;
1676 int nr_sectors = bio_sectors(bio); 1439 sector_t old_sector;
1440 int ret, nr_sectors = bio_sectors(bio);
1441 dev_t old_dev;
1677 int err = -EIO; 1442 int err = -EIO;
1678 char b[BDEVNAME_SIZE];
1679 struct hd_struct *part;
1680 1443
1681 might_sleep(); 1444 might_sleep();
1682 1445
1683 if (bio_check_eod(bio, nr_sectors)) 1446 if (bio_check_eod(bio, nr_sectors))
1684 goto end_io; 1447 goto end_io;
1685 1448
1686 q = bdev_get_queue(bio->bi_bdev); 1449 /*
1687 if (unlikely(!q)) { 1450 * Resolve the mapping until finished. (drivers are
1688 printk(KERN_ERR 1451 * still free to implement/resolve their own stacking
1689 "generic_make_request: Trying to access " 1452 * by explicitly returning 0)
1690 "nonexistent block-device %s (%Lu)\n", 1453 *
1691 bdevname(bio->bi_bdev, b), 1454 * NOTE: we don't repeat the blk_size check for each new device.
1692 (long long) bio->bi_sector); 1455 * Stacking drivers are expected to know what they are doing.
1693 goto end_io; 1456 */
1694 } 1457 old_sector = -1;
1458 old_dev = 0;
1459 do {
1460 char b[BDEVNAME_SIZE];
1461 struct hd_struct *part;
1695 1462
1696 if (likely(bio_is_rw(bio) && 1463 q = bdev_get_queue(bio->bi_bdev);
1697 nr_sectors > queue_max_hw_sectors(q))) { 1464 if (unlikely(!q)) {
1698 printk(KERN_ERR "bio too big device %s (%u > %u)\n", 1465 printk(KERN_ERR
1699 bdevname(bio->bi_bdev, b), 1466 "generic_make_request: Trying to access "
1700 bio_sectors(bio), 1467 "nonexistent block-device %s (%Lu)\n",
1701 queue_max_hw_sectors(q)); 1468 bdevname(bio->bi_bdev, b),
1702 goto end_io; 1469 (long long) bio->bi_sector);
1703 } 1470 goto end_io;
1471 }
1704 1472
1705 part = bio->bi_bdev->bd_part; 1473 if (unlikely(!(bio->bi_rw & REQ_DISCARD) &&
1706 if (should_fail_request(part, bio->bi_size) || 1474 nr_sectors > queue_max_hw_sectors(q))) {
1707 should_fail_request(&part_to_disk(part)->part0, 1475 printk(KERN_ERR "bio too big device %s (%u > %u)\n",
1708 bio->bi_size)) 1476 bdevname(bio->bi_bdev, b),
1709 goto end_io; 1477 bio_sectors(bio),
1478 queue_max_hw_sectors(q));
1479 goto end_io;
1480 }
1710 1481
1711 /* 1482 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
1712 * If this device has partitions, remap block n 1483 goto end_io;
1713 * of partition p to block n+start(p) of the disk.
1714 */
1715 blk_partition_remap(bio);
1716 1484
1717 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) 1485 part = bio->bi_bdev->bd_part;
1718 goto end_io; 1486 if (should_fail_request(part, bio->bi_size) ||
1487 should_fail_request(&part_to_disk(part)->part0,
1488 bio->bi_size))
1489 goto end_io;
1719 1490
1720 if (bio_check_eod(bio, nr_sectors)) 1491 /*
1721 goto end_io; 1492 * If this device has partitions, remap block n
1493 * of partition p to block n+start(p) of the disk.
1494 */
1495 blk_partition_remap(bio);
1722 1496
1723 /* 1497 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio))
1724 * Filter flush bio's early so that make_request based 1498 goto end_io;
1725 * drivers without flush support don't have to worry 1499
1726 * about them. 1500 if (old_sector != -1)
1727 */ 1501 trace_block_bio_remap(q, bio, old_dev, old_sector);
1728 if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) { 1502
1729 bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA); 1503 old_sector = bio->bi_sector;
1730 if (!nr_sectors) { 1504 old_dev = bio->bi_bdev->bd_dev;
1731 err = 0; 1505
1506 if (bio_check_eod(bio, nr_sectors))
1732 goto end_io; 1507 goto end_io;
1508
1509 /*
1510 * Filter flush bio's early so that make_request based
1511 * drivers without flush support don't have to worry
1512 * about them.
1513 */
1514 if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
1515 bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
1516 if (!nr_sectors) {
1517 err = 0;
1518 goto end_io;
1519 }
1733 } 1520 }
1734 }
1735 1521
1736 if ((bio->bi_rw & REQ_DISCARD) && 1522 if ((bio->bi_rw & REQ_DISCARD) &&
1737 (!blk_queue_discard(q) || 1523 (!blk_queue_discard(q) ||
1738 ((bio->bi_rw & REQ_SECURE) && !blk_queue_secdiscard(q)))) { 1524 ((bio->bi_rw & REQ_SECURE) &&
1739 err = -EOPNOTSUPP; 1525 !blk_queue_secdiscard(q)))) {
1740 goto end_io; 1526 err = -EOPNOTSUPP;
1741 } 1527 goto end_io;
1528 }
1742 1529
1743 if (bio->bi_rw & REQ_WRITE_SAME && !bdev_write_same(bio->bi_bdev)) { 1530 if (blk_throtl_bio(q, &bio))
1744 err = -EOPNOTSUPP; 1531 goto end_io;
1745 goto end_io;
1746 }
1747 1532
1748 /* 1533 /*
1749 * Various block parts want %current->io_context and lazy ioc 1534 * If bio = NULL, bio has been throttled and will be submitted
1750 * allocation ends up trading a lot of pain for a small amount of 1535 * later.
1751 * memory. Just allocate it upfront. This may fail and block 1536 */
1752 * layer knows how to live with it. 1537 if (!bio)
1753 */ 1538 break;
1754 create_io_context(GFP_ATOMIC, q->node);
1755 1539
1756 if (blk_throtl_bio(q, bio)) 1540 trace_block_bio_queue(q, bio);
1757 return false; /* throttled, will be resubmitted later */
1758 1541
1759 trace_block_bio_queue(q, bio); 1542 ret = q->make_request_fn(q, bio);
1760 return true; 1543 } while (ret);
1544
1545 return;
1761 1546
1762end_io: 1547end_io:
1763 bio_endio(bio, err); 1548 bio_endio(bio, err);
1764 return false;
1765} 1549}
1766 1550
1767/** 1551/*
1768 * generic_make_request - hand a buffer to its device driver for I/O 1552 * We only want one ->make_request_fn to be active at a time,
1769 * @bio: The bio describing the location in memory and on the device. 1553 * else stack usage with stacked devices could be a problem.
1770 * 1554 * So use current->bio_list to keep a list of requests
1771 * generic_make_request() is used to make I/O requests of block 1555 * submited by a make_request_fn function.
1772 * devices. It is passed a &struct bio, which describes the I/O that needs 1556 * current->bio_list is also used as a flag to say if
1773 * to be done. 1557 * generic_make_request is currently active in this task or not.
1774 * 1558 * If it is NULL, then no make_request is active. If it is non-NULL,
1775 * generic_make_request() does not return any status. The 1559 * then a make_request is active, and new requests should be added
1776 * success/failure status of the request, along with notification of 1560 * at the tail
1777 * completion, is delivered asynchronously through the bio->bi_end_io
1778 * function described (one day) elsewhere.
1779 *
1780 * The caller of generic_make_request must make sure that bi_io_vec
1781 * are set to describe the memory buffer, and that bi_dev and bi_sector are
1782 * set to describe the device address, and the
1783 * bi_end_io and optionally bi_private are set to describe how
1784 * completion notification should be signaled.
1785 *
1786 * generic_make_request and the drivers it calls may use bi_next if this
1787 * bio happens to be merged with someone else, and may resubmit the bio to
1788 * a lower device by calling into generic_make_request recursively, which
1789 * means the bio should NOT be touched after the call to ->make_request_fn.
1790 */ 1561 */
1791void generic_make_request(struct bio *bio) 1562void generic_make_request(struct bio *bio)
1792{ 1563{
1793 struct bio_list bio_list_on_stack; 1564 struct bio_list bio_list_on_stack;
1794 1565
1795 if (!generic_make_request_checks(bio))
1796 return;
1797
1798 /*
1799 * We only want one ->make_request_fn to be active at a time, else
1800 * stack usage with stacked devices could be a problem. So use
1801 * current->bio_list to keep a list of requests submitted by a
1802 * make_request_fn function. current->bio_list is also used as a
1803 * flag to say if generic_make_request is currently active in this
1804 * task or not. If it is NULL, then no make_request is active. If
1805 * it is non-NULL, then a make_request is active, and new requests
1806 * should be added at the tail
1807 */
1808 if (current->bio_list) { 1566 if (current->bio_list) {
1567 /* make_request is active */
1809 bio_list_add(current->bio_list, bio); 1568 bio_list_add(current->bio_list, bio);
1810 return; 1569 return;
1811 } 1570 }
1812
1813 /* following loop may be a bit non-obvious, and so deserves some 1571 /* following loop may be a bit non-obvious, and so deserves some
1814 * explanation. 1572 * explanation.
1815 * Before entering the loop, bio->bi_next is NULL (as all callers 1573 * Before entering the loop, bio->bi_next is NULL (as all callers
@@ -1817,21 +1575,22 @@ void generic_make_request(struct bio *bio)
1817 * We pretend that we have just taken it off a longer list, so 1575 * We pretend that we have just taken it off a longer list, so
1818 * we assign bio_list to a pointer to the bio_list_on_stack, 1576 * we assign bio_list to a pointer to the bio_list_on_stack,
1819 * thus initialising the bio_list of new bios to be 1577 * thus initialising the bio_list of new bios to be
1820 * added. ->make_request() may indeed add some more bios 1578 * added. __generic_make_request may indeed add some more bios
1821 * through a recursive call to generic_make_request. If it 1579 * through a recursive call to generic_make_request. If it
1822 * did, we find a non-NULL value in bio_list and re-enter the loop 1580 * did, we find a non-NULL value in bio_list and re-enter the loop
1823 * from the top. In this case we really did just take the bio 1581 * from the top. In this case we really did just take the bio
1824 * of the top of the list (no pretending) and so remove it from 1582 * of the top of the list (no pretending) and so remove it from
1825 * bio_list, and call into ->make_request() again. 1583 * bio_list, and call into __generic_make_request again.
1584 *
1585 * The loop was structured like this to make only one call to
1586 * __generic_make_request (which is important as it is large and
1587 * inlined) and to keep the structure simple.
1826 */ 1588 */
1827 BUG_ON(bio->bi_next); 1589 BUG_ON(bio->bi_next);
1828 bio_list_init(&bio_list_on_stack); 1590 bio_list_init(&bio_list_on_stack);
1829 current->bio_list = &bio_list_on_stack; 1591 current->bio_list = &bio_list_on_stack;
1830 do { 1592 do {
1831 struct request_queue *q = bdev_get_queue(bio->bi_bdev); 1593 __generic_make_request(bio);
1832
1833 q->make_request_fn(q, bio);
1834
1835 bio = bio_list_pop(current->bio_list); 1594 bio = bio_list_pop(current->bio_list);
1836 } while (bio); 1595 } while (bio);
1837 current->bio_list = NULL; /* deactivate */ 1596 current->bio_list = NULL; /* deactivate */
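A small caller sketch (an assumption, not part of this patch): building a bio the way the kerneldoc above requires, with bi_bdev, bi_sector and bi_end_io set before submission, then handing it off via submit_bio(), which fills in bi_rw and ends up in generic_make_request(). The my_* names are hypothetical.

static void my_end_io(struct bio *bio, int err)
{
        if (err)
                pr_err("my_read_page: I/O error %d\n", err);
        bio_put(bio);
}

static void my_read_page(struct block_device *bdev, struct page *page,
                         sector_t sector)
{
        struct bio *bio = bio_alloc(GFP_NOIO, 1);

        if (!bio)
                return;
        bio->bi_bdev = bdev;
        bio->bi_sector = sector;
        bio->bi_end_io = my_end_io;
        bio_add_page(bio, page, PAGE_SIZE, 0);
        submit_bio(READ, bio);
}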
@@ -1850,20 +1609,15 @@ EXPORT_SYMBOL(generic_make_request);
1850 */ 1609 */
1851void submit_bio(int rw, struct bio *bio) 1610void submit_bio(int rw, struct bio *bio)
1852{ 1611{
1612 int count = bio_sectors(bio);
1613
1853 bio->bi_rw |= rw; 1614 bio->bi_rw |= rw;
1854 1615
1855 /* 1616 /*
1856 * If it's a regular read/write or a barrier with data attached, 1617 * If it's a regular read/write or a barrier with data attached,
1857 * go through the normal accounting stuff before submission. 1618 * go through the normal accounting stuff before submission.
1858 */ 1619 */
1859 if (bio_has_data(bio)) { 1620 if (bio_has_data(bio) && !(rw & REQ_DISCARD)) {
1860 unsigned int count;
1861
1862 if (unlikely(rw & REQ_WRITE_SAME))
1863 count = bdev_logical_block_size(bio->bi_bdev) >> 9;
1864 else
1865 count = bio_sectors(bio);
1866
1867 if (rw & WRITE) { 1621 if (rw & WRITE) {
1868 count_vm_events(PGPGOUT, count); 1622 count_vm_events(PGPGOUT, count);
1869 } else { 1623 } else {
@@ -1909,10 +1663,11 @@ EXPORT_SYMBOL(submit_bio);
1909 */ 1663 */
1910int blk_rq_check_limits(struct request_queue *q, struct request *rq) 1664int blk_rq_check_limits(struct request_queue *q, struct request *rq)
1911{ 1665{
1912 if (!rq_mergeable(rq)) 1666 if (rq->cmd_flags & REQ_DISCARD)
1913 return 0; 1667 return 0;
1914 1668
1915 if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, rq->cmd_flags)) { 1669 if (blk_rq_sectors(rq) > queue_max_sectors(q) ||
1670 blk_rq_bytes(rq) > queue_max_hw_sectors(q) << 9) {
1916 printk(KERN_ERR "%s: over max size limit.\n", __func__); 1671 printk(KERN_ERR "%s: over max size limit.\n", __func__);
1917 return -EIO; 1672 return -EIO;
1918 } 1673 }
@@ -1951,10 +1706,6 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
1951 return -EIO; 1706 return -EIO;
1952 1707
1953 spin_lock_irqsave(q->queue_lock, flags); 1708 spin_lock_irqsave(q->queue_lock, flags);
1954 if (unlikely(blk_queue_dying(q))) {
1955 spin_unlock_irqrestore(q->queue_lock, flags);
1956 return -ENODEV;
1957 }
1958 1709
1959 /* 1710 /*
1960 * Submitting request must be dequeued before calling this function 1711 * Submitting request must be dequeued before calling this function
@@ -2296,11 +2047,9 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
2296 error_type = "I/O"; 2047 error_type = "I/O";
2297 break; 2048 break;
2298 } 2049 }
2299 printk_ratelimited(KERN_ERR "end_request: %s error, dev %s, sector %llu\n", 2050 printk(KERN_ERR "end_request: %s error, dev %s, sector %llu\n",
2300 error_type, req->rq_disk ? 2051 error_type, req->rq_disk ? req->rq_disk->disk_name : "?",
2301 req->rq_disk->disk_name : "?", 2052 (unsigned long long)blk_rq_pos(req));
2302 (unsigned long long)blk_rq_pos(req));
2303
2304 } 2053 }
2305 2054
2306 blk_account_io_completion(req, nr_bytes); 2055 blk_account_io_completion(req, nr_bytes);
@@ -2384,7 +2133,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
2384 req->buffer = bio_data(req->bio); 2133 req->buffer = bio_data(req->bio);
2385 2134
2386 /* update sector only for requests with clear definition of sector */ 2135 /* update sector only for requests with clear definition of sector */
2387 if (req->cmd_type == REQ_TYPE_FS) 2136 if (req->cmd_type == REQ_TYPE_FS || (req->cmd_flags & REQ_DISCARD))
2388 req->__sector += total_bytes >> 9; 2137 req->__sector += total_bytes >> 9;
2389 2138
2390 /* mixed attributes always follow the first bio */ 2139 /* mixed attributes always follow the first bio */
@@ -2825,10 +2574,16 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
2825 blk_rq_init(NULL, rq); 2574 blk_rq_init(NULL, rq);
2826 2575
2827 __rq_for_each_bio(bio_src, rq_src) { 2576 __rq_for_each_bio(bio_src, rq_src) {
2828 bio = bio_clone_bioset(bio_src, gfp_mask, bs); 2577 bio = bio_alloc_bioset(gfp_mask, bio_src->bi_max_vecs, bs);
2829 if (!bio) 2578 if (!bio)
2830 goto free_and_out; 2579 goto free_and_out;
2831 2580
2581 __bio_clone(bio, bio_src);
2582
2583 if (bio_integrity(bio_src) &&
2584 bio_integrity_clone(bio, bio_src, gfp_mask, bs))
2585 goto free_and_out;
2586
2832 if (bio_ctr && bio_ctr(bio, bio_src, data)) 2587 if (bio_ctr && bio_ctr(bio, bio_src, data))
2833 goto free_and_out; 2588 goto free_and_out;
2834 2589
@@ -2845,7 +2600,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
2845 2600
2846free_and_out: 2601free_and_out:
2847 if (bio) 2602 if (bio)
2848 bio_put(bio); 2603 bio_free(bio, bs);
2849 blk_rq_unprep_clone(rq); 2604 blk_rq_unprep_clone(rq);
2850 2605
2851 return -ENOMEM; 2606 return -ENOMEM;
@@ -2867,20 +2622,6 @@ EXPORT_SYMBOL(kblockd_schedule_delayed_work);
2867 2622
2868#define PLUG_MAGIC 0x91827364 2623#define PLUG_MAGIC 0x91827364
2869 2624
2870/**
2871 * blk_start_plug - initialize blk_plug and track it inside the task_struct
2872 * @plug: The &struct blk_plug that needs to be initialized
2873 *
2874 * Description:
2875 * Tracking blk_plug inside the task_struct will help with auto-flushing the
2876 * pending I/O should the task end up blocking between blk_start_plug() and
2877 * blk_finish_plug(). This is important from a performance perspective, but
2878 * also ensures that we don't deadlock. For instance, if the task is blocking
2879 * for a memory allocation, memory reclaim could end up wanting to free a
2880 * page belonging to that request that is currently residing in our private
2881 * plug. By flushing the pending I/O when the process goes to sleep, we avoid
2882 * this kind of deadlock.
2883 */
2884void blk_start_plug(struct blk_plug *plug) 2625void blk_start_plug(struct blk_plug *plug)
2885{ 2626{
2886 struct task_struct *tsk = current; 2627 struct task_struct *tsk = current;
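The kerneldoc being removed above describes the plugging contract; as a hedged sketch of typical use (my_submit_batch() is hypothetical), a submitter brackets a burst of bios so they are flushed to the queue in one go.

static void my_submit_batch(struct bio **bios, int nr)
{
        struct blk_plug plug;
        int i;

        blk_start_plug(&plug);
        for (i = 0; i < nr; i++)
                submit_bio(WRITE, bios[i]);     /* held in the per-task plug */
        blk_finish_plug(&plug);                 /* flush everything to the queue(s) */
}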
@@ -2909,8 +2650,7 @@ static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
2909 struct request *rqa = container_of(a, struct request, queuelist); 2650 struct request *rqa = container_of(a, struct request, queuelist);
2910 struct request *rqb = container_of(b, struct request, queuelist); 2651 struct request *rqb = container_of(b, struct request, queuelist);
2911 2652
2912 return !(rqa->q < rqb->q || 2653 return !(rqa->q <= rqb->q);
2913 (rqa->q == rqb->q && blk_rq_pos(rqa) < blk_rq_pos(rqb)));
2914} 2654}
2915 2655
2916/* 2656/*
@@ -2925,55 +2665,39 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth,
2925{ 2665{
2926 trace_block_unplug(q, depth, !from_schedule); 2666 trace_block_unplug(q, depth, !from_schedule);
2927 2667
2928 if (from_schedule) 2668 /*
2669 * If we are punting this to kblockd, then we can safely drop
2670 * the queue_lock before waking kblockd (which needs to take
2671 * this lock).
2672 */
2673 if (from_schedule) {
2674 spin_unlock(q->queue_lock);
2929 blk_run_queue_async(q); 2675 blk_run_queue_async(q);
2930 else 2676 } else {
2931 __blk_run_queue(q); 2677 __blk_run_queue(q);
2932 spin_unlock(q->queue_lock); 2678 spin_unlock(q->queue_lock);
2679 }
2680
2933} 2681}
2934 2682
2935static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule) 2683static void flush_plug_callbacks(struct blk_plug *plug)
2936{ 2684{
2937 LIST_HEAD(callbacks); 2685 LIST_HEAD(callbacks);
2938 2686
2939 while (!list_empty(&plug->cb_list)) { 2687 if (list_empty(&plug->cb_list))
2940 list_splice_init(&plug->cb_list, &callbacks); 2688 return;
2689
2690 list_splice_init(&plug->cb_list, &callbacks);
2941 2691
2942 while (!list_empty(&callbacks)) { 2692 while (!list_empty(&callbacks)) {
2943 struct blk_plug_cb *cb = list_first_entry(&callbacks, 2693 struct blk_plug_cb *cb = list_first_entry(&callbacks,
2944 struct blk_plug_cb, 2694 struct blk_plug_cb,
2945 list); 2695 list);
2946 list_del(&cb->list); 2696 list_del(&cb->list);
2947 cb->callback(cb, from_schedule); 2697 cb->callback(cb);
2948 }
2949 } 2698 }
2950} 2699}
2951 2700
2952struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data,
2953 int size)
2954{
2955 struct blk_plug *plug = current->plug;
2956 struct blk_plug_cb *cb;
2957
2958 if (!plug)
2959 return NULL;
2960
2961 list_for_each_entry(cb, &plug->cb_list, list)
2962 if (cb->callback == unplug && cb->data == data)
2963 return cb;
2964
2965 /* Not currently on the callback list */
2966 BUG_ON(size < sizeof(*cb));
2967 cb = kzalloc(size, GFP_ATOMIC);
2968 if (cb) {
2969 cb->data = data;
2970 cb->callback = unplug;
2971 list_add(&cb->list, &plug->cb_list);
2972 }
2973 return cb;
2974}
2975EXPORT_SYMBOL(blk_check_plugged);
2976
2977void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) 2701void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
2978{ 2702{
2979 struct request_queue *q; 2703 struct request_queue *q;
@@ -2984,7 +2708,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
2984 2708
2985 BUG_ON(plug->magic != PLUG_MAGIC); 2709 BUG_ON(plug->magic != PLUG_MAGIC);
2986 2710
2987 flush_plug_callbacks(plug, from_schedule); 2711 flush_plug_callbacks(plug);
2988 if (list_empty(&plug->list)) 2712 if (list_empty(&plug->list))
2989 return; 2713 return;
2990 2714
@@ -3017,15 +2741,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3017 depth = 0; 2741 depth = 0;
3018 spin_lock(q->queue_lock); 2742 spin_lock(q->queue_lock);
3019 } 2743 }
3020
3021 /*
3022 * Short-circuit if @q is dead
3023 */
3024 if (unlikely(blk_queue_dying(q))) {
3025 __blk_end_request_all(rq, -ENODEV);
3026 continue;
3027 }
3028
3029 /* 2744 /*
3030 * rq is already accounted, so use raw insert 2745 * rq is already accounted, so use raw insert
3031 */ 2746 */
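Among the blk-core.c changes above, the plug flush path loses its two-key sort: the removed upstream plug_rq_cmp() orders plugged requests by queue and then by starting sector, while the restored comparator only groups them by queue pointer. Below is a minimal, purely illustrative userspace sketch of that two-key ordering; the toy_* names are hypothetical, the kernel compares the struct request_queue pointers themselves, and it uses list_sort() rather than qsort().

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-ins for struct request_queue / struct request. */
struct toy_queue { int id; };
struct toy_request {
        struct toy_queue *q;            /* owning queue */
        unsigned long long pos;         /* starting sector, as blk_rq_pos() */
};

/*
 * Two-key ordering in the spirit of the removed upstream plug_rq_cmp():
 * group requests by queue first, then by starting sector within a queue.
 * (Comparing ids keeps the sketch well-defined; the kernel compares the
 * queue pointers directly.)
 */
static int toy_plug_rq_cmp(const void *a, const void *b)
{
        const struct toy_request *ra = a, *rb = b;

        if (ra->q->id != rb->q->id)
                return ra->q->id < rb->q->id ? -1 : 1;
        if (ra->pos != rb->pos)
                return ra->pos < rb->pos ? -1 : 1;
        return 0;
}

int main(void)
{
        struct toy_queue q0 = { 0 }, q1 = { 1 };
        struct toy_request reqs[] = {
                { &q1, 2048 }, { &q0, 4096 }, { &q1, 8 }, { &q0, 512 },
        };
        size_t i, n = sizeof(reqs) / sizeof(reqs[0]);

        qsort(reqs, n, sizeof(reqs[0]), toy_plug_rq_cmp);
        for (i = 0; i < n; i++)
                printf("queue %d, sector %llu\n", reqs[i].q->id, reqs[i].pos);
        return 0;
}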
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 74638ec234c..a1ebceb332f 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -43,42 +43,29 @@ static void blk_end_sync_rq(struct request *rq, int error)
43 * Description: 43 * Description:
44 * Insert a fully prepared request at the back of the I/O scheduler queue 44 * Insert a fully prepared request at the back of the I/O scheduler queue
45 * for execution. Don't wait for completion. 45 * for execution. Don't wait for completion.
46 *
47 * Note:
48 * This function will invoke @done directly if the queue is dead.
49 */ 46 */
50void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, 47void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
51 struct request *rq, int at_head, 48 struct request *rq, int at_head,
52 rq_end_io_fn *done) 49 rq_end_io_fn *done)
53{ 50{
54 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; 51 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
55 bool is_pm_resume;
56
57 WARN_ON(irqs_disabled());
58
59 rq->rq_disk = bd_disk;
60 rq->end_io = done;
61 /*
62 * need to check this before __blk_run_queue(), because rq can
63 * be freed before that returns.
64 */
65 is_pm_resume = rq->cmd_type == REQ_TYPE_PM_RESUME;
66
67 spin_lock_irq(q->queue_lock);
68 52
69 if (unlikely(blk_queue_dying(q))) { 53 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
70 rq->errors = -ENXIO; 54 rq->errors = -ENXIO;
71 if (rq->end_io) 55 if (rq->end_io)
72 rq->end_io(rq, rq->errors); 56 rq->end_io(rq, rq->errors);
73 spin_unlock_irq(q->queue_lock);
74 return; 57 return;
75 } 58 }
76 59
60 rq->rq_disk = bd_disk;
61 rq->end_io = done;
62 WARN_ON(irqs_disabled());
63 spin_lock_irq(q->queue_lock);
77 __elv_add_request(q, rq, where); 64 __elv_add_request(q, rq, where);
78 __blk_run_queue(q); 65 __blk_run_queue(q);
79 /* the queue is stopped so it won't be run */ 66 /* the queue is stopped so it won't be run */
80 if (is_pm_resume) 67 if (rq->cmd_type == REQ_TYPE_PM_RESUME)
81 __blk_run_queue_uncond(q); 68 q->request_fn(q);
82 spin_unlock_irq(q->queue_lock); 69 spin_unlock_irq(q->queue_lock);
83} 70}
84EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); 71EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
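The blk_execute_rq_nowait() hunk swaps the upstream blk_queue_dying() check, which completes the request with -ENXIO while holding the queue lock, for the older QUEUE_FLAG_DEAD test done before taking the lock. A rough sketch of that fail-fast shape, with hypothetical toy_* types and none of the kernel locking or elevator code:

#include <stdio.h>
#include <stdbool.h>

struct toy_request {
        int errors;
        void (*end_io)(struct toy_request *rq, int error);
};

struct toy_queue {
        bool dead;                      /* queue is being torn down */
};

/*
 * Shape of blk_execute_rq_nowait() as restored above: if the queue is
 * already dead, complete the request immediately with an error instead
 * of inserting it into the elevator.
 */
static void toy_execute_nowait(struct toy_queue *q, struct toy_request *rq,
                               void (*done)(struct toy_request *, int))
{
        rq->end_io = done;
        if (q->dead) {
                rq->errors = -6;        /* stand-in for -ENXIO */
                if (rq->end_io)
                        rq->end_io(rq, rq->errors);
                return;
        }
        /* ...__elv_add_request() + __blk_run_queue() would go here... */
        printf("request inserted\n");
}

static void report(struct toy_request *rq, int error)
{
        printf("completed early, error %d\n", error);
}

int main(void)
{
        struct toy_queue dead_q = { .dead = true }, live_q = { .dead = false };
        struct toy_request rq = { 0 };

        toy_execute_nowait(&dead_q, &rq, report);       /* completes with error */
        toy_execute_nowait(&live_q, &rq, report);       /* "inserted" */
        return 0;
}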
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index da2a818c3a9..129b9e209a3 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -24,7 +24,6 @@
24#include <linux/mempool.h> 24#include <linux/mempool.h>
25#include <linux/bio.h> 25#include <linux/bio.h>
26#include <linux/scatterlist.h> 26#include <linux/scatterlist.h>
27#include <linux/export.h>
28#include <linux/slab.h> 27#include <linux/slab.h>
29 28
30#include "blk.h" 29#include "blk.h"
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index fab4cdd3f7b..6f9bbd97865 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -16,185 +16,52 @@
16 */ 16 */
17static struct kmem_cache *iocontext_cachep; 17static struct kmem_cache *iocontext_cachep;
18 18
19/** 19static void cfq_dtor(struct io_context *ioc)
20 * get_io_context - increment reference count to io_context
21 * @ioc: io_context to get
22 *
23 * Increment reference count to @ioc.
24 */
25void get_io_context(struct io_context *ioc)
26{
27 BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
28 atomic_long_inc(&ioc->refcount);
29}
30EXPORT_SYMBOL(get_io_context);
31
32static void icq_free_icq_rcu(struct rcu_head *head)
33{
34 struct io_cq *icq = container_of(head, struct io_cq, __rcu_head);
35
36 kmem_cache_free(icq->__rcu_icq_cache, icq);
37}
38
39/* Exit an icq. Called with both ioc and q locked. */
40static void ioc_exit_icq(struct io_cq *icq)
41{
42 struct elevator_type *et = icq->q->elevator->type;
43
44 if (icq->flags & ICQ_EXITED)
45 return;
46
47 if (et->ops.elevator_exit_icq_fn)
48 et->ops.elevator_exit_icq_fn(icq);
49
50 icq->flags |= ICQ_EXITED;
51}
52
53/* Release an icq. Called with both ioc and q locked. */
54static void ioc_destroy_icq(struct io_cq *icq)
55{
56 struct io_context *ioc = icq->ioc;
57 struct request_queue *q = icq->q;
58 struct elevator_type *et = q->elevator->type;
59
60 lockdep_assert_held(&ioc->lock);
61 lockdep_assert_held(q->queue_lock);
62
63 radix_tree_delete(&ioc->icq_tree, icq->q->id);
64 hlist_del_init(&icq->ioc_node);
65 list_del_init(&icq->q_node);
66
67 /*
68 * Both setting lookup hint to and clearing it from @icq are done
69 * under queue_lock. If it's not pointing to @icq now, it never
70 * will. Hint assignment itself can race safely.
71 */
72 if (rcu_dereference_raw(ioc->icq_hint) == icq)
73 rcu_assign_pointer(ioc->icq_hint, NULL);
74
75 ioc_exit_icq(icq);
76
77 /*
78 * @icq->q might have gone away by the time RCU callback runs
79 * making it impossible to determine icq_cache. Record it in @icq.
80 */
81 icq->__rcu_icq_cache = et->icq_cache;
82 call_rcu(&icq->__rcu_head, icq_free_icq_rcu);
83}
84
85/*
86 * Slow path for ioc release in put_io_context(). Performs double-lock
87 * dancing to unlink all icq's and then frees ioc.
88 */
89static void ioc_release_fn(struct work_struct *work)
90{ 20{
91 struct io_context *ioc = container_of(work, struct io_context, 21 if (!hlist_empty(&ioc->cic_list)) {
92 release_work); 22 struct cfq_io_context *cic;
93 unsigned long flags;
94 23
95 /* 24 cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context,
96 * Exiting icq may call into put_io_context() through elevator 25 cic_list);
97 * which will trigger lockdep warning. The ioc's are guaranteed to 26 cic->dtor(ioc);
98 * be different, use a different locking subclass here. Use
99 * irqsave variant as there's no spin_lock_irq_nested().
100 */
101 spin_lock_irqsave_nested(&ioc->lock, flags, 1);
102
103 while (!hlist_empty(&ioc->icq_list)) {
104 struct io_cq *icq = hlist_entry(ioc->icq_list.first,
105 struct io_cq, ioc_node);
106 struct request_queue *q = icq->q;
107
108 if (spin_trylock(q->queue_lock)) {
109 ioc_destroy_icq(icq);
110 spin_unlock(q->queue_lock);
111 } else {
112 spin_unlock_irqrestore(&ioc->lock, flags);
113 cpu_relax();
114 spin_lock_irqsave_nested(&ioc->lock, flags, 1);
115 }
116 } 27 }
117
118 spin_unlock_irqrestore(&ioc->lock, flags);
119
120 kmem_cache_free(iocontext_cachep, ioc);
121} 28}
122 29
123/** 30/*
124 * put_io_context - put a reference of io_context 31 * IO Context helper functions. put_io_context() returns 1 if there are no
125 * @ioc: io_context to put 32 * more users of this io context, 0 otherwise.
126 *
127 * Decrement reference count of @ioc and release it if the count reaches
128 * zero.
129 */ 33 */
130void put_io_context(struct io_context *ioc) 34int put_io_context(struct io_context *ioc)
131{ 35{
132 unsigned long flags;
133 bool free_ioc = false;
134
135 if (ioc == NULL) 36 if (ioc == NULL)
136 return; 37 return 1;
137 38
138 BUG_ON(atomic_long_read(&ioc->refcount) <= 0); 39 BUG_ON(atomic_long_read(&ioc->refcount) == 0);
139 40
140 /*
141 * Releasing ioc requires reverse order double locking and we may
142 * already be holding a queue_lock. Do it asynchronously from wq.
143 */
144 if (atomic_long_dec_and_test(&ioc->refcount)) { 41 if (atomic_long_dec_and_test(&ioc->refcount)) {
145 spin_lock_irqsave(&ioc->lock, flags); 42 rcu_read_lock();
146 if (!hlist_empty(&ioc->icq_list)) 43 cfq_dtor(ioc);
147 schedule_work(&ioc->release_work); 44 rcu_read_unlock();
148 else
149 free_ioc = true;
150 spin_unlock_irqrestore(&ioc->lock, flags);
151 }
152 45
153 if (free_ioc)
154 kmem_cache_free(iocontext_cachep, ioc); 46 kmem_cache_free(iocontext_cachep, ioc);
47 return 1;
48 }
49 return 0;
155} 50}
156EXPORT_SYMBOL(put_io_context); 51EXPORT_SYMBOL(put_io_context);
157 52
158/** 53static void cfq_exit(struct io_context *ioc)
159 * put_io_context_active - put active reference on ioc
160 * @ioc: ioc of interest
161 *
162 * Undo get_io_context_active(). If active reference reaches zero after
163 * put, @ioc can never issue further IOs and ioscheds are notified.
164 */
165void put_io_context_active(struct io_context *ioc)
166{ 54{
167 struct hlist_node *n; 55 rcu_read_lock();
168 unsigned long flags;
169 struct io_cq *icq;
170 56
171 if (!atomic_dec_and_test(&ioc->active_ref)) { 57 if (!hlist_empty(&ioc->cic_list)) {
172 put_io_context(ioc); 58 struct cfq_io_context *cic;
173 return;
174 }
175 59
176 /* 60 cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context,
177 * Need ioc lock to walk icq_list and q lock to exit icq. Perform 61 cic_list);
178 * reverse double locking. Read comment in ioc_release_fn() for 62 cic->exit(ioc);
179 * explanation on the nested locking annotation.
180 */
181retry:
182 spin_lock_irqsave_nested(&ioc->lock, flags, 1);
183 hlist_for_each_entry(icq, n, &ioc->icq_list, ioc_node) {
184 if (icq->flags & ICQ_EXITED)
185 continue;
186 if (spin_trylock(icq->q->queue_lock)) {
187 ioc_exit_icq(icq);
188 spin_unlock(icq->q->queue_lock);
189 } else {
190 spin_unlock_irqrestore(&ioc->lock, flags);
191 cpu_relax();
192 goto retry;
193 }
194 } 63 }
195 spin_unlock_irqrestore(&ioc->lock, flags); 64 rcu_read_unlock();
196
197 put_io_context(ioc);
198} 65}
199 66
200/* Called by the exiting task */ 67/* Called by the exiting task */
@@ -207,197 +74,86 @@ void exit_io_context(struct task_struct *task)
207 task->io_context = NULL; 74 task->io_context = NULL;
208 task_unlock(task); 75 task_unlock(task);
209 76
210 atomic_dec(&ioc->nr_tasks); 77 if (atomic_dec_and_test(&ioc->nr_tasks))
211 put_io_context_active(ioc); 78 cfq_exit(ioc);
212}
213 79
214/** 80 put_io_context(ioc);
215 * ioc_clear_queue - break any ioc association with the specified queue
216 * @q: request_queue being cleared
217 *
218 * Walk @q->icq_list and exit all io_cq's. Must be called with @q locked.
219 */
220void ioc_clear_queue(struct request_queue *q)
221{
222 lockdep_assert_held(q->queue_lock);
223
224 while (!list_empty(&q->icq_list)) {
225 struct io_cq *icq = list_entry(q->icq_list.next,
226 struct io_cq, q_node);
227 struct io_context *ioc = icq->ioc;
228
229 spin_lock(&ioc->lock);
230 ioc_destroy_icq(icq);
231 spin_unlock(&ioc->lock);
232 }
233} 81}
234 82
235int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node) 83struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
236{ 84{
237 struct io_context *ioc; 85 struct io_context *ioc;
238 int ret;
239
240 ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO,
241 node);
242 if (unlikely(!ioc))
243 return -ENOMEM;
244
245 /* initialize */
246 atomic_long_set(&ioc->refcount, 1);
247 atomic_set(&ioc->nr_tasks, 1);
248 atomic_set(&ioc->active_ref, 1);
249 spin_lock_init(&ioc->lock);
250 INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH);
251 INIT_HLIST_HEAD(&ioc->icq_list);
252 INIT_WORK(&ioc->release_work, ioc_release_fn);
253 86
254 /* 87 ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node);
255 * Try to install. ioc shouldn't be installed if someone else 88 if (ioc) {
256 * already did or @task, which isn't %current, is exiting. Note 89 atomic_long_set(&ioc->refcount, 1);
257 * that we need to allow ioc creation on exiting %current as exit 90 atomic_set(&ioc->nr_tasks, 1);
258 * path may issue IOs from e.g. exit_files(). The exit path is 91 spin_lock_init(&ioc->lock);
259 * responsible for not issuing IO after exit_io_context(). 92 ioc->ioprio_changed = 0;
260 */ 93 ioc->ioprio = 0;
261 task_lock(task); 94 ioc->last_waited = 0; /* doesn't matter... */
262 if (!task->io_context && 95 ioc->nr_batch_requests = 0; /* because this is 0 */
263 (task == current || !(task->flags & PF_EXITING))) 96 INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH);
264 task->io_context = ioc; 97 INIT_HLIST_HEAD(&ioc->cic_list);
265 else 98 ioc->ioc_data = NULL;
266 kmem_cache_free(iocontext_cachep, ioc); 99#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
267 100 ioc->cgroup_changed = 0;
268 ret = task->io_context ? 0 : -EBUSY; 101#endif
269 102 }
270 task_unlock(task);
271 103
272 return ret; 104 return ioc;
273} 105}
274 106
275/** 107/*
276 * get_task_io_context - get io_context of a task 108 * If the current task has no IO context then create one and initialise it.
277 * @task: task of interest 109 * Otherwise, return its existing IO context.
278 * @gfp_flags: allocation flags, used if allocation is necessary
279 * @node: allocation node, used if allocation is necessary
280 *
281 * Return io_context of @task. If it doesn't exist, it is created with
282 * @gfp_flags and @node. The returned io_context has its reference count
283 * incremented.
284 * 110 *
285 * This function always goes through task_lock() and it's better to use 111 * This returned IO context doesn't have a specifically elevated refcount,
286 * %current->io_context + get_io_context() for %current. 112 * but since the current task itself holds a reference, the context can be
113 * used in general code, so long as it stays within `current` context.
287 */ 114 */
288struct io_context *get_task_io_context(struct task_struct *task, 115struct io_context *current_io_context(gfp_t gfp_flags, int node)
289 gfp_t gfp_flags, int node)
290{ 116{
291 struct io_context *ioc; 117 struct task_struct *tsk = current;
292 118 struct io_context *ret;
293 might_sleep_if(gfp_flags & __GFP_WAIT); 119
294 120 ret = tsk->io_context;
295 do { 121 if (likely(ret))
296 task_lock(task); 122 return ret;
297 ioc = task->io_context; 123
298 if (likely(ioc)) { 124 ret = alloc_io_context(gfp_flags, node);
299 get_io_context(ioc); 125 if (ret) {
300 task_unlock(task); 126 /* make sure set_task_ioprio() sees the settings above */
301 return ioc; 127 smp_wmb();
302 } 128 tsk->io_context = ret;
303 task_unlock(task); 129 }
304 } while (!create_task_io_context(task, gfp_flags, node));
305 130
306 return NULL; 131 return ret;
307} 132}
308EXPORT_SYMBOL(get_task_io_context);
309 133
310/** 134/*
311 * ioc_lookup_icq - lookup io_cq from ioc 135 * If the current task has no IO context then create one and initialise it.
312 * @ioc: the associated io_context 136 * If it does have a context, take a ref on it.
313 * @q: the associated request_queue
314 * 137 *
315 * Look up io_cq associated with @ioc - @q pair from @ioc. Must be called 138 * This is always called in the context of the task which submitted the I/O.
316 * with @q->queue_lock held.
317 */ 139 */
318struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q) 140struct io_context *get_io_context(gfp_t gfp_flags, int node)
319{ 141{
320 struct io_cq *icq; 142 struct io_context *ioc = NULL;
321
322 lockdep_assert_held(q->queue_lock);
323 143
324 /* 144 /*
325 * icq's are indexed from @ioc using radix tree and hint pointer, 145 * Check for unlikely race with exiting task. ioc ref count is
326 * both of which are protected with RCU. All removals are done 146 * zero when ioc is being detached.
327 * holding both q and ioc locks, and we're holding q lock - if we
328 * find a icq which points to us, it's guaranteed to be valid.
329 */ 147 */
330 rcu_read_lock(); 148 do {
331 icq = rcu_dereference(ioc->icq_hint); 149 ioc = current_io_context(gfp_flags, node);
332 if (icq && icq->q == q) 150 if (unlikely(!ioc))
333 goto out; 151 break;
334 152 } while (!atomic_long_inc_not_zero(&ioc->refcount));
335 icq = radix_tree_lookup(&ioc->icq_tree, q->id);
336 if (icq && icq->q == q)
337 rcu_assign_pointer(ioc->icq_hint, icq); /* allowed to race */
338 else
339 icq = NULL;
340out:
341 rcu_read_unlock();
342 return icq;
343}
344EXPORT_SYMBOL(ioc_lookup_icq);
345
346/**
347 * ioc_create_icq - create and link io_cq
348 * @ioc: io_context of interest
349 * @q: request_queue of interest
350 * @gfp_mask: allocation mask
351 *
352 * Make sure io_cq linking @ioc and @q exists. If icq doesn't exist, they
353 * will be created using @gfp_mask.
354 *
355 * The caller is responsible for ensuring @ioc won't go away and @q is
356 * alive and will stay alive until this function returns.
357 */
358struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
359 gfp_t gfp_mask)
360{
361 struct elevator_type *et = q->elevator->type;
362 struct io_cq *icq;
363
364 /* allocate stuff */
365 icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO,
366 q->node);
367 if (!icq)
368 return NULL;
369
370 if (radix_tree_preload(gfp_mask) < 0) {
371 kmem_cache_free(et->icq_cache, icq);
372 return NULL;
373 }
374
375 icq->ioc = ioc;
376 icq->q = q;
377 INIT_LIST_HEAD(&icq->q_node);
378 INIT_HLIST_NODE(&icq->ioc_node);
379
380 /* lock both q and ioc and try to link @icq */
381 spin_lock_irq(q->queue_lock);
382 spin_lock(&ioc->lock);
383
384 if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) {
385 hlist_add_head(&icq->ioc_node, &ioc->icq_list);
386 list_add(&icq->q_node, &q->icq_list);
387 if (et->ops.elevator_init_icq_fn)
388 et->ops.elevator_init_icq_fn(icq);
389 } else {
390 kmem_cache_free(et->icq_cache, icq);
391 icq = ioc_lookup_icq(ioc, q);
392 if (!icq)
393 printk(KERN_ERR "cfq: icq link failed!\n");
394 }
395 153
396 spin_unlock(&ioc->lock); 154 return ioc;
397 spin_unlock_irq(q->queue_lock);
398 radix_tree_preload_end();
399 return icq;
400} 155}
156EXPORT_SYMBOL(get_io_context);
401 157
402static int __init blk_ioc_init(void) 158static int __init blk_ioc_init(void)
403{ 159{
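The blk-ioc.c hunks replace the upstream icq-based io_context teardown with the older cfq_io_context scheme, and put_io_context() goes back to returning whether the final reference was dropped. As a purely illustrative userspace sketch of that refcount contract (C11 atomics, hypothetical toy_* names, none of the kernel locking or per-queue state):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy io_context: only the reference count matters for this sketch. */
struct toy_ioc {
        atomic_long refcount;
};

static struct toy_ioc *toy_ioc_alloc(void)
{
        struct toy_ioc *ioc = malloc(sizeof(*ioc));

        if (ioc)
                atomic_init(&ioc->refcount, 1);
        return ioc;
}

static void toy_get_ioc(struct toy_ioc *ioc)
{
        atomic_fetch_add(&ioc->refcount, 1);
}

/*
 * Returns 1 when the final reference was dropped and the object freed,
 * 0 otherwise, which is the contract the restored put_io_context()
 * documents in its comment above.
 */
static int toy_put_ioc(struct toy_ioc *ioc)
{
        if (!ioc)
                return 1;
        if (atomic_fetch_sub(&ioc->refcount, 1) == 1) {
                /* last reference: tear down per-queue state, then free */
                free(ioc);
                return 1;
        }
        return 0;
}

int main(void)
{
        struct toy_ioc *ioc = toy_ioc_alloc();

        toy_get_ioc(ioc);                               /* second user */
        printf("first put frees: %d\n", toy_put_ioc(ioc));      /* 0 */
        printf("second put frees: %d\n", toy_put_ioc(ioc));     /* 1 */
        return 0;
}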
diff --git a/block/blk-lib.c b/block/blk-lib.c
index b3a1f2b70b3..2b461b496a7 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -43,12 +43,10 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
43 DECLARE_COMPLETION_ONSTACK(wait); 43 DECLARE_COMPLETION_ONSTACK(wait);
44 struct request_queue *q = bdev_get_queue(bdev); 44 struct request_queue *q = bdev_get_queue(bdev);
45 int type = REQ_WRITE | REQ_DISCARD; 45 int type = REQ_WRITE | REQ_DISCARD;
46 sector_t max_discard_sectors; 46 unsigned int max_discard_sectors;
47 sector_t granularity, alignment;
48 struct bio_batch bb; 47 struct bio_batch bb;
49 struct bio *bio; 48 struct bio *bio;
50 int ret = 0; 49 int ret = 0;
51 struct blk_plug plug;
52 50
53 if (!q) 51 if (!q)
54 return -ENXIO; 52 return -ENXIO;
@@ -56,21 +54,18 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
56 if (!blk_queue_discard(q)) 54 if (!blk_queue_discard(q))
57 return -EOPNOTSUPP; 55 return -EOPNOTSUPP;
58 56
59 /* Zero-sector (unknown) and one-sector granularities are the same. */
60 granularity = max(q->limits.discard_granularity >> 9, 1U);
61 alignment = bdev_discard_alignment(bdev) >> 9;
62 alignment = sector_div(alignment, granularity);
63
64 /* 57 /*
65 * Ensure that max_discard_sectors is of the proper 58 * Ensure that max_discard_sectors is of the proper
66 * granularity, so that requests stay aligned after a split. 59 * granularity
67 */ 60 */
68 max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9); 61 max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9);
69 sector_div(max_discard_sectors, granularity);
70 max_discard_sectors *= granularity;
71 if (unlikely(!max_discard_sectors)) { 62 if (unlikely(!max_discard_sectors)) {
72 /* Avoid infinite loop below. Being cautious never hurts. */ 63 /* Avoid infinite loop below. Being cautious never hurts. */
73 return -EOPNOTSUPP; 64 return -EOPNOTSUPP;
65 } else if (q->limits.discard_granularity) {
66 unsigned int disc_sects = q->limits.discard_granularity >> 9;
67
68 max_discard_sectors &= ~(disc_sects - 1);
74 } 69 }
75 70
76 if (flags & BLKDEV_DISCARD_SECURE) { 71 if (flags & BLKDEV_DISCARD_SECURE) {
@@ -83,119 +78,29 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
83 bb.flags = 1 << BIO_UPTODATE; 78 bb.flags = 1 << BIO_UPTODATE;
84 bb.wait = &wait; 79 bb.wait = &wait;
85 80
86 blk_start_plug(&plug);
87 while (nr_sects) { 81 while (nr_sects) {
88 unsigned int req_sects;
89 sector_t end_sect, tmp;
90
91 bio = bio_alloc(gfp_mask, 1); 82 bio = bio_alloc(gfp_mask, 1);
92 if (!bio) { 83 if (!bio) {
93 ret = -ENOMEM; 84 ret = -ENOMEM;
94 break; 85 break;
95 } 86 }
96 87
97 req_sects = min_t(sector_t, nr_sects, max_discard_sectors);
98
99 /*
100 * If splitting a request, and the next starting sector would be
101 * misaligned, stop the discard at the previous aligned sector.
102 */
103 end_sect = sector + req_sects;
104 tmp = end_sect;
105 if (req_sects < nr_sects &&
106 sector_div(tmp, granularity) != alignment) {
107 end_sect = end_sect - alignment;
108 sector_div(end_sect, granularity);
109 end_sect = end_sect * granularity + alignment;
110 req_sects = end_sect - sector;
111 }
112
113 bio->bi_sector = sector; 88 bio->bi_sector = sector;
114 bio->bi_end_io = bio_batch_end_io; 89 bio->bi_end_io = bio_batch_end_io;
115 bio->bi_bdev = bdev; 90 bio->bi_bdev = bdev;
116 bio->bi_private = &bb; 91 bio->bi_private = &bb;
117 92
118 bio->bi_size = req_sects << 9; 93 if (nr_sects > max_discard_sectors) {
119 nr_sects -= req_sects; 94 bio->bi_size = max_discard_sectors << 9;
120 sector = end_sect; 95 nr_sects -= max_discard_sectors;
121 96 sector += max_discard_sectors;
122 atomic_inc(&bb.done);
123 submit_bio(type, bio);
124 }
125 blk_finish_plug(&plug);
126
127 /* Wait for bios in-flight */
128 if (!atomic_dec_and_test(&bb.done))
129 wait_for_completion(&wait);
130
131 if (!test_bit(BIO_UPTODATE, &bb.flags))
132 ret = -EIO;
133
134 return ret;
135}
136EXPORT_SYMBOL(blkdev_issue_discard);
137
138/**
139 * blkdev_issue_write_same - queue a write same operation
140 * @bdev: target blockdev
141 * @sector: start sector
142 * @nr_sects: number of sectors to write
143 * @gfp_mask: memory allocation flags (for bio_alloc)
144 * @page: page containing data to write
145 *
146 * Description:
147 * Issue a write same request for the sectors in question.
148 */
149int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
150 sector_t nr_sects, gfp_t gfp_mask,
151 struct page *page)
152{
153 DECLARE_COMPLETION_ONSTACK(wait);
154 struct request_queue *q = bdev_get_queue(bdev);
155 unsigned int max_write_same_sectors;
156 struct bio_batch bb;
157 struct bio *bio;
158 int ret = 0;
159
160 if (!q)
161 return -ENXIO;
162
163 max_write_same_sectors = q->limits.max_write_same_sectors;
164
165 if (max_write_same_sectors == 0)
166 return -EOPNOTSUPP;
167
168 atomic_set(&bb.done, 1);
169 bb.flags = 1 << BIO_UPTODATE;
170 bb.wait = &wait;
171
172 while (nr_sects) {
173 bio = bio_alloc(gfp_mask, 1);
174 if (!bio) {
175 ret = -ENOMEM;
176 break;
177 }
178
179 bio->bi_sector = sector;
180 bio->bi_end_io = bio_batch_end_io;
181 bio->bi_bdev = bdev;
182 bio->bi_private = &bb;
183 bio->bi_vcnt = 1;
184 bio->bi_io_vec->bv_page = page;
185 bio->bi_io_vec->bv_offset = 0;
186 bio->bi_io_vec->bv_len = bdev_logical_block_size(bdev);
187
188 if (nr_sects > max_write_same_sectors) {
189 bio->bi_size = max_write_same_sectors << 9;
190 nr_sects -= max_write_same_sectors;
191 sector += max_write_same_sectors;
192 } else { 97 } else {
193 bio->bi_size = nr_sects << 9; 98 bio->bi_size = nr_sects << 9;
194 nr_sects = 0; 99 nr_sects = 0;
195 } 100 }
196 101
197 atomic_inc(&bb.done); 102 atomic_inc(&bb.done);
198 submit_bio(REQ_WRITE | REQ_WRITE_SAME, bio); 103 submit_bio(type, bio);
199 } 104 }
200 105
201 /* Wait for bios in-flight */ 106 /* Wait for bios in-flight */
@@ -203,11 +108,11 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
203 wait_for_completion(&wait); 108 wait_for_completion(&wait);
204 109
205 if (!test_bit(BIO_UPTODATE, &bb.flags)) 110 if (!test_bit(BIO_UPTODATE, &bb.flags))
206 ret = -ENOTSUPP; 111 ret = -EIO;
207 112
208 return ret; 113 return ret;
209} 114}
210EXPORT_SYMBOL(blkdev_issue_write_same); 115EXPORT_SYMBOL(blkdev_issue_discard);
211 116
212/** 117/**
213 * blkdev_issue_zeroout - generate number of zero filed write bios 118 * blkdev_issue_zeroout - generate number of zero filed write bios
@@ -220,7 +125,7 @@ EXPORT_SYMBOL(blkdev_issue_write_same);
220 * Generate and issue number of bios with zerofiled pages. 125 * Generate and issue number of bios with zerofiled pages.
221 */ 126 */
222 127
223int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, 128int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
224 sector_t nr_sects, gfp_t gfp_mask) 129 sector_t nr_sects, gfp_t gfp_mask)
225{ 130{
226 int ret; 131 int ret;
@@ -270,32 +175,4 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
270 175
271 return ret; 176 return ret;
272} 177}
273
274/**
275 * blkdev_issue_zeroout - zero-fill a block range
276 * @bdev: blockdev to write
277 * @sector: start sector
278 * @nr_sects: number of sectors to write
279 * @gfp_mask: memory allocation flags (for bio_alloc)
280 *
281 * Description:
282 * Generate and issue number of bios with zerofiled pages.
283 */
284
285int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
286 sector_t nr_sects, gfp_t gfp_mask)
287{
288 if (bdev_write_same(bdev)) {
289 unsigned char bdn[BDEVNAME_SIZE];
290
291 if (!blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask,
292 ZERO_PAGE(0)))
293 return 0;
294
295 bdevname(bdev, bdn);
296 pr_err("%s: WRITE SAME failed. Manually zeroing.\n", bdn);
297 }
298
299 return __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask);
300}
301EXPORT_SYMBOL(blkdev_issue_zeroout); 178EXPORT_SYMBOL(blkdev_issue_zeroout);
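The blkdev_issue_discard() hunk drops the granularity/alignment-aware splitting (and the plug around the submit loop) in favour of masking max_discard_sectors down to a multiple of the discard granularity, which assumes the granularity is a power of two; the removed code used sector_div() and so coped with arbitrary granularities. It also removes blkdev_issue_write_same() entirely. A trivial standalone comparison of the two roundings, for illustration only:

#include <stdio.h>

/*
 * Round 'sectors' down to a multiple of a power-of-two granularity,
 * as the restored blkdev_issue_discard() does with the &= ~(g - 1) mask.
 */
static unsigned int round_down_pow2(unsigned int sectors, unsigned int gran)
{
        return sectors & ~(gran - 1);
}

/*
 * Round down to an arbitrary (not necessarily power-of-two) granularity,
 * roughly what the removed sector_div()-based code achieved.
 */
static unsigned long long round_down_any(unsigned long long sectors,
                                         unsigned long long gran)
{
        return (sectors / gran) * gran;
}

int main(void)
{
        /* 65535 sectors capped against 8-sector and 24-sector granules. */
        printf("%u\n", round_down_pow2(65535, 8));              /* 65528 */
        printf("%llu\n", round_down_any(65535ULL, 24ULL));      /* 65520 */
        return 0;
}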
diff --git a/block/blk-map.c b/block/blk-map.c
index 623e1cd4cff..164cd005970 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -311,7 +311,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
311 if (IS_ERR(bio)) 311 if (IS_ERR(bio))
312 return PTR_ERR(bio); 312 return PTR_ERR(bio);
313 313
314 if (!reading) 314 if (rq_data_dir(rq) == WRITE)
315 bio->bi_rw |= REQ_WRITE; 315 bio->bi_rw |= REQ_WRITE;
316 316
317 if (do_copy) 317 if (do_copy)
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 936a110de0b..cfcc37cb222 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -110,49 +110,6 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
110 return 0; 110 return 0;
111} 111}
112 112
113static void
114__blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
115 struct scatterlist *sglist, struct bio_vec **bvprv,
116 struct scatterlist **sg, int *nsegs, int *cluster)
117{
118
119 int nbytes = bvec->bv_len;
120
121 if (*bvprv && *cluster) {
122 if ((*sg)->length + nbytes > queue_max_segment_size(q))
123 goto new_segment;
124
125 if (!BIOVEC_PHYS_MERGEABLE(*bvprv, bvec))
126 goto new_segment;
127 if (!BIOVEC_SEG_BOUNDARY(q, *bvprv, bvec))
128 goto new_segment;
129
130 (*sg)->length += nbytes;
131 } else {
132new_segment:
133 if (!*sg)
134 *sg = sglist;
135 else {
136 /*
137 * If the driver previously mapped a shorter
138 * list, we could see a termination bit
139 * prematurely unless it fully inits the sg
140 * table on each mapping. We KNOW that there
141 * must be more entries here or the driver
142 * would be buggy, so force clear the
143 * termination bit to avoid doing a full
144 * sg_init_table() in drivers for each command.
145 */
146 (*sg)->page_link &= ~0x02;
147 *sg = sg_next(*sg);
148 }
149
150 sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset);
151 (*nsegs)++;
152 }
153 *bvprv = bvec;
154}
155
156/* 113/*
157 * map a request to scatterlist, return number of sg entries setup. Caller 114 * map a request to scatterlist, return number of sg entries setup. Caller
158 * must make sure sg can hold rq->nr_phys_segments entries 115 * must make sure sg can hold rq->nr_phys_segments entries
@@ -174,8 +131,41 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
174 bvprv = NULL; 131 bvprv = NULL;
175 sg = NULL; 132 sg = NULL;
176 rq_for_each_segment(bvec, rq, iter) { 133 rq_for_each_segment(bvec, rq, iter) {
177 __blk_segment_map_sg(q, bvec, sglist, &bvprv, &sg, 134 int nbytes = bvec->bv_len;
178 &nsegs, &cluster); 135
136 if (bvprv && cluster) {
137 if (sg->length + nbytes > queue_max_segment_size(q))
138 goto new_segment;
139
140 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
141 goto new_segment;
142 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec))
143 goto new_segment;
144
145 sg->length += nbytes;
146 } else {
147new_segment:
148 if (!sg)
149 sg = sglist;
150 else {
151 /*
152 * If the driver previously mapped a shorter
153 * list, we could see a termination bit
154 * prematurely unless it fully inits the sg
155 * table on each mapping. We KNOW that there
156 * must be more entries here or the driver
157 * would be buggy, so force clear the
158 * termination bit to avoid doing a full
159 * sg_init_table() in drivers for each command.
160 */
161 sg->page_link &= ~0x02;
162 sg = sg_next(sg);
163 }
164
165 sg_set_page(sg, bvec->bv_page, nbytes, bvec->bv_offset);
166 nsegs++;
167 }
168 bvprv = bvec;
179 } /* segments in rq */ 169 } /* segments in rq */
180 170
181 171
@@ -209,43 +199,6 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
209} 199}
210EXPORT_SYMBOL(blk_rq_map_sg); 200EXPORT_SYMBOL(blk_rq_map_sg);
211 201
212/**
213 * blk_bio_map_sg - map a bio to a scatterlist
214 * @q: request_queue in question
215 * @bio: bio being mapped
216 * @sglist: scatterlist being mapped
217 *
218 * Note:
219 * Caller must make sure sg can hold bio->bi_phys_segments entries
220 *
221 * Will return the number of sg entries setup
222 */
223int blk_bio_map_sg(struct request_queue *q, struct bio *bio,
224 struct scatterlist *sglist)
225{
226 struct bio_vec *bvec, *bvprv;
227 struct scatterlist *sg;
228 int nsegs, cluster;
229 unsigned long i;
230
231 nsegs = 0;
232 cluster = blk_queue_cluster(q);
233
234 bvprv = NULL;
235 sg = NULL;
236 bio_for_each_segment(bvec, bio, i) {
237 __blk_segment_map_sg(q, bvec, sglist, &bvprv, &sg,
238 &nsegs, &cluster);
239 } /* segments in bio */
240
241 if (sg)
242 sg_mark_end(sg);
243
244 BUG_ON(bio->bi_phys_segments && nsegs > bio->bi_phys_segments);
245 return nsegs;
246}
247EXPORT_SYMBOL(blk_bio_map_sg);
248
249static inline int ll_new_hw_segment(struct request_queue *q, 202static inline int ll_new_hw_segment(struct request_queue *q,
250 struct request *req, 203 struct request *req,
251 struct bio *bio) 204 struct bio *bio)
@@ -275,8 +228,14 @@ no_merge:
275int ll_back_merge_fn(struct request_queue *q, struct request *req, 228int ll_back_merge_fn(struct request_queue *q, struct request *req,
276 struct bio *bio) 229 struct bio *bio)
277{ 230{
278 if (blk_rq_sectors(req) + bio_sectors(bio) > 231 unsigned short max_sectors;
279 blk_rq_get_max_sectors(req)) { 232
233 if (unlikely(req->cmd_type == REQ_TYPE_BLOCK_PC))
234 max_sectors = queue_max_hw_sectors(q);
235 else
236 max_sectors = queue_max_sectors(q);
237
238 if (blk_rq_sectors(req) + bio_sectors(bio) > max_sectors) {
280 req->cmd_flags |= REQ_NOMERGE; 239 req->cmd_flags |= REQ_NOMERGE;
281 if (req == q->last_merge) 240 if (req == q->last_merge)
282 q->last_merge = NULL; 241 q->last_merge = NULL;
@@ -293,8 +252,15 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req,
293int ll_front_merge_fn(struct request_queue *q, struct request *req, 252int ll_front_merge_fn(struct request_queue *q, struct request *req,
294 struct bio *bio) 253 struct bio *bio)
295{ 254{
296 if (blk_rq_sectors(req) + bio_sectors(bio) > 255 unsigned short max_sectors;
297 blk_rq_get_max_sectors(req)) { 256
257 if (unlikely(req->cmd_type == REQ_TYPE_BLOCK_PC))
258 max_sectors = queue_max_hw_sectors(q);
259 else
260 max_sectors = queue_max_sectors(q);
261
262
263 if (blk_rq_sectors(req) + bio_sectors(bio) > max_sectors) {
298 req->cmd_flags |= REQ_NOMERGE; 264 req->cmd_flags |= REQ_NOMERGE;
299 if (req == q->last_merge) 265 if (req == q->last_merge)
300 q->last_merge = NULL; 266 q->last_merge = NULL;
@@ -325,8 +291,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
325 /* 291 /*
326 * Will it become too large? 292 * Will it become too large?
327 */ 293 */
328 if ((blk_rq_sectors(req) + blk_rq_sectors(next)) > 294 if ((blk_rq_sectors(req) + blk_rq_sectors(next)) > queue_max_sectors(q))
329 blk_rq_get_max_sectors(req))
330 return 0; 295 return 0;
331 296
332 total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; 297 total_phys_segments = req->nr_phys_segments + next->nr_phys_segments;
@@ -405,7 +370,16 @@ static int attempt_merge(struct request_queue *q, struct request *req,
405 if (!rq_mergeable(req) || !rq_mergeable(next)) 370 if (!rq_mergeable(req) || !rq_mergeable(next))
406 return 0; 371 return 0;
407 372
408 if (!blk_check_merge_flags(req->cmd_flags, next->cmd_flags)) 373 /*
374 * Don't merge file system requests and discard requests
375 */
376 if ((req->cmd_flags & REQ_DISCARD) != (next->cmd_flags & REQ_DISCARD))
377 return 0;
378
379 /*
380 * Don't merge discard requests and secure discard requests
381 */
382 if ((req->cmd_flags & REQ_SECURE) != (next->cmd_flags & REQ_SECURE))
409 return 0; 383 return 0;
410 384
411 /* 385 /*
@@ -419,10 +393,6 @@ static int attempt_merge(struct request_queue *q, struct request *req,
419 || next->special) 393 || next->special)
420 return 0; 394 return 0;
421 395
422 if (req->cmd_flags & REQ_WRITE_SAME &&
423 !blk_write_same_mergeable(req->bio, next->bio))
424 return 0;
425
426 /* 396 /*
427 * If we are allowed to merge, then append bio list 397 * If we are allowed to merge, then append bio list
428 * from next to rq and release next. merge_requests_fn 398 * from next to rq and release next. merge_requests_fn
@@ -501,40 +471,3 @@ int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
501{ 471{
502 return attempt_merge(q, rq, next); 472 return attempt_merge(q, rq, next);
503} 473}
504
505bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
506{
507 if (!rq_mergeable(rq) || !bio_mergeable(bio))
508 return false;
509
510 if (!blk_check_merge_flags(rq->cmd_flags, bio->bi_rw))
511 return false;
512
513 /* different data direction or already started, don't merge */
514 if (bio_data_dir(bio) != rq_data_dir(rq))
515 return false;
516
517 /* must be same device and not a special request */
518 if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special)
519 return false;
520
521 /* only merge integrity protected bio into ditto rq */
522 if (bio_integrity(bio) != blk_integrity_rq(rq))
523 return false;
524
525 /* must be using the same buffer */
526 if (rq->cmd_flags & REQ_WRITE_SAME &&
527 !blk_write_same_mergeable(rq->bio, bio))
528 return false;
529
530 return true;
531}
532
533int blk_try_merge(struct request *rq, struct bio *bio)
534{
535 if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_sector)
536 return ELEVATOR_BACK_MERGE;
537 else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_sector)
538 return ELEVATOR_FRONT_MERGE;
539 return ELEVATOR_NO_MERGE;
540}
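The blk-merge.c hunks fold the factored-out __blk_segment_map_sg() (and the bio-level blk_bio_map_sg()) back into blk_rq_map_sg(), but the coalescing rule itself is unchanged: extend the current scatterlist entry while the next bio_vec is contiguous with it and the merged length stays under the queue's segment-size limit. A stripped-down sketch of just that rule, ignoring the BIOVEC_PHYS_MERGEABLE / BIOVEC_SEG_BOUNDARY checks and the sg table handling; extent and MAX_SEGMENT are hypothetical stand-ins:

#include <stdio.h>

/* A byte extent standing in for a bio_vec's (page, offset, len). */
struct extent {
        unsigned long start;
        unsigned long len;
};

#define MAX_SEGMENT 4096UL      /* stand-in for queue_max_segment_size() */

/*
 * Coalesce adjacent extents into "segments" the way the rq_for_each_segment
 * loop above grows sg->length: merge only if the previous extent ends exactly
 * where the next begins and the merged length stays under the cap.
 */
static int map_segments(const struct extent *ev, int n, struct extent *seg)
{
        int i, nsegs = 0;

        for (i = 0; i < n; i++) {
                if (nsegs &&
                    seg[nsegs - 1].start + seg[nsegs - 1].len == ev[i].start &&
                    seg[nsegs - 1].len + ev[i].len <= MAX_SEGMENT) {
                        seg[nsegs - 1].len += ev[i].len;        /* extend */
                } else {
                        seg[nsegs] = ev[i];                     /* new segment */
                        nsegs++;
                }
        }
        return nsegs;
}

int main(void)
{
        struct extent ev[] = {
                { 0x1000, 512 }, { 0x1200, 512 },       /* contiguous: merge */
                { 0x8000, 512 },                        /* gap: new segment */
        };
        struct extent seg[3];
        int i, n = map_segments(ev, 3, seg);

        for (i = 0; i < n; i++)
                printf("seg %d: start=0x%lx len=%lu\n", i, seg[i].start, seg[i].len);
        return 0;
}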
diff --git a/block/blk-settings.c b/block/blk-settings.c
index c50ecf0ea3b..fa1eb0449a0 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -104,7 +104,9 @@ EXPORT_SYMBOL_GPL(blk_queue_lld_busy);
104 * @lim: the queue_limits structure to reset 104 * @lim: the queue_limits structure to reset
105 * 105 *
106 * Description: 106 * Description:
107 * Returns a queue_limit struct to its default state. 107 * Returns a queue_limit struct to its default state. Can be used by
108 * stacking drivers like DM that stage table swaps and reuse an
109 * existing device queue.
108 */ 110 */
109void blk_set_default_limits(struct queue_limits *lim) 111void blk_set_default_limits(struct queue_limits *lim)
110{ 112{
@@ -112,13 +114,13 @@ void blk_set_default_limits(struct queue_limits *lim)
112 lim->max_integrity_segments = 0; 114 lim->max_integrity_segments = 0;
113 lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; 115 lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
114 lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; 116 lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
115 lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; 117 lim->max_sectors = BLK_DEF_MAX_SECTORS;
116 lim->max_write_same_sectors = 0; 118 lim->max_hw_sectors = INT_MAX;
117 lim->max_discard_sectors = 0; 119 lim->max_discard_sectors = 0;
118 lim->discard_granularity = 0; 120 lim->discard_granularity = 0;
119 lim->discard_alignment = 0; 121 lim->discard_alignment = 0;
120 lim->discard_misaligned = 0; 122 lim->discard_misaligned = 0;
121 lim->discard_zeroes_data = 0; 123 lim->discard_zeroes_data = 1;
122 lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; 124 lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
123 lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); 125 lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
124 lim->alignment_offset = 0; 126 lim->alignment_offset = 0;
@@ -129,27 +131,6 @@ void blk_set_default_limits(struct queue_limits *lim)
129EXPORT_SYMBOL(blk_set_default_limits); 131EXPORT_SYMBOL(blk_set_default_limits);
130 132
131/** 133/**
132 * blk_set_stacking_limits - set default limits for stacking devices
133 * @lim: the queue_limits structure to reset
134 *
135 * Description:
136 * Returns a queue_limit struct to its default state. Should be used
137 * by stacking drivers like DM that have no internal limits.
138 */
139void blk_set_stacking_limits(struct queue_limits *lim)
140{
141 blk_set_default_limits(lim);
142
143 /* Inherit limits from component devices */
144 lim->discard_zeroes_data = 1;
145 lim->max_segments = USHRT_MAX;
146 lim->max_hw_sectors = UINT_MAX;
147 lim->max_sectors = UINT_MAX;
148 lim->max_write_same_sectors = UINT_MAX;
149}
150EXPORT_SYMBOL(blk_set_stacking_limits);
151
152/**
153 * blk_queue_make_request - define an alternate make_request function for a device 134 * blk_queue_make_request - define an alternate make_request function for a device
154 * @q: the request queue for the device to be affected 135 * @q: the request queue for the device to be affected
155 * @mfn: the alternate make_request function 136 * @mfn: the alternate make_request function
@@ -184,6 +165,8 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
184 q->nr_batching = BLK_BATCH_REQ; 165 q->nr_batching = BLK_BATCH_REQ;
185 166
186 blk_set_default_limits(&q->limits); 167 blk_set_default_limits(&q->limits);
168 blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS);
169 q->limits.discard_zeroes_data = 0;
187 170
188 /* 171 /*
189 * by default assume old behaviour and bounce for any highmem page 172 * by default assume old behaviour and bounce for any highmem page
@@ -288,18 +271,6 @@ void blk_queue_max_discard_sectors(struct request_queue *q,
288EXPORT_SYMBOL(blk_queue_max_discard_sectors); 271EXPORT_SYMBOL(blk_queue_max_discard_sectors);
289 272
290/** 273/**
291 * blk_queue_max_write_same_sectors - set max sectors for a single write same
292 * @q: the request queue for the device
293 * @max_write_same_sectors: maximum number of sectors to write per command
294 **/
295void blk_queue_max_write_same_sectors(struct request_queue *q,
296 unsigned int max_write_same_sectors)
297{
298 q->limits.max_write_same_sectors = max_write_same_sectors;
299}
300EXPORT_SYMBOL(blk_queue_max_write_same_sectors);
301
302/**
303 * blk_queue_max_segments - set max hw segments for a request for this queue 274 * blk_queue_max_segments - set max hw segments for a request for this queue
304 * @q: the request queue for the device 275 * @q: the request queue for the device
305 * @max_segments: max number of segments 276 * @max_segments: max number of segments
@@ -524,8 +495,6 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
524 495
525 t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); 496 t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
526 t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); 497 t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
527 t->max_write_same_sectors = min(t->max_write_same_sectors,
528 b->max_write_same_sectors);
529 t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn); 498 t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn);
530 499
531 t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, 500 t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
@@ -611,7 +580,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
611 bottom = b->discard_granularity + alignment; 580 bottom = b->discard_granularity + alignment;
612 581
613 /* Verify that top and bottom intervals line up */ 582 /* Verify that top and bottom intervals line up */
614 if ((max(top, bottom) % min(top, bottom)) != 0) 583 if (max(top, bottom) & (min(top, bottom) - 1))
615 t->discard_misaligned = 1; 584 t->discard_misaligned = 1;
616 } 585 }
617 586
@@ -619,8 +588,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
619 b->max_discard_sectors); 588 b->max_discard_sectors);
620 t->discard_granularity = max(t->discard_granularity, 589 t->discard_granularity = max(t->discard_granularity,
621 b->discard_granularity); 590 b->discard_granularity);
622 t->discard_alignment = lcm(t->discard_alignment, alignment) % 591 t->discard_alignment = lcm(t->discard_alignment, alignment) &
623 t->discard_granularity; 592 (t->discard_granularity - 1);
624 } 593 }
625 594
626 return ret; 595 return ret;
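In blk_stack_limits(), the diff replaces the modulo-based discard alignment arithmetic with mask arithmetic (& (granularity - 1)), which is only equivalent when the smaller granularity is a power of two. A small standalone illustration of where the two misalignment checks diverge; the sector counts are chosen arbitrarily and this is not the kernel helper:

#include <stdio.h>

/* "Do the top and bottom discard intervals line up?" using a true modulo,
 * as in the removed upstream check. */
static int misaligned_mod(unsigned int top, unsigned int bottom)
{
        unsigned int hi = top > bottom ? top : bottom;
        unsigned int lo = top > bottom ? bottom : top;

        return (hi % lo) != 0;
}

/* Same question using the restored mask form, valid only for power-of-two
 * granularities. */
static int misaligned_mask(unsigned int top, unsigned int bottom)
{
        unsigned int hi = top > bottom ? top : bottom;
        unsigned int lo = top > bottom ? bottom : top;

        return (hi & (lo - 1)) != 0;
}

int main(void)
{
        /* Power-of-two granularity: both agree (prints "0 0"). */
        printf("%d %d\n", misaligned_mod(4096, 512), misaligned_mask(4096, 512));
        /* Non-power-of-two granularity: the mask check misses the
         * misalignment (prints "1 0"). */
        printf("%d %d\n", misaligned_mod(1024, 768), misaligned_mask(1024, 768));
        return 0;
}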
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index 467c8de8864..1366a89d8e6 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -8,7 +8,6 @@
8#include <linux/blkdev.h> 8#include <linux/blkdev.h>
9#include <linux/interrupt.h> 9#include <linux/interrupt.h>
10#include <linux/cpu.h> 10#include <linux/cpu.h>
11#include <linux/sched.h>
12 11
13#include "blk.h" 12#include "blk.h"
14 13
@@ -104,10 +103,9 @@ static struct notifier_block __cpuinitdata blk_cpu_notifier = {
104 103
105void __blk_complete_request(struct request *req) 104void __blk_complete_request(struct request *req)
106{ 105{
107 int ccpu, cpu; 106 int ccpu, cpu, group_cpu = NR_CPUS;
108 struct request_queue *q = req->q; 107 struct request_queue *q = req->q;
109 unsigned long flags; 108 unsigned long flags;
110 bool shared = false;
111 109
112 BUG_ON(!q->softirq_done_fn); 110 BUG_ON(!q->softirq_done_fn);
113 111
@@ -119,20 +117,22 @@ void __blk_complete_request(struct request *req)
119 */ 117 */
120 if (req->cpu != -1) { 118 if (req->cpu != -1) {
121 ccpu = req->cpu; 119 ccpu = req->cpu;
122 if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) 120 if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) {
123 shared = cpus_share_cache(cpu, ccpu); 121 ccpu = blk_cpu_to_group(ccpu);
122 group_cpu = blk_cpu_to_group(cpu);
123 }
124 } else 124 } else
125 ccpu = cpu; 125 ccpu = cpu;
126 126
127 /* 127 /*
128 * If current CPU and requested CPU share a cache, run the softirq on 128 * If current CPU and requested CPU are in the same group, running
129 * the current CPU. One might concern this is just like 129 * softirq in current CPU. One might concern this is just like
130 * QUEUE_FLAG_SAME_FORCE, but actually not. blk_complete_request() is 130 * QUEUE_FLAG_SAME_FORCE, but actually not. blk_complete_request() is
131 * running in interrupt handler, and currently I/O controller doesn't 131 * running in interrupt handler, and currently I/O controller doesn't
132 * support multiple interrupts, so current CPU is unique actually. This 132 * support multiple interrupts, so current CPU is unique actually. This
133 * avoids IPI sending from current CPU to the first CPU of a group. 133 * avoids IPI sending from current CPU to the first CPU of a group.
134 */ 134 */
135 if (ccpu == cpu || shared) { 135 if (ccpu == cpu || ccpu == group_cpu) {
136 struct list_head *list; 136 struct list_head *list;
137do_local: 137do_local:
138 list = &__get_cpu_var(blk_cpu_done); 138 list = &__get_cpu_var(blk_cpu_done);
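__blk_complete_request() goes back to comparing blk_cpu_to_group() results rather than using cpus_share_cache() when deciding whether to run the completion softirq locally or raise it on the submitting CPU. A condensed, purely illustrative decision helper; the signature is hypothetical, same_group stands in for either the cache-sharing or CPU-group test, and force_requested_cpu for QUEUE_FLAG_SAME_FORCE:

#include <stdio.h>
#include <stdbool.h>

/*
 * Run the completion locally when no CPU was requested, when the requested
 * CPU is the current one, or when the two CPUs are considered "close" and
 * strict placement was not forced; otherwise raise the softirq on the
 * requested CPU via IPI.
 */
static bool complete_locally(int cur_cpu, int req_cpu, bool same_group,
                             bool force_requested_cpu)
{
        if (req_cpu < 0 || req_cpu == cur_cpu)
                return true;
        if (!force_requested_cpu && same_group)
                return true;
        return false;
}

int main(void)
{
        printf("%d\n", complete_locally(0, -1, false, false));  /* 1: no preference */
        printf("%d\n", complete_locally(0, 2, true, false));    /* 1: same group */
        printf("%d\n", complete_locally(0, 2, true, true));     /* 0: SAME_FORCE set */
        return 0;
}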
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 788147797a7..60fda88c57f 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -9,7 +9,6 @@
9#include <linux/blktrace_api.h> 9#include <linux/blktrace_api.h>
10 10
11#include "blk.h" 11#include "blk.h"
12#include "blk-cgroup.h"
13 12
14struct queue_sysfs_entry { 13struct queue_sysfs_entry {
15 struct attribute attr; 14 struct attribute attr;
@@ -26,15 +25,9 @@ queue_var_show(unsigned long var, char *page)
26static ssize_t 25static ssize_t
27queue_var_store(unsigned long *var, const char *page, size_t count) 26queue_var_store(unsigned long *var, const char *page, size_t count)
28{ 27{
29 int err; 28 char *p = (char *) page;
30 unsigned long v;
31
32 err = strict_strtoul(page, 10, &v);
33 if (err || v > UINT_MAX)
34 return -EINVAL;
35
36 *var = v;
37 29
30 *var = simple_strtoul(p, &p, 10);
38 return count; 31 return count;
39} 32}
40 33
@@ -46,7 +39,7 @@ static ssize_t queue_requests_show(struct request_queue *q, char *page)
46static ssize_t 39static ssize_t
47queue_requests_store(struct request_queue *q, const char *page, size_t count) 40queue_requests_store(struct request_queue *q, const char *page, size_t count)
48{ 41{
49 struct request_list *rl; 42 struct request_list *rl = &q->rq;
50 unsigned long nr; 43 unsigned long nr;
51 int ret; 44 int ret;
52 45
@@ -54,9 +47,6 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
54 return -EINVAL; 47 return -EINVAL;
55 48
56 ret = queue_var_store(&nr, page, count); 49 ret = queue_var_store(&nr, page, count);
57 if (ret < 0)
58 return ret;
59
60 if (nr < BLKDEV_MIN_RQ) 50 if (nr < BLKDEV_MIN_RQ)
61 nr = BLKDEV_MIN_RQ; 51 nr = BLKDEV_MIN_RQ;
62 52
@@ -64,9 +54,6 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
64 q->nr_requests = nr; 54 q->nr_requests = nr;
65 blk_queue_congestion_threshold(q); 55 blk_queue_congestion_threshold(q);
66 56
67 /* congestion isn't cgroup aware and follows root blkcg for now */
68 rl = &q->root_rl;
69
70 if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q)) 57 if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
71 blk_set_queue_congested(q, BLK_RW_SYNC); 58 blk_set_queue_congested(q, BLK_RW_SYNC);
72 else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q)) 59 else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
@@ -77,22 +64,19 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
77 else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q)) 64 else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
78 blk_clear_queue_congested(q, BLK_RW_ASYNC); 65 blk_clear_queue_congested(q, BLK_RW_ASYNC);
79 66
80 blk_queue_for_each_rl(rl, q) { 67 if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
81 if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { 68 blk_set_queue_full(q, BLK_RW_SYNC);
82 blk_set_rl_full(rl, BLK_RW_SYNC); 69 } else {
83 } else { 70 blk_clear_queue_full(q, BLK_RW_SYNC);
84 blk_clear_rl_full(rl, BLK_RW_SYNC); 71 wake_up(&rl->wait[BLK_RW_SYNC]);
85 wake_up(&rl->wait[BLK_RW_SYNC]);
86 }
87
88 if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
89 blk_set_rl_full(rl, BLK_RW_ASYNC);
90 } else {
91 blk_clear_rl_full(rl, BLK_RW_ASYNC);
92 wake_up(&rl->wait[BLK_RW_ASYNC]);
93 }
94 } 72 }
95 73
74 if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
75 blk_set_queue_full(q, BLK_RW_ASYNC);
76 } else {
77 blk_clear_queue_full(q, BLK_RW_ASYNC);
78 wake_up(&rl->wait[BLK_RW_ASYNC]);
79 }
96 spin_unlock_irq(q->queue_lock); 80 spin_unlock_irq(q->queue_lock);
97 return ret; 81 return ret;
98} 82}
@@ -111,9 +95,6 @@ queue_ra_store(struct request_queue *q, const char *page, size_t count)
111 unsigned long ra_kb; 95 unsigned long ra_kb;
112 ssize_t ret = queue_var_store(&ra_kb, page, count); 96 ssize_t ret = queue_var_store(&ra_kb, page, count);
113 97
114 if (ret < 0)
115 return ret;
116
117 q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10); 98 q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10);
118 99
119 return ret; 100 return ret;
@@ -180,13 +161,6 @@ static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *pag
180 return queue_var_show(queue_discard_zeroes_data(q), page); 161 return queue_var_show(queue_discard_zeroes_data(q), page);
181} 162}
182 163
183static ssize_t queue_write_same_max_show(struct request_queue *q, char *page)
184{
185 return sprintf(page, "%llu\n",
186 (unsigned long long)q->limits.max_write_same_sectors << 9);
187}
188
189
190static ssize_t 164static ssize_t
191queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) 165queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
192{ 166{
@@ -195,9 +169,6 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
195 page_kb = 1 << (PAGE_CACHE_SHIFT - 10); 169 page_kb = 1 << (PAGE_CACHE_SHIFT - 10);
196 ssize_t ret = queue_var_store(&max_sectors_kb, page, count); 170 ssize_t ret = queue_var_store(&max_sectors_kb, page, count);
197 171
198 if (ret < 0)
199 return ret;
200
201 if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb) 172 if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb)
202 return -EINVAL; 173 return -EINVAL;
203 174
@@ -258,9 +229,6 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page,
258 unsigned long nm; 229 unsigned long nm;
259 ssize_t ret = queue_var_store(&nm, page, count); 230 ssize_t ret = queue_var_store(&nm, page, count);
260 231
261 if (ret < 0)
262 return ret;
263
264 spin_lock_irq(q->queue_lock); 232 spin_lock_irq(q->queue_lock);
265 queue_flag_clear(QUEUE_FLAG_NOMERGES, q); 233 queue_flag_clear(QUEUE_FLAG_NOMERGES, q);
266 queue_flag_clear(QUEUE_FLAG_NOXMERGES, q); 234 queue_flag_clear(QUEUE_FLAG_NOXMERGES, q);
@@ -289,9 +257,6 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
289 unsigned long val; 257 unsigned long val;
290 258
291 ret = queue_var_store(&val, page, count); 259 ret = queue_var_store(&val, page, count);
292 if (ret < 0)
293 return ret;
294
295 spin_lock_irq(q->queue_lock); 260 spin_lock_irq(q->queue_lock);
296 if (val == 2) { 261 if (val == 2) {
297 queue_flag_set(QUEUE_FLAG_SAME_COMP, q); 262 queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
@@ -392,11 +357,6 @@ static struct queue_sysfs_entry queue_discard_zeroes_data_entry = {
392 .show = queue_discard_zeroes_data_show, 357 .show = queue_discard_zeroes_data_show,
393}; 358};
394 359
395static struct queue_sysfs_entry queue_write_same_max_entry = {
396 .attr = {.name = "write_same_max_bytes", .mode = S_IRUGO },
397 .show = queue_write_same_max_show,
398};
399
400static struct queue_sysfs_entry queue_nonrot_entry = { 360static struct queue_sysfs_entry queue_nonrot_entry = {
401 .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR }, 361 .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
402 .show = queue_show_nonrot, 362 .show = queue_show_nonrot,
@@ -444,7 +404,6 @@ static struct attribute *default_attrs[] = {
444 &queue_discard_granularity_entry.attr, 404 &queue_discard_granularity_entry.attr,
445 &queue_discard_max_entry.attr, 405 &queue_discard_max_entry.attr,
446 &queue_discard_zeroes_data_entry.attr, 406 &queue_discard_zeroes_data_entry.attr,
447 &queue_write_same_max_entry.attr,
448 &queue_nonrot_entry.attr, 407 &queue_nonrot_entry.attr,
449 &queue_nomerges_entry.attr, 408 &queue_nomerges_entry.attr,
450 &queue_rq_affinity_entry.attr, 409 &queue_rq_affinity_entry.attr,
@@ -466,7 +425,7 @@ queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
466 if (!entry->show) 425 if (!entry->show)
467 return -EIO; 426 return -EIO;
468 mutex_lock(&q->sysfs_lock); 427 mutex_lock(&q->sysfs_lock);
469 if (blk_queue_dying(q)) { 428 if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
470 mutex_unlock(&q->sysfs_lock); 429 mutex_unlock(&q->sysfs_lock);
471 return -ENOENT; 430 return -ENOENT;
472 } 431 }
@@ -488,7 +447,7 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
488 447
489 q = container_of(kobj, struct request_queue, kobj); 448 q = container_of(kobj, struct request_queue, kobj);
490 mutex_lock(&q->sysfs_lock); 449 mutex_lock(&q->sysfs_lock);
491 if (blk_queue_dying(q)) { 450 if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
492 mutex_unlock(&q->sysfs_lock); 451 mutex_unlock(&q->sysfs_lock);
493 return -ENOENT; 452 return -ENOENT;
494 } 453 }
@@ -498,11 +457,11 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
498} 457}
499 458
500/** 459/**
501 * blk_release_queue: - release a &struct request_queue when it is no longer needed 460 * blk_cleanup_queue: - release a &struct request_queue when it is no longer needed
502 * @kobj: the kobj belonging to the request queue to be released 461 * @kobj: the kobj belonging of the request queue to be released
503 * 462 *
504 * Description: 463 * Description:
505 * blk_release_queue is the pair to blk_init_queue() or 464 * blk_cleanup_queue is the pair to blk_init_queue() or
506 * blk_queue_make_request(). It should be called when a request queue is 465 * blk_queue_make_request(). It should be called when a request queue is
507 * being released; typically when a block device is being de-registered. 466 * being released; typically when a block device is being de-registered.
508 * Currently, its primary task it to free all the &struct request 467 * Currently, its primary task it to free all the &struct request
@@ -516,19 +475,17 @@ static void blk_release_queue(struct kobject *kobj)
516{ 475{
517 struct request_queue *q = 476 struct request_queue *q =
518 container_of(kobj, struct request_queue, kobj); 477 container_of(kobj, struct request_queue, kobj);
478 struct request_list *rl = &q->rq;
519 479
520 blk_sync_queue(q); 480 blk_sync_queue(q);
521 481
522 blkcg_exit_queue(q); 482 if (q->elevator)
523
524 if (q->elevator) {
525 spin_lock_irq(q->queue_lock);
526 ioc_clear_queue(q);
527 spin_unlock_irq(q->queue_lock);
528 elevator_exit(q->elevator); 483 elevator_exit(q->elevator);
529 }
530 484
531 blk_exit_rl(&q->root_rl); 485 blk_throtl_exit(q);
486
487 if (rl->rq_pool)
488 mempool_destroy(rl->rq_pool);
532 489
533 if (q->queue_tags) 490 if (q->queue_tags)
534 __blk_queue_free_tags(q); 491 __blk_queue_free_tags(q);
@@ -536,8 +493,6 @@ static void blk_release_queue(struct kobject *kobj)
536 blk_trace_shutdown(q); 493 blk_trace_shutdown(q);
537 494
538 bdi_destroy(&q->backing_dev_info); 495 bdi_destroy(&q->backing_dev_info);
539
540 ida_simple_remove(&blk_queue_ida, q->id);
541 kmem_cache_free(blk_requestq_cachep, q); 496 kmem_cache_free(blk_requestq_cachep, q);
542} 497}
543 498
@@ -561,12 +516,6 @@ int blk_register_queue(struct gendisk *disk)
561 if (WARN_ON(!q)) 516 if (WARN_ON(!q))
562 return -ENXIO; 517 return -ENXIO;
563 518
564 /*
565 * Initialization must be complete by now. Finish the initial
566 * bypass from queue allocation.
567 */
568 blk_queue_bypass_end(q);
569
570 ret = blk_trace_init_sysfs(dev); 519 ret = blk_trace_init_sysfs(dev);
571 if (ret) 520 if (ret)
572 return ret; 521 return ret;
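
The sysfs hunks above revert the show/store guards to an open-coded test of QUEUE_FLAG_DEAD taken under sysfs_lock, so attribute access fails with -ENOENT once queue teardown has started. A minimal userspace sketch of that guard pattern, assuming hypothetical names and a pthread mutex in place of sysfs_lock:

/* Illustrative sketch, not part of the patch: fail attribute access once
 * the object has been marked dead, checked under the same lock the
 * attribute handlers take.  All names here are hypothetical.
 */
#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_queue {
	pthread_mutex_t sysfs_lock;
	bool dead;		/* set once teardown has begun */
	long nr_requests;	/* the attribute being shown */
};

static int fake_attr_show(struct fake_queue *q, char *buf, size_t len)
{
	int ret;

	pthread_mutex_lock(&q->sysfs_lock);
	if (q->dead) {		/* mirrors test_bit(QUEUE_FLAG_DEAD, ...) */
		pthread_mutex_unlock(&q->sysfs_lock);
		return -ENOENT;
	}
	ret = snprintf(buf, len, "%ld\n", q->nr_requests);
	pthread_mutex_unlock(&q->sysfs_lock);
	return ret;
}
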
diff --git a/block/blk-tag.c b/block/blk-tag.c
index cc345e1d8d4..ece65fc4c79 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -186,8 +186,7 @@ int blk_queue_init_tags(struct request_queue *q, int depth,
186 tags = __blk_queue_init_tags(q, depth); 186 tags = __blk_queue_init_tags(q, depth);
187 187
188 if (!tags) 188 if (!tags)
189 return -ENOMEM; 189 goto fail;
190
191 } else if (q->queue_tags) { 190 } else if (q->queue_tags) {
192 rc = blk_queue_resize_tags(q, depth); 191 rc = blk_queue_resize_tags(q, depth);
193 if (rc) 192 if (rc)
@@ -204,6 +203,9 @@ int blk_queue_init_tags(struct request_queue *q, int depth,
204 queue_flag_set_unlocked(QUEUE_FLAG_QUEUED, q); 203 queue_flag_set_unlocked(QUEUE_FLAG_QUEUED, q);
205 INIT_LIST_HEAD(&q->tag_busy_list); 204 INIT_LIST_HEAD(&q->tag_busy_list);
206 return 0; 205 return 0;
206fail:
207 kfree(tags);
208 return -ENOMEM;
207} 209}
208EXPORT_SYMBOL(blk_queue_init_tags); 210EXPORT_SYMBOL(blk_queue_init_tags);
209 211
@@ -280,9 +282,16 @@ EXPORT_SYMBOL(blk_queue_resize_tags);
280void blk_queue_end_tag(struct request_queue *q, struct request *rq) 282void blk_queue_end_tag(struct request_queue *q, struct request *rq)
281{ 283{
282 struct blk_queue_tag *bqt = q->queue_tags; 284 struct blk_queue_tag *bqt = q->queue_tags;
283 unsigned tag = rq->tag; /* negative tags invalid */ 285 int tag = rq->tag;
286
287 BUG_ON(tag == -1);
284 288
285 BUG_ON(tag >= bqt->real_max_depth); 289 if (unlikely(tag >= bqt->real_max_depth))
290 /*
291 * This can happen after tag depth has been reduced.
292 * FIXME: how about a warning or info message here?
293 */
294 return;
286 295
287 list_del_init(&rq->queuelist); 296 list_del_init(&rq->queuelist);
288 rq->cmd_flags &= ~REQ_QUEUED; 297 rq->cmd_flags &= ~REQ_QUEUED;
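
The blk_queue_end_tag() hunk above swaps the unconditional BUG_ON for a quiet return when the completing request carries a tag at or above real_max_depth, which can legitimately happen once the tag depth has been reduced while older requests were still in flight. A hedged userspace sketch of that tolerance, with hypothetical types:

/* Illustrative sketch, not part of the patch: releasing a tag that is
 * beyond the current depth is tolerated rather than treated as a bug,
 * because the map may have been shrunk underneath an in-flight request.
 */
#include <stdbool.h>
#include <stdlib.h>

struct tag_map {
	unsigned int real_max_depth;
	bool *in_use;			/* in_use[i] is true while tag i is owned */
};

static void end_tag(struct tag_map *map, int tag)
{
	if (tag < 0)
		abort();		/* corresponds to BUG_ON(tag == -1) */
	if ((unsigned int)tag >= map->real_max_depth)
		return;			/* depth was reduced; nothing to clear */
	map->in_use[tag] = false;
}
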
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 31146225f3d..a19f58c6fc3 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -10,7 +10,6 @@
10#include <linux/bio.h> 10#include <linux/bio.h>
11#include <linux/blktrace_api.h> 11#include <linux/blktrace_api.h>
12#include "blk-cgroup.h" 12#include "blk-cgroup.h"
13#include "blk.h"
14 13
15/* Max dispatch from a group in 1 round */ 14/* Max dispatch from a group in 1 round */
16static int throtl_grp_quantum = 8; 15static int throtl_grp_quantum = 8;
@@ -21,8 +20,6 @@ static int throtl_quantum = 32;
21/* Throttling is performed over 100ms slice and after that slice is renewed */ 20/* Throttling is performed over 100ms slice and after that slice is renewed */
22static unsigned long throtl_slice = HZ/10; /* 100 ms */ 21static unsigned long throtl_slice = HZ/10; /* 100 ms */
23 22
24static struct blkcg_policy blkcg_policy_throtl;
25
26/* A workqueue to queue throttle related work */ 23/* A workqueue to queue throttle related work */
27static struct workqueue_struct *kthrotld_workqueue; 24static struct workqueue_struct *kthrotld_workqueue;
28static void throtl_schedule_delayed_work(struct throtl_data *td, 25static void throtl_schedule_delayed_work(struct throtl_data *td,
@@ -40,17 +37,9 @@ struct throtl_rb_root {
40 37
41#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) 38#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
42 39
43/* Per-cpu group stats */
44struct tg_stats_cpu {
45 /* total bytes transferred */
46 struct blkg_rwstat service_bytes;
47 /* total IOs serviced, post merge */
48 struct blkg_rwstat serviced;
49};
50
51struct throtl_grp { 40struct throtl_grp {
52 /* must be the first member */ 41 /* List of throtl groups on the request queue*/
53 struct blkg_policy_data pd; 42 struct hlist_node tg_node;
54 43
55 /* active throtl group service_tree member */ 44 /* active throtl group service_tree member */
56 struct rb_node rb_node; 45 struct rb_node rb_node;
@@ -62,6 +51,8 @@ struct throtl_grp {
62 */ 51 */
63 unsigned long disptime; 52 unsigned long disptime;
64 53
54 struct blkio_group blkg;
55 atomic_t ref;
65 unsigned int flags; 56 unsigned int flags;
66 57
67 /* Two lists for READ and WRITE */ 58 /* Two lists for READ and WRITE */
@@ -88,18 +79,18 @@ struct throtl_grp {
88 /* Some throttle limits got updated for the group */ 79 /* Some throttle limits got updated for the group */
89 int limits_changed; 80 int limits_changed;
90 81
91 /* Per cpu stats pointer */ 82 struct rcu_head rcu_head;
92 struct tg_stats_cpu __percpu *stats_cpu;
93
94 /* List of tgs waiting for per cpu stats memory to be allocated */
95 struct list_head stats_alloc_node;
96}; 83};
97 84
98struct throtl_data 85struct throtl_data
99{ 86{
87 /* List of throtl groups */
88 struct hlist_head tg_list;
89
100 /* service tree for active throtl groups */ 90 /* service tree for active throtl groups */
101 struct throtl_rb_root tg_service_tree; 91 struct throtl_rb_root tg_service_tree;
102 92
93 struct throtl_grp *root_tg;
103 struct request_queue *queue; 94 struct request_queue *queue;
104 95
105 /* Total Number of queued bios on READ and WRITE lists */ 96 /* Total Number of queued bios on READ and WRITE lists */
@@ -116,33 +107,6 @@ struct throtl_data
116 int limits_changed; 107 int limits_changed;
117}; 108};
118 109
119/* list and work item to allocate percpu group stats */
120static DEFINE_SPINLOCK(tg_stats_alloc_lock);
121static LIST_HEAD(tg_stats_alloc_list);
122
123static void tg_stats_alloc_fn(struct work_struct *);
124static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);
125
126static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
127{
128 return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
129}
130
131static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
132{
133 return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
134}
135
136static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
137{
138 return pd_to_blkg(&tg->pd);
139}
140
141static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
142{
143 return blkg_to_tg(td->queue->root_blkg);
144}
145
146enum tg_state_flags { 110enum tg_state_flags {
147 THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */ 111 THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */
148}; 112};
@@ -163,149 +127,254 @@ static inline int throtl_tg_##name(const struct throtl_grp *tg) \
163 127
164THROTL_TG_FNS(on_rr); 128THROTL_TG_FNS(on_rr);
165 129
166#define throtl_log_tg(td, tg, fmt, args...) do { \ 130#define throtl_log_tg(td, tg, fmt, args...) \
167 char __pbuf[128]; \ 131 blk_add_trace_msg((td)->queue, "throtl %s " fmt, \
168 \ 132 blkg_path(&(tg)->blkg), ##args); \
169 blkg_path(tg_to_blkg(tg), __pbuf, sizeof(__pbuf)); \
170 blk_add_trace_msg((td)->queue, "throtl %s " fmt, __pbuf, ##args); \
171} while (0)
172 133
173#define throtl_log(td, fmt, args...) \ 134#define throtl_log(td, fmt, args...) \
174 blk_add_trace_msg((td)->queue, "throtl " fmt, ##args) 135 blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
175 136
137static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg)
138{
139 if (blkg)
140 return container_of(blkg, struct throtl_grp, blkg);
141
142 return NULL;
143}
144
176static inline unsigned int total_nr_queued(struct throtl_data *td) 145static inline unsigned int total_nr_queued(struct throtl_data *td)
177{ 146{
178 return td->nr_queued[0] + td->nr_queued[1]; 147 return td->nr_queued[0] + td->nr_queued[1];
179} 148}
180 149
181/* 150static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg)
182 * Worker for allocating per cpu stat for tgs. This is scheduled on the
183 * system_wq once there are some groups on the alloc_list waiting for
184 * allocation.
185 */
186static void tg_stats_alloc_fn(struct work_struct *work)
187{ 151{
188 static struct tg_stats_cpu *stats_cpu; /* this fn is non-reentrant */ 152 atomic_inc(&tg->ref);
189 struct delayed_work *dwork = to_delayed_work(work); 153 return tg;
190 bool empty = false; 154}
191
192alloc_stats:
193 if (!stats_cpu) {
194 stats_cpu = alloc_percpu(struct tg_stats_cpu);
195 if (!stats_cpu) {
196 /* allocation failed, try again after some time */
197 schedule_delayed_work(dwork, msecs_to_jiffies(10));
198 return;
199 }
200 }
201
202 spin_lock_irq(&tg_stats_alloc_lock);
203 155
204 if (!list_empty(&tg_stats_alloc_list)) { 156static void throtl_free_tg(struct rcu_head *head)
205 struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list, 157{
206 struct throtl_grp, 158 struct throtl_grp *tg;
207 stats_alloc_node);
208 swap(tg->stats_cpu, stats_cpu);
209 list_del_init(&tg->stats_alloc_node);
210 }
211 159
212 empty = list_empty(&tg_stats_alloc_list); 160 tg = container_of(head, struct throtl_grp, rcu_head);
213 spin_unlock_irq(&tg_stats_alloc_lock); 161 free_percpu(tg->blkg.stats_cpu);
214 if (!empty) 162 kfree(tg);
215 goto alloc_stats;
216} 163}
217 164
218static void throtl_pd_init(struct blkcg_gq *blkg) 165static void throtl_put_tg(struct throtl_grp *tg)
219{ 166{
220 struct throtl_grp *tg = blkg_to_tg(blkg); 167 BUG_ON(atomic_read(&tg->ref) <= 0);
221 unsigned long flags; 168 if (!atomic_dec_and_test(&tg->ref))
169 return;
222 170
171 /*
172 * A group is freed in rcu manner. But having an rcu lock does not
173 * mean that one can access all the fields of blkg and assume these
174 * are valid. For example, don't try to follow throtl_data and
175 * request queue links.
176 *
177 * Having a reference to blkg under an rcu allows access to only
178 * values local to groups like group stats and group rate limits
179 */
180 call_rcu(&tg->rcu_head, throtl_free_tg);
181}
182
183static void throtl_init_group(struct throtl_grp *tg)
184{
185 INIT_HLIST_NODE(&tg->tg_node);
223 RB_CLEAR_NODE(&tg->rb_node); 186 RB_CLEAR_NODE(&tg->rb_node);
224 bio_list_init(&tg->bio_lists[0]); 187 bio_list_init(&tg->bio_lists[0]);
225 bio_list_init(&tg->bio_lists[1]); 188 bio_list_init(&tg->bio_lists[1]);
226 tg->limits_changed = false; 189 tg->limits_changed = false;
227 190
228 tg->bps[READ] = -1; 191 /* Practically unlimited BW */
229 tg->bps[WRITE] = -1; 192 tg->bps[0] = tg->bps[1] = -1;
230 tg->iops[READ] = -1; 193 tg->iops[0] = tg->iops[1] = -1;
231 tg->iops[WRITE] = -1;
232 194
233 /* 195 /*
234 * Ugh... We need to perform per-cpu allocation for tg->stats_cpu 196 * Take the initial reference that will be released on destroy
235 * but percpu allocator can't be called from IO path. Queue tg on 197 * This can be thought of a joint reference by cgroup and
236 * tg_stats_alloc_list and allocate from work item. 198 * request queue which will be dropped by either request queue
199 * exit or cgroup deletion path depending on who is exiting first.
237 */ 200 */
238 spin_lock_irqsave(&tg_stats_alloc_lock, flags); 201 atomic_set(&tg->ref, 1);
239 list_add(&tg->stats_alloc_node, &tg_stats_alloc_list);
240 schedule_delayed_work(&tg_stats_alloc_work, 0);
241 spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
242} 202}
243 203
244static void throtl_pd_exit(struct blkcg_gq *blkg) 204/* Should be called with rcu read lock held (needed for blkcg) */
205static void
206throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg)
245{ 207{
246 struct throtl_grp *tg = blkg_to_tg(blkg); 208 hlist_add_head(&tg->tg_node, &td->tg_list);
247 unsigned long flags; 209 td->nr_undestroyed_grps++;
210}
248 211
249 spin_lock_irqsave(&tg_stats_alloc_lock, flags); 212static void
250 list_del_init(&tg->stats_alloc_node); 213__throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
251 spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); 214{
215 struct backing_dev_info *bdi = &td->queue->backing_dev_info;
216 unsigned int major, minor;
217
218 if (!tg || tg->blkg.dev)
219 return;
252 220
253 free_percpu(tg->stats_cpu); 221 /*
222 * Fill in device details for a group which might not have been
223 * filled at group creation time as queue was being instantiated
224 * and driver had not attached a device yet
225 */
226 if (bdi->dev && dev_name(bdi->dev)) {
227 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
228 tg->blkg.dev = MKDEV(major, minor);
229 }
254} 230}
255 231
256static void throtl_pd_reset_stats(struct blkcg_gq *blkg) 232/*
233 * Should be called with without queue lock held. Here queue lock will be
234 * taken rarely. It will be taken only once during life time of a group
235 * if need be
236 */
237static void
238throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
257{ 239{
258 struct throtl_grp *tg = blkg_to_tg(blkg); 240 if (!tg || tg->blkg.dev)
259 int cpu;
260
261 if (tg->stats_cpu == NULL)
262 return; 241 return;
263 242
264 for_each_possible_cpu(cpu) { 243 spin_lock_irq(td->queue->queue_lock);
265 struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); 244 __throtl_tg_fill_dev_details(td, tg);
245 spin_unlock_irq(td->queue->queue_lock);
246}
247
248static void throtl_init_add_tg_lists(struct throtl_data *td,
249 struct throtl_grp *tg, struct blkio_cgroup *blkcg)
250{
251 __throtl_tg_fill_dev_details(td, tg);
252
253 /* Add group onto cgroup list */
254 blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td,
255 tg->blkg.dev, BLKIO_POLICY_THROTL);
266 256
267 blkg_rwstat_reset(&sc->service_bytes); 257 tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
268 blkg_rwstat_reset(&sc->serviced); 258 tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev);
259 tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev);
260 tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev);
261
262 throtl_add_group_to_td_list(td, tg);
263}
264
265/* Should be called without queue lock and outside of rcu period */
266static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td)
267{
268 struct throtl_grp *tg = NULL;
269 int ret;
270
271 tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
272 if (!tg)
273 return NULL;
274
275 ret = blkio_alloc_blkg_stats(&tg->blkg);
276
277 if (ret) {
278 kfree(tg);
279 return NULL;
269 } 280 }
281
282 throtl_init_group(tg);
283 return tg;
270} 284}
271 285
272static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td, 286static struct
273 struct blkcg *blkcg) 287throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg)
274{ 288{
289 struct throtl_grp *tg = NULL;
290 void *key = td;
291
275 /* 292 /*
276 * This is the common case when there are no blkcgs. Avoid lookup 293 * This is the common case when there are no blkio cgroups.
277 * in this case 294 * Avoid lookup in this case
278 */ 295 */
279 if (blkcg == &blkcg_root) 296 if (blkcg == &blkio_root_cgroup)
280 return td_root_tg(td); 297 tg = td->root_tg;
298 else
299 tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
281 300
282 return blkg_to_tg(blkg_lookup(blkcg, td->queue)); 301 __throtl_tg_fill_dev_details(td, tg);
302 return tg;
283} 303}
284 304
285static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td, 305/*
286 struct blkcg *blkcg) 306 * This function returns with queue lock unlocked in case of error, like
307 * request queue is no more
308 */
309static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
287{ 310{
311 struct throtl_grp *tg = NULL, *__tg = NULL;
312 struct blkio_cgroup *blkcg;
288 struct request_queue *q = td->queue; 313 struct request_queue *q = td->queue;
289 struct throtl_grp *tg = NULL;
290 314
315 rcu_read_lock();
316 blkcg = task_blkio_cgroup(current);
317 tg = throtl_find_tg(td, blkcg);
318 if (tg) {
319 rcu_read_unlock();
320 return tg;
321 }
322
323 /*
324 * Need to allocate a group. Allocation of group also needs allocation
325 * of per cpu stats which in-turn takes a mutex() and can block. Hence
326 * we need to drop rcu lock and queue_lock before we call alloc
327 *
328 * Take the request queue reference to make sure queue does not
329 * go away once we return from allocation.
330 */
331 blk_get_queue(q);
332 rcu_read_unlock();
333 spin_unlock_irq(q->queue_lock);
334
335 tg = throtl_alloc_tg(td);
291 /* 336 /*
292 * This is the common case when there are no blkcgs. Avoid lookup 337 * We might have slept in group allocation. Make sure queue is not
293 * in this case 338 * dead
294 */ 339 */
295 if (blkcg == &blkcg_root) { 340 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
296 tg = td_root_tg(td); 341 blk_put_queue(q);
297 } else { 342 if (tg)
298 struct blkcg_gq *blkg; 343 kfree(tg);
299 344
300 blkg = blkg_lookup_create(blkcg, q); 345 return ERR_PTR(-ENODEV);
301
302 /* if %NULL and @q is alive, fall back to root_tg */
303 if (!IS_ERR(blkg))
304 tg = blkg_to_tg(blkg);
305 else if (!blk_queue_dying(q))
306 tg = td_root_tg(td);
307 } 346 }
347 blk_put_queue(q);
348
349 /* Group allocated and queue is still alive. take the lock */
350 spin_lock_irq(q->queue_lock);
308 351
352 /*
353 * Initialize the new group. After sleeping, read the blkcg again.
354 */
355 rcu_read_lock();
356 blkcg = task_blkio_cgroup(current);
357
358 /*
359 * If some other thread already allocated the group while we were
360 * not holding queue lock, free up the group
361 */
362 __tg = throtl_find_tg(td, blkcg);
363
364 if (__tg) {
365 kfree(tg);
366 rcu_read_unlock();
367 return __tg;
368 }
369
370 /* Group allocation failed. Account the IO to root group */
371 if (!tg) {
372 tg = td->root_tg;
373 return tg;
374 }
375
376 throtl_init_add_tg_lists(td, tg, blkcg);
377 rcu_read_unlock();
309 return tg; 378 return tg;
310} 379}
311 380
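
The restored throtl_get_tg() above follows a familiar shape: look up the group under the queue lock, drop the lock (and RCU) because the allocation can sleep, then re-take the lock, redo the lookup, and throw the new group away if another thread won the race; if allocation failed outright, the IO is charged to the root group. A simplified sketch of that shape, in which the lookup and insert helpers are assumed and the queue-death check is omitted:

/* Illustrative sketch, not part of the patch: allocate outside the lock,
 * re-check under the lock, discard the duplicate on a race.  The table,
 * lookup_locked(), insert_locked() and alloc_group() are hypothetical.
 */
#include <pthread.h>
#include <stdlib.h>

struct group;
struct table {
	pthread_mutex_t lock;
	struct group *root;		/* fallback when allocation fails */
};

struct group *lookup_locked(struct table *t, int key);	/* assumed helpers */
void insert_locked(struct table *t, int key, struct group *g);
struct group *alloc_group(void);

static struct group *get_group(struct table *t, int key)
{
	struct group *g, *dup;

	pthread_mutex_lock(&t->lock);
	g = lookup_locked(t, key);
	pthread_mutex_unlock(&t->lock);
	if (g)
		return g;

	g = alloc_group();		/* may sleep; no locks held here */

	pthread_mutex_lock(&t->lock);
	dup = lookup_locked(t, key);	/* did someone beat us to it? */
	if (dup) {
		free(g);		/* lose the race gracefully */
		g = dup;
	} else if (!g) {
		g = t->root;		/* allocation failed: charge the root group */
	} else {
		insert_locked(t, key, g);
	}
	pthread_mutex_unlock(&t->lock);
	return g;
}
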
@@ -674,41 +743,16 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
674 return 0; 743 return 0;
675} 744}
676 745
677static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes,
678 int rw)
679{
680 struct throtl_grp *tg = blkg_to_tg(blkg);
681 struct tg_stats_cpu *stats_cpu;
682 unsigned long flags;
683
684 /* If per cpu stats are not allocated yet, don't do any accounting. */
685 if (tg->stats_cpu == NULL)
686 return;
687
688 /*
689 * Disabling interrupts to provide mutual exclusion between two
690 * writes on same cpu. It probably is not needed for 64bit. Not
691 * optimizing that case yet.
692 */
693 local_irq_save(flags);
694
695 stats_cpu = this_cpu_ptr(tg->stats_cpu);
696
697 blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
698 blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);
699
700 local_irq_restore(flags);
701}
702
703static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) 746static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
704{ 747{
705 bool rw = bio_data_dir(bio); 748 bool rw = bio_data_dir(bio);
749 bool sync = rw_is_sync(bio->bi_rw);
706 750
707 /* Charge the bio to the group */ 751 /* Charge the bio to the group */
708 tg->bytes_disp[rw] += bio->bi_size; 752 tg->bytes_disp[rw] += bio->bi_size;
709 tg->io_disp[rw]++; 753 tg->io_disp[rw]++;
710 754
711 throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw); 755 blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync);
712} 756}
713 757
714static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, 758static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
@@ -718,7 +762,7 @@ static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
718 762
719 bio_list_add(&tg->bio_lists[rw], bio); 763 bio_list_add(&tg->bio_lists[rw], bio);
720 /* Take a bio reference on tg */ 764 /* Take a bio reference on tg */
721 blkg_get(tg_to_blkg(tg)); 765 throtl_ref_get_tg(tg);
722 tg->nr_queued[rw]++; 766 tg->nr_queued[rw]++;
723 td->nr_queued[rw]++; 767 td->nr_queued[rw]++;
724 throtl_enqueue_tg(td, tg); 768 throtl_enqueue_tg(td, tg);
@@ -751,8 +795,8 @@ static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
751 795
752 bio = bio_list_pop(&tg->bio_lists[rw]); 796 bio = bio_list_pop(&tg->bio_lists[rw]);
753 tg->nr_queued[rw]--; 797 tg->nr_queued[rw]--;
754 /* Drop bio reference on blkg */ 798 /* Drop bio reference on tg */
755 blkg_put(tg_to_blkg(tg)); 799 throtl_put_tg(tg);
756 800
757 BUG_ON(td->nr_queued[rw] <= 0); 801 BUG_ON(td->nr_queued[rw] <= 0);
758 td->nr_queued[rw]--; 802 td->nr_queued[rw]--;
@@ -830,8 +874,8 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
830 874
831static void throtl_process_limit_change(struct throtl_data *td) 875static void throtl_process_limit_change(struct throtl_data *td)
832{ 876{
833 struct request_queue *q = td->queue; 877 struct throtl_grp *tg;
834 struct blkcg_gq *blkg, *n; 878 struct hlist_node *pos, *n;
835 879
836 if (!td->limits_changed) 880 if (!td->limits_changed)
837 return; 881 return;
@@ -840,9 +884,7 @@ static void throtl_process_limit_change(struct throtl_data *td)
840 884
841 throtl_log(td, "limits changed"); 885 throtl_log(td, "limits changed");
842 886
843 list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { 887 hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
844 struct throtl_grp *tg = blkg_to_tg(blkg);
845
846 if (!tg->limits_changed) 888 if (!tg->limits_changed)
847 continue; 889 continue;
848 890
@@ -929,164 +971,135 @@ throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
929 971
930 /* schedule work if limits changed even if no bio is queued */ 972 /* schedule work if limits changed even if no bio is queued */
931 if (total_nr_queued(td) || td->limits_changed) { 973 if (total_nr_queued(td) || td->limits_changed) {
932 mod_delayed_work(kthrotld_workqueue, dwork, delay); 974 /*
975 * We might have a work scheduled to be executed in future.
976 * Cancel that and schedule a new one.
977 */
978 __cancel_delayed_work(dwork);
979 queue_delayed_work(kthrotld_workqueue, dwork, delay);
933 throtl_log(td, "schedule work. delay=%lu jiffies=%lu", 980 throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
934 delay, jiffies); 981 delay, jiffies);
935 } 982 }
936} 983}
937 984
938static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, 985static void
939 struct blkg_policy_data *pd, int off) 986throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg)
940{ 987{
941 struct throtl_grp *tg = pd_to_tg(pd); 988 /* Something wrong if we are trying to remove same group twice */
942 struct blkg_rwstat rwstat = { }, tmp; 989 BUG_ON(hlist_unhashed(&tg->tg_node));
943 int i, cpu;
944 990
945 for_each_possible_cpu(cpu) { 991 hlist_del_init(&tg->tg_node);
946 struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
947
948 tmp = blkg_rwstat_read((void *)sc + off);
949 for (i = 0; i < BLKG_RWSTAT_NR; i++)
950 rwstat.cnt[i] += tmp.cnt[i];
951 }
952 992
953 return __blkg_prfill_rwstat(sf, pd, &rwstat); 993 /*
994 * Put the reference taken at the time of creation so that when all
995 * queues are gone, group can be destroyed.
996 */
997 throtl_put_tg(tg);
998 td->nr_undestroyed_grps--;
954} 999}
955 1000
956static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft, 1001static void throtl_release_tgs(struct throtl_data *td)
957 struct seq_file *sf)
958{ 1002{
959 struct blkcg *blkcg = cgroup_to_blkcg(cgrp); 1003 struct hlist_node *pos, *n;
1004 struct throtl_grp *tg;
960 1005
961 blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl, 1006 hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
962 cft->private, true); 1007 /*
963 return 0; 1008 * If cgroup removal path got to blk_group first and removed
1009 * it from cgroup list, then it will take care of destroying
1010 * cfqg also.
1011 */
1012 if (!blkiocg_del_blkio_group(&tg->blkg))
1013 throtl_destroy_tg(td, tg);
1014 }
964} 1015}
965 1016
966static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, 1017static void throtl_td_free(struct throtl_data *td)
967 int off)
968{ 1018{
969 struct throtl_grp *tg = pd_to_tg(pd); 1019 kfree(td);
970 u64 v = *(u64 *)((void *)tg + off);
971
972 if (v == -1)
973 return 0;
974 return __blkg_prfill_u64(sf, pd, v);
975} 1020}
976 1021
977static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd, 1022/*
978 int off) 1023 * Blk cgroup controller notification saying that blkio_group object is being
1024 * delinked as associated cgroup object is going away. That also means that
1025 * no new IO will come in this group. So get rid of this group as soon as
1026 * any pending IO in the group is finished.
1027 *
1028 * This function is called under rcu_read_lock(). key is the rcu protected
1029 * pointer. That means "key" is a valid throtl_data pointer as long as we are
1030 * rcu read lock.
1031 *
1032 * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
1033 * it should not be NULL as even if queue was going away, cgroup deletion
1034 * path got to it first.
1035 */
1036void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg)
979{ 1037{
980 struct throtl_grp *tg = pd_to_tg(pd); 1038 unsigned long flags;
981 unsigned int v = *(unsigned int *)((void *)tg + off); 1039 struct throtl_data *td = key;
982 1040
983 if (v == -1) 1041 spin_lock_irqsave(td->queue->queue_lock, flags);
984 return 0; 1042 throtl_destroy_tg(td, tg_of_blkg(blkg));
985 return __blkg_prfill_u64(sf, pd, v); 1043 spin_unlock_irqrestore(td->queue->queue_lock, flags);
986} 1044}
987 1045
988static int tg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft, 1046static void throtl_update_blkio_group_common(struct throtl_data *td,
989 struct seq_file *sf) 1047 struct throtl_grp *tg)
990{ 1048{
991 blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_u64, 1049 xchg(&tg->limits_changed, true);
992 &blkcg_policy_throtl, cft->private, false); 1050 xchg(&td->limits_changed, true);
993 return 0; 1051 /* Schedule a work now to process the limit change */
1052 throtl_schedule_delayed_work(td, 0);
994} 1053}
995 1054
996static int tg_print_conf_uint(struct cgroup *cgrp, struct cftype *cft, 1055/*
997 struct seq_file *sf) 1056 * For all update functions, key should be a valid pointer because these
1057 * update functions are called under blkcg_lock, that means, blkg is
1058 * valid and in turn key is valid. queue exit path can not race because
1059 * of blkcg_lock
1060 *
1061 * Can not take queue lock in update functions as queue lock under blkcg_lock
1062 * is not allowed. Under other paths we take blkcg_lock under queue_lock.
1063 */
1064static void throtl_update_blkio_group_read_bps(void *key,
1065 struct blkio_group *blkg, u64 read_bps)
998{ 1066{
999 blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_uint, 1067 struct throtl_data *td = key;
1000 &blkcg_policy_throtl, cft->private, false); 1068 struct throtl_grp *tg = tg_of_blkg(blkg);
1001 return 0; 1069
1070 tg->bps[READ] = read_bps;
1071 throtl_update_blkio_group_common(td, tg);
1002} 1072}
1003 1073
1004static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf, 1074static void throtl_update_blkio_group_write_bps(void *key,
1005 bool is_u64) 1075 struct blkio_group *blkg, u64 write_bps)
1006{ 1076{
1007 struct blkcg *blkcg = cgroup_to_blkcg(cgrp); 1077 struct throtl_data *td = key;
1008 struct blkg_conf_ctx ctx; 1078 struct throtl_grp *tg = tg_of_blkg(blkg);
1009 struct throtl_grp *tg;
1010 struct throtl_data *td;
1011 int ret;
1012
1013 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
1014 if (ret)
1015 return ret;
1016
1017 tg = blkg_to_tg(ctx.blkg);
1018 td = ctx.blkg->q->td;
1019
1020 if (!ctx.v)
1021 ctx.v = -1;
1022
1023 if (is_u64)
1024 *(u64 *)((void *)tg + cft->private) = ctx.v;
1025 else
1026 *(unsigned int *)((void *)tg + cft->private) = ctx.v;
1027
1028 /* XXX: we don't need the following deferred processing */
1029 xchg(&tg->limits_changed, true);
1030 xchg(&td->limits_changed, true);
1031 throtl_schedule_delayed_work(td, 0);
1032 1079
1033 blkg_conf_finish(&ctx); 1080 tg->bps[WRITE] = write_bps;
1034 return 0; 1081 throtl_update_blkio_group_common(td, tg);
1035} 1082}
1036 1083
1037static int tg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft, 1084static void throtl_update_blkio_group_read_iops(void *key,
1038 const char *buf) 1085 struct blkio_group *blkg, unsigned int read_iops)
1039{ 1086{
1040 return tg_set_conf(cgrp, cft, buf, true); 1087 struct throtl_data *td = key;
1088 struct throtl_grp *tg = tg_of_blkg(blkg);
1089
1090 tg->iops[READ] = read_iops;
1091 throtl_update_blkio_group_common(td, tg);
1041} 1092}
1042 1093
1043static int tg_set_conf_uint(struct cgroup *cgrp, struct cftype *cft, 1094static void throtl_update_blkio_group_write_iops(void *key,
1044 const char *buf) 1095 struct blkio_group *blkg, unsigned int write_iops)
1045{ 1096{
1046 return tg_set_conf(cgrp, cft, buf, false); 1097 struct throtl_data *td = key;
1047} 1098 struct throtl_grp *tg = tg_of_blkg(blkg);
1048 1099
1049static struct cftype throtl_files[] = { 1100 tg->iops[WRITE] = write_iops;
1050 { 1101 throtl_update_blkio_group_common(td, tg);
1051 .name = "throttle.read_bps_device", 1102}
1052 .private = offsetof(struct throtl_grp, bps[READ]),
1053 .read_seq_string = tg_print_conf_u64,
1054 .write_string = tg_set_conf_u64,
1055 .max_write_len = 256,
1056 },
1057 {
1058 .name = "throttle.write_bps_device",
1059 .private = offsetof(struct throtl_grp, bps[WRITE]),
1060 .read_seq_string = tg_print_conf_u64,
1061 .write_string = tg_set_conf_u64,
1062 .max_write_len = 256,
1063 },
1064 {
1065 .name = "throttle.read_iops_device",
1066 .private = offsetof(struct throtl_grp, iops[READ]),
1067 .read_seq_string = tg_print_conf_uint,
1068 .write_string = tg_set_conf_uint,
1069 .max_write_len = 256,
1070 },
1071 {
1072 .name = "throttle.write_iops_device",
1073 .private = offsetof(struct throtl_grp, iops[WRITE]),
1074 .read_seq_string = tg_print_conf_uint,
1075 .write_string = tg_set_conf_uint,
1076 .max_write_len = 256,
1077 },
1078 {
1079 .name = "throttle.io_service_bytes",
1080 .private = offsetof(struct tg_stats_cpu, service_bytes),
1081 .read_seq_string = tg_print_cpu_rwstat,
1082 },
1083 {
1084 .name = "throttle.io_serviced",
1085 .private = offsetof(struct tg_stats_cpu, serviced),
1086 .read_seq_string = tg_print_cpu_rwstat,
1087 },
1088 { } /* terminate */
1089};
1090 1103
1091static void throtl_shutdown_wq(struct request_queue *q) 1104static void throtl_shutdown_wq(struct request_queue *q)
1092{ 1105{
@@ -1095,26 +1108,32 @@ static void throtl_shutdown_wq(struct request_queue *q)
1095 cancel_delayed_work_sync(&td->throtl_work); 1108 cancel_delayed_work_sync(&td->throtl_work);
1096} 1109}
1097 1110
1098static struct blkcg_policy blkcg_policy_throtl = { 1111static struct blkio_policy_type blkio_policy_throtl = {
1099 .pd_size = sizeof(struct throtl_grp), 1112 .ops = {
1100 .cftypes = throtl_files, 1113 .blkio_unlink_group_fn = throtl_unlink_blkio_group,
1101 1114 .blkio_update_group_read_bps_fn =
1102 .pd_init_fn = throtl_pd_init, 1115 throtl_update_blkio_group_read_bps,
1103 .pd_exit_fn = throtl_pd_exit, 1116 .blkio_update_group_write_bps_fn =
1104 .pd_reset_stats_fn = throtl_pd_reset_stats, 1117 throtl_update_blkio_group_write_bps,
1118 .blkio_update_group_read_iops_fn =
1119 throtl_update_blkio_group_read_iops,
1120 .blkio_update_group_write_iops_fn =
1121 throtl_update_blkio_group_write_iops,
1122 },
1123 .plid = BLKIO_POLICY_THROTL,
1105}; 1124};
1106 1125
1107bool blk_throtl_bio(struct request_queue *q, struct bio *bio) 1126int blk_throtl_bio(struct request_queue *q, struct bio **biop)
1108{ 1127{
1109 struct throtl_data *td = q->td; 1128 struct throtl_data *td = q->td;
1110 struct throtl_grp *tg; 1129 struct throtl_grp *tg;
1130 struct bio *bio = *biop;
1111 bool rw = bio_data_dir(bio), update_disptime = true; 1131 bool rw = bio_data_dir(bio), update_disptime = true;
1112 struct blkcg *blkcg; 1132 struct blkio_cgroup *blkcg;
1113 bool throttled = false;
1114 1133
1115 if (bio->bi_rw & REQ_THROTTLED) { 1134 if (bio->bi_rw & REQ_THROTTLED) {
1116 bio->bi_rw &= ~REQ_THROTTLED; 1135 bio->bi_rw &= ~REQ_THROTTLED;
1117 goto out; 1136 return 0;
1118 } 1137 }
1119 1138
1120 /* 1139 /*
@@ -1122,25 +1141,38 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
1122 * basic fields like stats and io rates. If a group has no rules, 1141 * basic fields like stats and io rates. If a group has no rules,
1123 * just update the dispatch stats in lockless manner and return. 1142 * just update the dispatch stats in lockless manner and return.
1124 */ 1143 */
1144
1125 rcu_read_lock(); 1145 rcu_read_lock();
1126 blkcg = bio_blkcg(bio); 1146 blkcg = task_blkio_cgroup(current);
1127 tg = throtl_lookup_tg(td, blkcg); 1147 tg = throtl_find_tg(td, blkcg);
1128 if (tg) { 1148 if (tg) {
1149 throtl_tg_fill_dev_details(td, tg);
1150
1129 if (tg_no_rule_group(tg, rw)) { 1151 if (tg_no_rule_group(tg, rw)) {
1130 throtl_update_dispatch_stats(tg_to_blkg(tg), 1152 blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size,
1131 bio->bi_size, bio->bi_rw); 1153 rw, rw_is_sync(bio->bi_rw));
1132 goto out_unlock_rcu; 1154 rcu_read_unlock();
1155 return 0;
1133 } 1156 }
1134 } 1157 }
1158 rcu_read_unlock();
1135 1159
1136 /* 1160 /*
1137 * Either group has not been allocated yet or it is not an unlimited 1161 * Either group has not been allocated yet or it is not an unlimited
1138 * IO group 1162 * IO group
1139 */ 1163 */
1164
1140 spin_lock_irq(q->queue_lock); 1165 spin_lock_irq(q->queue_lock);
1141 tg = throtl_lookup_create_tg(td, blkcg); 1166 tg = throtl_get_tg(td);
1142 if (unlikely(!tg)) 1167
1143 goto out_unlock; 1168 if (IS_ERR(tg)) {
1169 if (PTR_ERR(tg) == -ENODEV) {
1170 /*
1171 * Queue is gone. No queue lock held here.
1172 */
1173 return -ENODEV;
1174 }
1175 }
1144 1176
1145 if (tg->nr_queued[rw]) { 1177 if (tg->nr_queued[rw]) {
1146 /* 1178 /*
@@ -1168,7 +1200,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
1168 * So keep on trimming slice even if bio is not queued. 1200 * So keep on trimming slice even if bio is not queued.
1169 */ 1201 */
1170 throtl_trim_slice(td, tg, rw); 1202 throtl_trim_slice(td, tg, rw);
1171 goto out_unlock; 1203 goto out;
1172 } 1204 }
1173 1205
1174queue_bio: 1206queue_bio:
@@ -1179,87 +1211,92 @@ queue_bio:
1179 tg->io_disp[rw], tg->iops[rw], 1211 tg->io_disp[rw], tg->iops[rw],
1180 tg->nr_queued[READ], tg->nr_queued[WRITE]); 1212 tg->nr_queued[READ], tg->nr_queued[WRITE]);
1181 1213
1182 bio_associate_current(bio);
1183 throtl_add_bio_tg(q->td, tg, bio); 1214 throtl_add_bio_tg(q->td, tg, bio);
1184 throttled = true; 1215 *biop = NULL;
1185 1216
1186 if (update_disptime) { 1217 if (update_disptime) {
1187 tg_update_disptime(td, tg); 1218 tg_update_disptime(td, tg);
1188 throtl_schedule_next_dispatch(td); 1219 throtl_schedule_next_dispatch(td);
1189 } 1220 }
1190 1221
1191out_unlock:
1192 spin_unlock_irq(q->queue_lock);
1193out_unlock_rcu:
1194 rcu_read_unlock();
1195out: 1222out:
1196 return throttled;
1197}
1198
1199/**
1200 * blk_throtl_drain - drain throttled bios
1201 * @q: request_queue to drain throttled bios for
1202 *
1203 * Dispatch all currently throttled bios on @q through ->make_request_fn().
1204 */
1205void blk_throtl_drain(struct request_queue *q)
1206 __releases(q->queue_lock) __acquires(q->queue_lock)
1207{
1208 struct throtl_data *td = q->td;
1209 struct throtl_rb_root *st = &td->tg_service_tree;
1210 struct throtl_grp *tg;
1211 struct bio_list bl;
1212 struct bio *bio;
1213
1214 queue_lockdep_assert_held(q);
1215
1216 bio_list_init(&bl);
1217
1218 while ((tg = throtl_rb_first(st))) {
1219 throtl_dequeue_tg(td, tg);
1220
1221 while ((bio = bio_list_peek(&tg->bio_lists[READ])))
1222 tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
1223 while ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
1224 tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
1225 }
1226 spin_unlock_irq(q->queue_lock); 1223 spin_unlock_irq(q->queue_lock);
1227 1224 return 0;
1228 while ((bio = bio_list_pop(&bl)))
1229 generic_make_request(bio);
1230
1231 spin_lock_irq(q->queue_lock);
1232} 1225}
1233 1226
1234int blk_throtl_init(struct request_queue *q) 1227int blk_throtl_init(struct request_queue *q)
1235{ 1228{
1236 struct throtl_data *td; 1229 struct throtl_data *td;
1237 int ret; 1230 struct throtl_grp *tg;
1238 1231
1239 td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); 1232 td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
1240 if (!td) 1233 if (!td)
1241 return -ENOMEM; 1234 return -ENOMEM;
1242 1235
1236 INIT_HLIST_HEAD(&td->tg_list);
1243 td->tg_service_tree = THROTL_RB_ROOT; 1237 td->tg_service_tree = THROTL_RB_ROOT;
1244 td->limits_changed = false; 1238 td->limits_changed = false;
1245 INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); 1239 INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
1246 1240
1247 q->td = td; 1241 /* alloc and Init root group. */
1248 td->queue = q; 1242 td->queue = q;
1243 tg = throtl_alloc_tg(td);
1249 1244
1250 /* activate policy */ 1245 if (!tg) {
1251 ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
1252 if (ret)
1253 kfree(td); 1246 kfree(td);
1254 return ret; 1247 return -ENOMEM;
1248 }
1249
1250 td->root_tg = tg;
1251
1252 rcu_read_lock();
1253 throtl_init_add_tg_lists(td, tg, &blkio_root_cgroup);
1254 rcu_read_unlock();
1255
1256 /* Attach throtl data to request queue */
1257 q->td = td;
1258 return 0;
1255} 1259}
1256 1260
1257void blk_throtl_exit(struct request_queue *q) 1261void blk_throtl_exit(struct request_queue *q)
1258{ 1262{
1259 BUG_ON(!q->td); 1263 struct throtl_data *td = q->td;
1264 bool wait = false;
1265
1266 BUG_ON(!td);
1267
1260 throtl_shutdown_wq(q); 1268 throtl_shutdown_wq(q);
1261 blkcg_deactivate_policy(q, &blkcg_policy_throtl); 1269
1262 kfree(q->td); 1270 spin_lock_irq(q->queue_lock);
1271 throtl_release_tgs(td);
1272
1273 /* If there are other groups */
1274 if (td->nr_undestroyed_grps > 0)
1275 wait = true;
1276
1277 spin_unlock_irq(q->queue_lock);
1278
1279 /*
1280 * Wait for tg->blkg->key accessors to exit their grace periods.
1281 * Do this wait only if there are other undestroyed groups out
1282 * there (other than root group). This can happen if cgroup deletion
1283 * path claimed the responsibility of cleaning up a group before
1284 * queue cleanup code get to the group.
1285 *
1286 * Do not call synchronize_rcu() unconditionally as there are drivers
1287 * which create/delete request queue hundreds of times during scan/boot
1288 * and synchronize_rcu() can take significant time and slow down boot.
1289 */
1290 if (wait)
1291 synchronize_rcu();
1292
1293 /*
1294 * Just being safe to make sure after previous flush if some body did
1295 * update limits through cgroup and another work got queued, cancel
1296 * it.
1297 */
1298 throtl_shutdown_wq(q);
1299 throtl_td_free(td);
1263} 1300}
1264 1301
1265static int __init throtl_init(void) 1302static int __init throtl_init(void)
@@ -1268,7 +1305,8 @@ static int __init throtl_init(void)
1268 if (!kthrotld_workqueue) 1305 if (!kthrotld_workqueue)
1269 panic("Failed to create kthrotld\n"); 1306 panic("Failed to create kthrotld\n");
1270 1307
1271 return blkcg_policy_register(&blkcg_policy_throtl); 1308 blkio_policy_register(&blkio_policy_throtl);
1309 return 0;
1272} 1310}
1273 1311
1274module_init(throtl_init); 1312module_init(throtl_init);
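
Taken together, the blk-throttle.c hunks reinstate group lifetime management built on an atomic reference count plus an RCU-deferred free (throtl_ref_get_tg(), throtl_put_tg(), throtl_free_tg()): the group starts with one joint queue/cgroup reference, each queued bio takes another, and the last put schedules the free through call_rcu() so lockless readers of per-group fields can finish. A rough userspace model of that counting, assuming C11 atomics; the one simplification is that the final free happens immediately rather than after a grace period:

/* Illustrative sketch, not part of the patch: reference counting as in
 * throtl_grp.  The kernel defers the final kfree() through call_rcu();
 * here the free is immediate, which is the only simplification.
 */
#include <stdatomic.h>
#include <stdlib.h>

struct group {
	atomic_int ref;
	/* rate limits, queued bios, ... */
};

static struct group *group_alloc(void)
{
	struct group *g = calloc(1, sizeof(*g));

	if (g)
		atomic_store(&g->ref, 1);	/* joint queue/cgroup reference */
	return g;
}

static struct group *group_get(struct group *g)
{
	atomic_fetch_add(&g->ref, 1);		/* e.g. one per queued bio */
	return g;
}

static void group_put(struct group *g)
{
	if (atomic_fetch_sub(&g->ref, 1) == 1)	/* dropped to zero */
		free(g);			/* kernel: call_rcu(&tg->rcu_head, throtl_free_tg) */
}
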
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 6e4744cbfb5..78035488895 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -197,3 +197,44 @@ void blk_add_timer(struct request *req)
197 mod_timer(&q->timeout, expiry); 197 mod_timer(&q->timeout, expiry);
198} 198}
199 199
200/**
201 * blk_abort_queue -- Abort all requests on given queue
202 * @queue: pointer to queue
203 *
204 */
205void blk_abort_queue(struct request_queue *q)
206{
207 unsigned long flags;
208 struct request *rq, *tmp;
209 LIST_HEAD(list);
210
211 /*
212 * Not a request based block device, nothing to abort
213 */
214 if (!q->request_fn)
215 return;
216
217 spin_lock_irqsave(q->queue_lock, flags);
218
219 elv_abort_queue(q);
220
221 /*
222 * Splice entries to local list, to avoid deadlocking if entries
223 * get readded to the timeout list by error handling
224 */
225 list_splice_init(&q->timeout_list, &list);
226
227 list_for_each_entry_safe(rq, tmp, &list, timeout_list)
228 blk_abort_request(rq);
229
230 /*
231 * Occasionally, blk_abort_request() will return without
232 * deleting the element from the list. Make sure we add those back
233 * instead of leaving them on the local stack list.
234 */
235 list_splice(&list, &q->timeout_list);
236
237 spin_unlock_irqrestore(q->queue_lock, flags);
238
239}
240EXPORT_SYMBOL_GPL(blk_abort_queue);
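
blk_abort_queue() above splices the timeout list onto a stack-local list before walking it, so error handlers that re-add requests to q->timeout_list cannot turn the walk into an endless loop, and whatever survives the walk is spliced back afterwards. A simplified sketch of the detach-then-walk idiom; here the walk runs without the lock and nothing is spliced back, and all types are hypothetical:

/* Illustrative sketch, not part of the patch: detach the whole list under
 * the lock, then walk the private copy so re-queued entries land on the
 * shared list instead of the one being walked.
 */
#include <pthread.h>
#include <stddef.h>

struct node {
	struct node *next;
};

struct timeout_list {
	pthread_mutex_t lock;
	struct node *head;
};

static void abort_all(struct timeout_list *tl, void (*abort_one)(struct node *))
{
	struct node *local, *n;

	pthread_mutex_lock(&tl->lock);
	local = tl->head;		/* detach everything currently queued */
	tl->head = NULL;
	pthread_mutex_unlock(&tl->lock);

	while ((n = local) != NULL) {
		local = n->next;
		abort_one(n);		/* may re-queue n onto tl->head */
	}
}
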
diff --git a/block/blk.h b/block/blk.h
index 47fdfdd4152..20b900a377c 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -1,8 +1,6 @@
1#ifndef BLK_INTERNAL_H 1#ifndef BLK_INTERNAL_H
2#define BLK_INTERNAL_H 2#define BLK_INTERNAL_H
3 3
4#include <linux/idr.h>
5
6/* Amount of time in which a process may batch requests */ 4/* Amount of time in which a process may batch requests */
7#define BLK_BATCH_TIME (HZ/50UL) 5#define BLK_BATCH_TIME (HZ/50UL)
8 6
@@ -11,23 +9,12 @@
11 9
12extern struct kmem_cache *blk_requestq_cachep; 10extern struct kmem_cache *blk_requestq_cachep;
13extern struct kobj_type blk_queue_ktype; 11extern struct kobj_type blk_queue_ktype;
14extern struct ida blk_queue_ida;
15
16static inline void __blk_get_queue(struct request_queue *q)
17{
18 kobject_get(&q->kobj);
19}
20 12
21int blk_init_rl(struct request_list *rl, struct request_queue *q,
22 gfp_t gfp_mask);
23void blk_exit_rl(struct request_list *rl);
24void init_request_from_bio(struct request *req, struct bio *bio); 13void init_request_from_bio(struct request *req, struct bio *bio);
25void blk_rq_bio_prep(struct request_queue *q, struct request *rq, 14void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
26 struct bio *bio); 15 struct bio *bio);
27int blk_rq_append_bio(struct request_queue *q, struct request *rq, 16int blk_rq_append_bio(struct request_queue *q, struct request *rq,
28 struct bio *bio); 17 struct bio *bio);
29void blk_queue_bypass_start(struct request_queue *q);
30void blk_queue_bypass_end(struct request_queue *q);
31void blk_dequeue_request(struct request *rq); 18void blk_dequeue_request(struct request *rq);
32void __blk_queue_free_tags(struct request_queue *q); 19void __blk_queue_free_tags(struct request_queue *q);
33bool __blk_end_bidi_request(struct request *rq, int error, 20bool __blk_end_bidi_request(struct request *rq, int error,
@@ -36,6 +23,7 @@ bool __blk_end_bidi_request(struct request *rq, int error,
36void blk_rq_timed_out_timer(unsigned long data); 23void blk_rq_timed_out_timer(unsigned long data);
37void blk_delete_timer(struct request *); 24void blk_delete_timer(struct request *);
38void blk_add_timer(struct request *); 25void blk_add_timer(struct request *);
26void __generic_unplug_device(struct request_queue *);
39 27
40/* 28/*
41 * Internal atomic flags for request handling 29 * Internal atomic flags for request handling
@@ -96,8 +84,8 @@ static inline struct request *__elv_next_request(struct request_queue *q)
96 q->flush_queue_delayed = 1; 84 q->flush_queue_delayed = 1;
97 return NULL; 85 return NULL;
98 } 86 }
99 if (unlikely(blk_queue_dying(q)) || 87 if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags) ||
100 !q->elevator->type->ops.elevator_dispatch_fn(q, 0)) 88 !q->elevator->ops->elevator_dispatch_fn(q, 0))
101 return NULL; 89 return NULL;
102 } 90 }
103} 91}
@@ -106,16 +94,16 @@ static inline void elv_activate_rq(struct request_queue *q, struct request *rq)
106{ 94{
107 struct elevator_queue *e = q->elevator; 95 struct elevator_queue *e = q->elevator;
108 96
109 if (e->type->ops.elevator_activate_req_fn) 97 if (e->ops->elevator_activate_req_fn)
110 e->type->ops.elevator_activate_req_fn(q, rq); 98 e->ops->elevator_activate_req_fn(q, rq);
111} 99}
112 100
113static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq) 101static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq)
114{ 102{
115 struct elevator_queue *e = q->elevator; 103 struct elevator_queue *e = q->elevator;
116 104
117 if (e->type->ops.elevator_deactivate_req_fn) 105 if (e->ops->elevator_deactivate_req_fn)
118 e->type->ops.elevator_deactivate_req_fn(q, rq); 106 e->ops->elevator_deactivate_req_fn(q, rq);
119} 107}
120 108
121#ifdef CONFIG_FAIL_IO_TIMEOUT 109#ifdef CONFIG_FAIL_IO_TIMEOUT
@@ -130,6 +118,8 @@ static inline int blk_should_fake_timeout(struct request_queue *q)
130} 118}
131#endif 119#endif
132 120
121struct io_context *current_io_context(gfp_t gfp_flags, int node);
122
133int ll_back_merge_fn(struct request_queue *q, struct request *req, 123int ll_back_merge_fn(struct request_queue *q, struct request *req,
134 struct bio *bio); 124 struct bio *bio);
135int ll_front_merge_fn(struct request_queue *q, struct request *req, 125int ll_front_merge_fn(struct request_queue *q, struct request *req,
@@ -140,15 +130,14 @@ int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
140 struct request *next); 130 struct request *next);
141void blk_recalc_rq_segments(struct request *rq); 131void blk_recalc_rq_segments(struct request *rq);
142void blk_rq_set_mixed_merge(struct request *rq); 132void blk_rq_set_mixed_merge(struct request *rq);
143bool blk_rq_merge_ok(struct request *rq, struct bio *bio);
144int blk_try_merge(struct request *rq, struct bio *bio);
145 133
146void blk_queue_congestion_threshold(struct request_queue *q); 134void blk_queue_congestion_threshold(struct request_queue *q);
147 135
148void __blk_run_queue_uncond(struct request_queue *q);
149
150int blk_dev_init(void); 136int blk_dev_init(void);
151 137
138void elv_quiesce_start(struct request_queue *q);
139void elv_quiesce_end(struct request_queue *q);
140
152 141
153/* 142/*
154 * Return the threshold (number of used requests) at which the queue is 143 * Return the threshold (number of used requests) at which the queue is
@@ -168,67 +157,35 @@ static inline int queue_congestion_off_threshold(struct request_queue *q)
168 return q->nr_congestion_off; 157 return q->nr_congestion_off;
169} 158}
170 159
160static inline int blk_cpu_to_group(int cpu)
161{
162 int group = NR_CPUS;
163#ifdef CONFIG_SCHED_MC
164 const struct cpumask *mask = cpu_coregroup_mask(cpu);
165 group = cpumask_first(mask);
166#elif defined(CONFIG_SCHED_SMT)
167 group = cpumask_first(topology_thread_cpumask(cpu));
168#else
169 return cpu;
170#endif
171 if (likely(group < NR_CPUS))
172 return group;
173 return cpu;
174}
175
171/* 176/*
172 * Contribute to IO statistics IFF: 177 * Contribute to IO statistics IFF:
173 * 178 *
174 * a) it's attached to a gendisk, and 179 * a) it's attached to a gendisk, and
175 * b) the queue had IO stats enabled when this request was started, and 180 * b) the queue had IO stats enabled when this request was started, and
176 * c) it's a file system request 181 * c) it's a file system request or a discard request
177 */ 182 */
178static inline int blk_do_io_stat(struct request *rq) 183static inline int blk_do_io_stat(struct request *rq)
179{ 184{
180 return rq->rq_disk && 185 return rq->rq_disk &&
181 (rq->cmd_flags & REQ_IO_STAT) && 186 (rq->cmd_flags & REQ_IO_STAT) &&
182 (rq->cmd_type == REQ_TYPE_FS); 187 (rq->cmd_type == REQ_TYPE_FS ||
183} 188 (rq->cmd_flags & REQ_DISCARD));
184
185/*
186 * Internal io_context interface
187 */
188void get_io_context(struct io_context *ioc);
189struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q);
190struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
191 gfp_t gfp_mask);
192void ioc_clear_queue(struct request_queue *q);
193
194int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node);
195
196/**
197 * create_io_context - try to create task->io_context
198 * @gfp_mask: allocation mask
199 * @node: allocation node
200 *
201 * If %current->io_context is %NULL, allocate a new io_context and install
202 * it. Returns the current %current->io_context which may be %NULL if
203 * allocation failed.
204 *
205 * Note that this function can't be called with IRQ disabled because
206 * task_lock which protects %current->io_context is IRQ-unsafe.
207 */
208static inline struct io_context *create_io_context(gfp_t gfp_mask, int node)
209{
210 WARN_ON_ONCE(irqs_disabled());
211 if (unlikely(!current->io_context))
212 create_task_io_context(current, gfp_mask, node);
213 return current->io_context;
214} 189}
215 190
216/* 191#endif
217 * Internal throttling interface
218 */
219#ifdef CONFIG_BLK_DEV_THROTTLING
220extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio);
221extern void blk_throtl_drain(struct request_queue *q);
222extern int blk_throtl_init(struct request_queue *q);
223extern void blk_throtl_exit(struct request_queue *q);
224#else /* CONFIG_BLK_DEV_THROTTLING */
225static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
226{
227 return false;
228}
229static inline void blk_throtl_drain(struct request_queue *q) { }
230static inline int blk_throtl_init(struct request_queue *q) { return 0; }
231static inline void blk_throtl_exit(struct request_queue *q) { }
232#endif /* CONFIG_BLK_DEV_THROTTLING */
233
234#endif /* BLK_INTERNAL_H */
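
Among the blk.h changes, blk_cpu_to_group() returns: it reports the first CPU of the core or SMT sibling mask as the completion group and falls back to the CPU itself when no usable topology information exists. A toy illustration of that selection, with a plain 64-bit bitmap standing in for a cpumask and all names hypothetical:

/* Illustrative sketch, not part of the patch: pick the first CPU in a
 * sibling mask as the group id, or the CPU itself when the mask is empty.
 * Uses a GCC/Clang builtin for "find first set bit".
 */
#include <stdint.h>

#define MAX_CPUS 64

static int cpu_to_group(int cpu, uint64_t sibling_mask)
{
	int group;

	if (!sibling_mask)
		return cpu;			/* no topology info: group == cpu */
	group = __builtin_ctzll(sibling_mask);	/* first CPU in the mask */
	return group < MAX_CPUS ? group : cpu;	/* mirrors the NR_CPUS check */
}
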
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index 650f427d915..6690e6e4103 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -25,7 +25,7 @@
25#include <linux/delay.h> 25#include <linux/delay.h>
26#include <linux/scatterlist.h> 26#include <linux/scatterlist.h>
27#include <linux/bsg-lib.h> 27#include <linux/bsg-lib.h>
28#include <linux/export.h> 28#include <linux/module.h>
29#include <scsi/scsi_cmnd.h> 29#include <scsi/scsi_cmnd.h>
30 30
31/** 31/**
@@ -151,6 +151,19 @@ failjob_rls_job:
151 return -ENOMEM; 151 return -ENOMEM;
152} 152}
153 153
154/*
155 * bsg_goose_queue - restart queue in case it was stopped
156 * @q: request q to be restarted
157 */
158void bsg_goose_queue(struct request_queue *q)
159{
160 if (!q)
161 return;
162
163 blk_run_queue_async(q);
164}
165EXPORT_SYMBOL_GPL(bsg_goose_queue);
166
154/** 167/**
155 * bsg_request_fn - generic handler for bsg requests 168 * bsg_request_fn - generic handler for bsg requests
156 * @q: request queue to manage 169 * @q: request queue to manage
@@ -230,3 +243,56 @@ int bsg_setup_queue(struct device *dev, struct request_queue *q,
230 return 0; 243 return 0;
231} 244}
232EXPORT_SYMBOL_GPL(bsg_setup_queue); 245EXPORT_SYMBOL_GPL(bsg_setup_queue);
246
247/**
248 * bsg_remove_queue - Deletes the bsg dev from the q
249 * @q: the request_queue that is to be torn down.
250 *
251 * Notes:
252 * Before unregistering the queue empty any requests that are blocked
253 */
254void bsg_remove_queue(struct request_queue *q)
255{
256 struct request *req; /* block request */
257 int counts; /* totals for request_list count and starved */
258
259 if (!q)
260 return;
261
262 /* Stop taking in new requests */
263 spin_lock_irq(q->queue_lock);
264 blk_stop_queue(q);
265
266 /* drain all requests in the queue */
267 while (1) {
268 /* need the lock to fetch a request
269 * this may fetch the same request as the previous pass
270 */
271 req = blk_fetch_request(q);
272 /* save requests in use and starved */
273 counts = q->rq.count[0] + q->rq.count[1] +
274 q->rq.starved[0] + q->rq.starved[1];
275 spin_unlock_irq(q->queue_lock);
276 /* any requests still outstanding? */
277 if (counts == 0)
278 break;
279
280 /* This may be the same req as the previous iteration,
281 * always send the blk_end_request_all after a prefetch.
282 * It is not okay to not end the request because the
283 * prefetch started the request.
284 */
285 if (req) {
286 /* return -ENXIO to indicate that this queue is
287 * going away
288 */
289 req->errors = -ENXIO;
290 blk_end_request_all(req, -ENXIO);
291 }
292
293 msleep(200); /* allow bsg to possibly finish */
294 spin_lock_irq(q->queue_lock);
295 }
296 bsg_unregister_queue(q);
297}
298EXPORT_SYMBOL_GPL(bsg_remove_queue);
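
bsg_remove_queue() above drains a stopped queue by repeatedly fetching a request, failing it with -ENXIO, and sleeping until nothing remains either in use or starved. A compressed sketch of that drain loop, with a hypothetical queue type whose single outstanding counter stands in for the request_list counts:

/* Illustrative sketch, not part of the patch: drain loop in the shape of
 * bsg_remove_queue().  The queue type and helpers are hypothetical.
 */
#include <errno.h>
#include <pthread.h>
#include <unistd.h>

struct req;
struct toy_queue {
	pthread_mutex_t lock;
	int outstanding;		/* queued + starved requests */
};

struct req *fetch_request_locked(struct toy_queue *q);	/* assumed helpers */
void fail_request(struct req *r, int error);

static void drain_queue(struct toy_queue *q)
{
	struct req *r;
	int left;

	for (;;) {
		pthread_mutex_lock(&q->lock);
		r = fetch_request_locked(q);
		left = q->outstanding;
		pthread_mutex_unlock(&q->lock);

		if (!left)
			break;			/* nothing queued or in flight */
		if (r)
			fail_request(r, -ENXIO); /* the queue is going away */
		usleep(200 * 1000);		/* let in-flight work finish */
	}
}
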
diff --git a/block/bsg.c b/block/bsg.c
index ff64ae3bace..702f1316bb8 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -769,10 +769,12 @@ static struct bsg_device *bsg_add_device(struct inode *inode,
769 struct file *file) 769 struct file *file)
770{ 770{
771 struct bsg_device *bd; 771 struct bsg_device *bd;
772 int ret;
772#ifdef BSG_DEBUG 773#ifdef BSG_DEBUG
773 unsigned char buf[32]; 774 unsigned char buf[32];
774#endif 775#endif
775 if (!blk_get_queue(rq)) 776 ret = blk_get_queue(rq);
777 if (ret)
776 return ERR_PTR(-ENXIO); 778 return ERR_PTR(-ENXIO);
777 779
778 bd = bsg_alloc_device(); 780 bd = bsg_alloc_device();
@@ -983,8 +985,7 @@ void bsg_unregister_queue(struct request_queue *q)
983 985
984 mutex_lock(&bsg_mutex); 986 mutex_lock(&bsg_mutex);
985 idr_remove(&bsg_minor_idr, bcd->minor); 987 idr_remove(&bsg_minor_idr, bcd->minor);
986 if (q->kobj.sd) 988 sysfs_remove_link(&q->kobj, "bsg");
987 sysfs_remove_link(&q->kobj, "bsg");
988 device_unregister(bcd->class_dev); 989 device_unregister(bcd->class_dev);
989 bcd->class_dev = NULL; 990 bcd->class_dev = NULL;
990 kref_put(&bcd->ref, bsg_kref_release_function); 991 kref_put(&bcd->ref, bsg_kref_release_function);
@@ -1069,7 +1070,7 @@ EXPORT_SYMBOL_GPL(bsg_register_queue);
1069 1070
1070static struct cdev bsg_cdev; 1071static struct cdev bsg_cdev;
1071 1072
1072static char *bsg_devnode(struct device *dev, umode_t *mode) 1073static char *bsg_devnode(struct device *dev, mode_t *mode)
1073{ 1074{
1074 return kasprintf(GFP_KERNEL, "bsg/%s", dev_name(dev)); 1075 return kasprintf(GFP_KERNEL, "bsg/%s", dev_name(dev));
1075} 1076}
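
The bsg_add_device() hunk adapts to the older blk_get_queue() convention visible in this tree, where the function returns 0 on success and non-zero once the queue is already going away, instead of a boolean. A tiny sketch of a try-get with those semantics, using C11 atomics and hypothetical names:

/* Illustrative sketch, not part of the patch: a try-get that returns 0 on
 * success and non-zero when the object is already being torn down.
 */
#include <stdatomic.h>
#include <stdbool.h>

struct object {
	atomic_int refs;
	atomic_bool dying;
};

static int object_try_get(struct object *obj)
{
	if (atomic_load(&obj->dying))
		return -1;		/* caller maps this to -ENXIO */
	atomic_fetch_add(&obj->refs, 1);
	return 0;
}
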
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index e62e9205b80..4c12869fcf7 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -14,8 +14,7 @@
14#include <linux/rbtree.h> 14#include <linux/rbtree.h>
15#include <linux/ioprio.h> 15#include <linux/ioprio.h>
16#include <linux/blktrace_api.h> 16#include <linux/blktrace_api.h>
17#include "blk.h" 17#include "cfq.h"
18#include "blk-cgroup.h"
19 18
20/* 19/*
21 * tunables 20 * tunables
@@ -54,11 +53,20 @@ static const int cfq_hist_divisor = 4;
54#define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32) 53#define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
55#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) 54#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8)
56 55
57#define RQ_CIC(rq) icq_to_cic((rq)->elv.icq) 56#define RQ_CIC(rq) \
58#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elv.priv[0]) 57 ((struct cfq_io_context *) (rq)->elevator_private[0])
59#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elv.priv[1]) 58#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private[1])
59#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private[2])
60 60
61static struct kmem_cache *cfq_pool; 61static struct kmem_cache *cfq_pool;
62static struct kmem_cache *cfq_ioc_pool;
63
64static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
65static struct completion *ioc_gone;
66static DEFINE_SPINLOCK(ioc_gone_lock);
67
68static DEFINE_SPINLOCK(cic_index_lock);
69static DEFINE_IDA(cic_index_ida);
62 70
63#define CFQ_PRIO_LISTS IOPRIO_BE_NR 71#define CFQ_PRIO_LISTS IOPRIO_BE_NR
64#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) 72#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
@@ -67,14 +75,6 @@ static struct kmem_cache *cfq_pool;
67#define sample_valid(samples) ((samples) > 80) 75#define sample_valid(samples) ((samples) > 80)
68#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) 76#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node)
69 77
70struct cfq_ttime {
71 unsigned long last_end_request;
72
73 unsigned long ttime_total;
74 unsigned long ttime_samples;
75 unsigned long ttime_mean;
76};
77
78/* 78/*
79 * Most of our rbtree usage is for sorting with min extraction, so 79 * Most of our rbtree usage is for sorting with min extraction, so
80 * if we cache the leftmost node we don't have to walk down the tree 80 * if we cache the leftmost node we don't have to walk down the tree
@@ -171,53 +171,8 @@ enum wl_type_t {
171 SYNC_WORKLOAD = 2 171 SYNC_WORKLOAD = 2
172}; 172};
173 173
174struct cfqg_stats {
175#ifdef CONFIG_CFQ_GROUP_IOSCHED
176 /* total bytes transferred */
177 struct blkg_rwstat service_bytes;
178 /* total IOs serviced, post merge */
179 struct blkg_rwstat serviced;
180 /* number of ios merged */
181 struct blkg_rwstat merged;
182 /* total time spent on device in ns, may not be accurate w/ queueing */
183 struct blkg_rwstat service_time;
184 /* total time spent waiting in scheduler queue in ns */
185 struct blkg_rwstat wait_time;
186 /* number of IOs queued up */
187 struct blkg_rwstat queued;
188 /* total sectors transferred */
189 struct blkg_stat sectors;
190 /* total disk time and nr sectors dispatched by this group */
191 struct blkg_stat time;
192#ifdef CONFIG_DEBUG_BLK_CGROUP
193 /* time not charged to this cgroup */
194 struct blkg_stat unaccounted_time;
195 /* sum of number of ios queued across all samples */
196 struct blkg_stat avg_queue_size_sum;
197 /* count of samples taken for average */
198 struct blkg_stat avg_queue_size_samples;
199 /* how many times this group has been removed from service tree */
200 struct blkg_stat dequeue;
201 /* total time spent waiting for it to be assigned a timeslice. */
202 struct blkg_stat group_wait_time;
203 /* time spent idling for this blkcg_gq */
204 struct blkg_stat idle_time;
205 /* total time with empty current active q with other requests queued */
206 struct blkg_stat empty_time;
207 /* fields after this shouldn't be cleared on stat reset */
208 uint64_t start_group_wait_time;
209 uint64_t start_idle_time;
210 uint64_t start_empty_time;
211 uint16_t flags;
212#endif /* CONFIG_DEBUG_BLK_CGROUP */
213#endif /* CONFIG_CFQ_GROUP_IOSCHED */
214};
215
216/* This is per cgroup per device grouping structure */ 174/* This is per cgroup per device grouping structure */
217struct cfq_group { 175struct cfq_group {
218 /* must be the first member */
219 struct blkg_policy_data pd;
220
221 /* group service_tree member */ 176 /* group service_tree member */
222 struct rb_node rb_node; 177 struct rb_node rb_node;
223 178
@@ -225,7 +180,7 @@ struct cfq_group {
225 u64 vdisktime; 180 u64 vdisktime;
226 unsigned int weight; 181 unsigned int weight;
227 unsigned int new_weight; 182 unsigned int new_weight;
228 unsigned int dev_weight; 183 bool needs_update;
229 184
230 /* number of cfqq currently on this group */ 185 /* number of cfqq currently on this group */
231 int nr_cfqq; 186 int nr_cfqq;
@@ -251,21 +206,14 @@ struct cfq_group {
251 unsigned long saved_workload_slice; 206 unsigned long saved_workload_slice;
252 enum wl_type_t saved_workload; 207 enum wl_type_t saved_workload;
253 enum wl_prio_t saved_serving_prio; 208 enum wl_prio_t saved_serving_prio;
254 209 struct blkio_group blkg;
210#ifdef CONFIG_CFQ_GROUP_IOSCHED
211 struct hlist_node cfqd_node;
212 int ref;
213#endif
255 /* number of requests that are on the dispatch list or inside driver */ 214 /* number of requests that are on the dispatch list or inside driver */
256 int dispatched; 215 int dispatched;
257 struct cfq_ttime ttime; 216 struct cfq_ttime ttime;
258 struct cfqg_stats stats;
259};
260
261struct cfq_io_cq {
262 struct io_cq icq; /* must be the first member */
263 struct cfq_queue *cfqq[2];
264 struct cfq_ttime ttime;
265 int ioprio; /* the current ioprio */
266#ifdef CONFIG_CFQ_GROUP_IOSCHED
267 uint64_t blkcg_id; /* the current blkcg ID */
268#endif
269}; 217};
270 218
271/* 219/*
@@ -275,7 +223,7 @@ struct cfq_data {
275 struct request_queue *queue; 223 struct request_queue *queue;
276 /* Root service tree for cfq_groups */ 224 /* Root service tree for cfq_groups */
277 struct cfq_rb_root grp_service_tree; 225 struct cfq_rb_root grp_service_tree;
278 struct cfq_group *root_group; 226 struct cfq_group root_group;
279 227
280 /* 228 /*
281 * The priority currently being served 229 * The priority currently being served
@@ -319,7 +267,7 @@ struct cfq_data {
319 struct work_struct unplug_work; 267 struct work_struct unplug_work;
320 268
321 struct cfq_queue *active_queue; 269 struct cfq_queue *active_queue;
322 struct cfq_io_cq *active_cic; 270 struct cfq_io_context *active_cic;
323 271
324 /* 272 /*
325 * async queue for each priority case 273 * async queue for each priority case
@@ -341,7 +289,9 @@ struct cfq_data {
341 unsigned int cfq_slice_idle; 289 unsigned int cfq_slice_idle;
342 unsigned int cfq_group_idle; 290 unsigned int cfq_group_idle;
343 unsigned int cfq_latency; 291 unsigned int cfq_latency;
344 unsigned int cfq_target_latency; 292
293 unsigned int cic_index;
294 struct list_head cic_list;
345 295
346 /* 296 /*
347 * Fallback dummy cfqq for extreme OOM conditions 297 * Fallback dummy cfqq for extreme OOM conditions
@@ -349,6 +299,12 @@ struct cfq_data {
349 struct cfq_queue oom_cfqq; 299 struct cfq_queue oom_cfqq;
350 300
351 unsigned long last_delayed_sync; 301 unsigned long last_delayed_sync;
302
303 /* List of cfq groups being managed on this device*/
304 struct hlist_head cfqg_list;
305
306 /* Number of groups which are on blkcg->blkg_list */
307 unsigned int nr_blkcg_linked_grps;
352}; 308};
353 309
354static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); 310static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
@@ -411,286 +367,21 @@ CFQ_CFQQ_FNS(deep);
411CFQ_CFQQ_FNS(wait_busy); 367CFQ_CFQQ_FNS(wait_busy);
412#undef CFQ_CFQQ_FNS 368#undef CFQ_CFQQ_FNS
413 369
414static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
415{
416 return pd ? container_of(pd, struct cfq_group, pd) : NULL;
417}
418
419static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
420{
421 return pd_to_blkg(&cfqg->pd);
422}
423
424#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
425
426/* cfqg stats flags */
427enum cfqg_stats_flags {
428 CFQG_stats_waiting = 0,
429 CFQG_stats_idling,
430 CFQG_stats_empty,
431};
432
433#define CFQG_FLAG_FNS(name) \
434static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats) \
435{ \
436 stats->flags |= (1 << CFQG_stats_##name); \
437} \
438static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats) \
439{ \
440 stats->flags &= ~(1 << CFQG_stats_##name); \
441} \
442static inline int cfqg_stats_##name(struct cfqg_stats *stats) \
443{ \
444 return (stats->flags & (1 << CFQG_stats_##name)) != 0; \
445} \
446
447CFQG_FLAG_FNS(waiting)
448CFQG_FLAG_FNS(idling)
449CFQG_FLAG_FNS(empty)
450#undef CFQG_FLAG_FNS
451
452/* This should be called with the queue_lock held. */
453static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats)
454{
455 unsigned long long now;
456
457 if (!cfqg_stats_waiting(stats))
458 return;
459
460 now = sched_clock();
461 if (time_after64(now, stats->start_group_wait_time))
462 blkg_stat_add(&stats->group_wait_time,
463 now - stats->start_group_wait_time);
464 cfqg_stats_clear_waiting(stats);
465}
466
467/* This should be called with the queue_lock held. */
468static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,
469 struct cfq_group *curr_cfqg)
470{
471 struct cfqg_stats *stats = &cfqg->stats;
472
473 if (cfqg_stats_waiting(stats))
474 return;
475 if (cfqg == curr_cfqg)
476 return;
477 stats->start_group_wait_time = sched_clock();
478 cfqg_stats_mark_waiting(stats);
479}
480
481/* This should be called with the queue_lock held. */
482static void cfqg_stats_end_empty_time(struct cfqg_stats *stats)
483{
484 unsigned long long now;
485
486 if (!cfqg_stats_empty(stats))
487 return;
488
489 now = sched_clock();
490 if (time_after64(now, stats->start_empty_time))
491 blkg_stat_add(&stats->empty_time,
492 now - stats->start_empty_time);
493 cfqg_stats_clear_empty(stats);
494}
495
496static void cfqg_stats_update_dequeue(struct cfq_group *cfqg)
497{
498 blkg_stat_add(&cfqg->stats.dequeue, 1);
499}
500
501static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
502{
503 struct cfqg_stats *stats = &cfqg->stats;
504
505 if (blkg_rwstat_sum(&stats->queued))
506 return;
507
508 /*
509 * group is already marked empty. This can happen if cfqq got new
510 * request in parent group and moved to this group while being added
511 * to service tree. Just ignore the event and move on.
512 */
513 if (cfqg_stats_empty(stats))
514 return;
515
516 stats->start_empty_time = sched_clock();
517 cfqg_stats_mark_empty(stats);
518}
519
520static void cfqg_stats_update_idle_time(struct cfq_group *cfqg)
521{
522 struct cfqg_stats *stats = &cfqg->stats;
523
524 if (cfqg_stats_idling(stats)) {
525 unsigned long long now = sched_clock();
526
527 if (time_after64(now, stats->start_idle_time))
528 blkg_stat_add(&stats->idle_time,
529 now - stats->start_idle_time);
530 cfqg_stats_clear_idling(stats);
531 }
532}
533
534static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg)
535{
536 struct cfqg_stats *stats = &cfqg->stats;
537
538 BUG_ON(cfqg_stats_idling(stats));
539
540 stats->start_idle_time = sched_clock();
541 cfqg_stats_mark_idling(stats);
542}
543
544static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)
545{
546 struct cfqg_stats *stats = &cfqg->stats;
547
548 blkg_stat_add(&stats->avg_queue_size_sum,
549 blkg_rwstat_sum(&stats->queued));
550 blkg_stat_add(&stats->avg_queue_size_samples, 1);
551 cfqg_stats_update_group_wait_time(stats);
552}
553
554#else /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
555
556static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, struct cfq_group *curr_cfqg) { }
557static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { }
558static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { }
559static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { }
560static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { }
561static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { }
562static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }
563
564#endif /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
565
566#ifdef CONFIG_CFQ_GROUP_IOSCHED 370#ifdef CONFIG_CFQ_GROUP_IOSCHED
567 371#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
568static struct blkcg_policy blkcg_policy_cfq;
569
570static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
571{
572 return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
573}
574
575static inline void cfqg_get(struct cfq_group *cfqg)
576{
577 return blkg_get(cfqg_to_blkg(cfqg));
578}
579
580static inline void cfqg_put(struct cfq_group *cfqg)
581{
582 return blkg_put(cfqg_to_blkg(cfqg));
583}
584
585#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) do { \
586 char __pbuf[128]; \
587 \
588 blkg_path(cfqg_to_blkg((cfqq)->cfqg), __pbuf, sizeof(__pbuf)); \
589 blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ 372 blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
590 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ 373 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
591 __pbuf, ##args); \ 374 blkg_path(&(cfqq)->cfqg->blkg), ##args)
592} while (0)
593
594#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do { \
595 char __pbuf[128]; \
596 \
597 blkg_path(cfqg_to_blkg(cfqg), __pbuf, sizeof(__pbuf)); \
598 blk_add_trace_msg((cfqd)->queue, "%s " fmt, __pbuf, ##args); \
599} while (0)
600
601static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
602 struct cfq_group *curr_cfqg, int rw)
603{
604 blkg_rwstat_add(&cfqg->stats.queued, rw, 1);
605 cfqg_stats_end_empty_time(&cfqg->stats);
606 cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg);
607}
608 375
609static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, 376#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \
610 unsigned long time, unsigned long unaccounted_time) 377 blk_add_trace_msg((cfqd)->queue, "%s " fmt, \
611{ 378 blkg_path(&(cfqg)->blkg), ##args) \
612 blkg_stat_add(&cfqg->stats.time, time);
613#ifdef CONFIG_DEBUG_BLK_CGROUP
614 blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time);
615#endif
616}
617
618static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw)
619{
620 blkg_rwstat_add(&cfqg->stats.queued, rw, -1);
621}
622
623static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw)
624{
625 blkg_rwstat_add(&cfqg->stats.merged, rw, 1);
626}
627
628static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
629 uint64_t bytes, int rw)
630{
631 blkg_stat_add(&cfqg->stats.sectors, bytes >> 9);
632 blkg_rwstat_add(&cfqg->stats.serviced, rw, 1);
633 blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes);
634}
635
636static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
637 uint64_t start_time, uint64_t io_start_time, int rw)
638{
639 struct cfqg_stats *stats = &cfqg->stats;
640 unsigned long long now = sched_clock();
641
642 if (time_after64(now, io_start_time))
643 blkg_rwstat_add(&stats->service_time, rw, now - io_start_time);
644 if (time_after64(io_start_time, start_time))
645 blkg_rwstat_add(&stats->wait_time, rw,
646 io_start_time - start_time);
647}
648
649static void cfq_pd_reset_stats(struct blkcg_gq *blkg)
650{
651 struct cfq_group *cfqg = blkg_to_cfqg(blkg);
652 struct cfqg_stats *stats = &cfqg->stats;
653
654 /* queued stats shouldn't be cleared */
655 blkg_rwstat_reset(&stats->service_bytes);
656 blkg_rwstat_reset(&stats->serviced);
657 blkg_rwstat_reset(&stats->merged);
658 blkg_rwstat_reset(&stats->service_time);
659 blkg_rwstat_reset(&stats->wait_time);
660 blkg_stat_reset(&stats->time);
661#ifdef CONFIG_DEBUG_BLK_CGROUP
662 blkg_stat_reset(&stats->unaccounted_time);
663 blkg_stat_reset(&stats->avg_queue_size_sum);
664 blkg_stat_reset(&stats->avg_queue_size_samples);
665 blkg_stat_reset(&stats->dequeue);
666 blkg_stat_reset(&stats->group_wait_time);
667 blkg_stat_reset(&stats->idle_time);
668 blkg_stat_reset(&stats->empty_time);
669#endif
670}
671
672#else /* CONFIG_CFQ_GROUP_IOSCHED */
673
674static inline void cfqg_get(struct cfq_group *cfqg) { }
675static inline void cfqg_put(struct cfq_group *cfqg) { }
676 379
380#else
677#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 381#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
678 blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) 382 blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
679#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0) 383#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0)
680 384#endif
681static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
682 struct cfq_group *curr_cfqg, int rw) { }
683static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
684 unsigned long time, unsigned long unaccounted_time) { }
685static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { }
686static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { }
687static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
688 uint64_t bytes, int rw) { }
689static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
690 uint64_t start_time, uint64_t io_start_time, int rw) { }
691
692#endif /* CONFIG_CFQ_GROUP_IOSCHED */
693
694#define cfq_log(cfqd, fmt, args...) \ 385#define cfq_log(cfqd, fmt, args...) \
695 blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) 386 blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
696 387
@@ -771,38 +462,39 @@ static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
771} 462}
772 463
773static void cfq_dispatch_insert(struct request_queue *, struct request *); 464static void cfq_dispatch_insert(struct request_queue *, struct request *);
774static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync, 465static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
775 struct cfq_io_cq *cic, struct bio *bio, 466 struct io_context *, gfp_t);
776 gfp_t gfp_mask); 467static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
468 struct io_context *);
777 469
778static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq) 470static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
471 bool is_sync)
779{ 472{
780 /* cic->icq is the first member, %NULL will convert to %NULL */ 473 return cic->cfqq[is_sync];
781 return container_of(icq, struct cfq_io_cq, icq);
782} 474}
783 475
784static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd, 476static inline void cic_set_cfqq(struct cfq_io_context *cic,
785 struct io_context *ioc) 477 struct cfq_queue *cfqq, bool is_sync)
786{ 478{
787 if (ioc) 479 cic->cfqq[is_sync] = cfqq;
788 return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue));
789 return NULL;
790} 480}
791 481
792static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync) 482#define CIC_DEAD_KEY 1ul
793{ 483#define CIC_DEAD_INDEX_SHIFT 1
794 return cic->cfqq[is_sync];
795}
796 484
797static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq, 485static inline void *cfqd_dead_key(struct cfq_data *cfqd)
798 bool is_sync)
799{ 486{
800 cic->cfqq[is_sync] = cfqq; 487 return (void *)(cfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY);
801} 488}
802 489
803static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic) 490static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic)
804{ 491{
805 return cic->icq.q->elevator->elevator_data; 492 struct cfq_data *cfqd = cic->key;
493
494 if (unlikely((unsigned long) cfqd & CIC_DEAD_KEY))
495 return NULL;
496
497 return cfqd;
806} 498}
807 499
808/* 500/*
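The removed side of this hunk relies on a small pointer-tagging trick: cfqd_dead_key() packs cic_index into the upper bits of an unsigned long and sets the low bit (CIC_DEAD_KEY), so cic_to_cfqd() can tell a dead key from a live cfq_data pointer, which is always at least 2-byte aligned and therefore never has its low bit set. A minimal userspace sketch of that encoding, with invented names and values rather than kernel code:

#include <assert.h>
#include <stdio.h>

/* Low bit marks a key as dead; the real index lives above it. */
#define DEAD_KEY         1ul
#define DEAD_INDEX_SHIFT 1

/* Analogue of cfqd_dead_key(): encode an index as a tagged "dead" key. */
static void *make_dead_key(unsigned long index)
{
	return (void *)(index << DEAD_INDEX_SHIFT | DEAD_KEY);
}

/* Analogue of the check in cic_to_cfqd(): NULL for dead keys, else the pointer. */
static void *live_pointer(void *key)
{
	if ((unsigned long)key & DEAD_KEY)
		return NULL;
	return key;
}

int main(void)
{
	static int object;                /* aligned, so its low bit is clear */
	void *dead = make_dead_key(7);

	assert(live_pointer(&object) == &object);
	assert(live_pointer(dead) == NULL);

	/* The index stays recoverable, which is what lets cic_free_func()
	 * use it for radix_tree_delete(). */
	printf("index = %lu\n", (unsigned long)dead >> DEAD_INDEX_SHIFT);
	return 0;
}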
@@ -851,7 +543,7 @@ static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
851{ 543{
852 u64 d = delta << CFQ_SERVICE_SHIFT; 544 u64 d = delta << CFQ_SERVICE_SHIFT;
853 545
854 d = d * CFQ_WEIGHT_DEFAULT; 546 d = d * BLKIO_WEIGHT_DEFAULT;
855 do_div(d, cfqg->weight); 547 do_div(d, cfqg->weight);
856 return d; 548 return d;
857} 549}
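Both columns of the hunk above compute the same vdisktime scaling: the slice a group actually consumed is shifted up for fixed-point precision, multiplied by the default weight, and divided by the group's own weight, so a group with twice the default weight accumulates vdisktime at half the rate and is scheduled again sooner. A standalone arithmetic sketch (the shift and weight constants below are illustrative stand-ins, not taken from the kernel headers):

#include <stdio.h>

#define SERVICE_SHIFT  12     /* fixed-point shift, stand-in for CFQ_SERVICE_SHIFT */
#define WEIGHT_DEFAULT 500    /* stand-in for BLKIO_WEIGHT_DEFAULT */

/* Analogue of cfq_scale_slice(): scale used time into vdisktime units. */
static unsigned long long scale_slice(unsigned long delta, unsigned int weight)
{
	unsigned long long d = (unsigned long long)delta << SERVICE_SHIFT;

	return d * WEIGHT_DEFAULT / weight;
}

int main(void)
{
	/* Two groups each burn an 8 ms slice; the heavier one advances half as far. */
	printf("weight  500 -> vdisktime += %llu\n", scale_slice(8, 500));
	printf("weight 1000 -> vdisktime += %llu\n", scale_slice(8, 1000));
	return 0;
}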
@@ -911,7 +603,7 @@ cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
911{ 603{
912 struct cfq_rb_root *st = &cfqd->grp_service_tree; 604 struct cfq_rb_root *st = &cfqd->grp_service_tree;
913 605
914 return cfqd->cfq_target_latency * cfqg->weight / st->total_weight; 606 return cfq_target_latency * cfqg->weight / st->total_weight;
915} 607}
916 608
917static inline unsigned 609static inline unsigned
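The one-line change above only switches where the latency target comes from (a per-device tunable versus a global constant); the formula itself is a straight proportional share, slice = target_latency * weight / total_weight. A quick worked example, assuming a 300 ms target and made-up group weights:

#include <stdio.h>

/* Analogue of cfq_group_slice(): proportional share of the latency target. */
static unsigned group_slice(unsigned target_latency, unsigned weight,
			    unsigned total_weight)
{
	return target_latency * weight / total_weight;
}

int main(void)
{
	unsigned target = 300;                  /* ms, illustrative latency target */
	unsigned weights[] = { 100, 200, 700 }; /* three busy groups, total 1000 */

	for (int i = 0; i < 3; i++)
		printf("group %d gets %u ms of the %u ms round\n",
		       i, group_slice(target, weights[i], 1000), target);
	return 0;
}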
@@ -1178,9 +870,9 @@ static void
1178cfq_update_group_weight(struct cfq_group *cfqg) 870cfq_update_group_weight(struct cfq_group *cfqg)
1179{ 871{
1180 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); 872 BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
1181 if (cfqg->new_weight) { 873 if (cfqg->needs_update) {
1182 cfqg->weight = cfqg->new_weight; 874 cfqg->weight = cfqg->new_weight;
1183 cfqg->new_weight = 0; 875 cfqg->needs_update = false;
1184 } 876 }
1185} 877}
1186 878
@@ -1242,7 +934,7 @@ cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
1242 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); 934 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
1243 cfq_group_service_tree_del(st, cfqg); 935 cfq_group_service_tree_del(st, cfqg);
1244 cfqg->saved_workload_slice = 0; 936 cfqg->saved_workload_slice = 0;
1245 cfqg_stats_update_dequeue(cfqg); 937 cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
1246} 938}
1247 939
1248static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq, 940static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
@@ -1314,59 +1006,178 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
1314 "sl_used=%u disp=%u charge=%u iops=%u sect=%lu", 1006 "sl_used=%u disp=%u charge=%u iops=%u sect=%lu",
1315 used_sl, cfqq->slice_dispatch, charge, 1007 used_sl, cfqq->slice_dispatch, charge,
1316 iops_mode(cfqd), cfqq->nr_sectors); 1008 iops_mode(cfqd), cfqq->nr_sectors);
1317 cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl); 1009 cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl,
1318 cfqg_stats_set_start_empty_time(cfqg); 1010 unaccounted_sl);
1011 cfq_blkiocg_set_start_empty_time(&cfqg->blkg);
1319} 1012}
1320 1013
1321/** 1014#ifdef CONFIG_CFQ_GROUP_IOSCHED
1322 * cfq_init_cfqg_base - initialize base part of a cfq_group 1015static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
1323 * @cfqg: cfq_group to initialize 1016{
1324 * 1017 if (blkg)
1325 * Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED 1018 return container_of(blkg, struct cfq_group, blkg);
1326 * is enabled or not. 1019 return NULL;
1020}
1021
1022static void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
1023 unsigned int weight)
1024{
1025 struct cfq_group *cfqg = cfqg_of_blkg(blkg);
1026 cfqg->new_weight = weight;
1027 cfqg->needs_update = true;
1028}
1029
1030static void cfq_init_add_cfqg_lists(struct cfq_data *cfqd,
1031 struct cfq_group *cfqg, struct blkio_cgroup *blkcg)
1032{
1033 struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
1034 unsigned int major, minor;
1035
1036 /*
1037 * Add group onto cgroup list. It might happen that bdi->dev is
1038 * not initialized yet. Initialize this new group without major
1039 * and minor info and this info will be filled in once a new thread
1040 * comes for IO.
1041 */
1042 if (bdi->dev) {
1043 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
1044 cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
1045 (void *)cfqd, MKDEV(major, minor));
1046 } else
1047 cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg,
1048 (void *)cfqd, 0);
1049
1050 cfqd->nr_blkcg_linked_grps++;
1051 cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
1052
1053 /* Add group on cfqd list */
1054 hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
1055}
1056
1057/*
1058 * Should be called from sleepable context. No request queue lock as per
1059 * cpu stats are allocated dynamically and alloc_percpu needs to be called
1060 * from sleepable context.
1327 */ 1061 */
1328static void cfq_init_cfqg_base(struct cfq_group *cfqg) 1062static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd)
1329{ 1063{
1064 struct cfq_group *cfqg = NULL;
1065 int i, j, ret;
1330 struct cfq_rb_root *st; 1066 struct cfq_rb_root *st;
1331 int i, j; 1067
1068 cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
1069 if (!cfqg)
1070 return NULL;
1332 1071
1333 for_each_cfqg_st(cfqg, i, j, st) 1072 for_each_cfqg_st(cfqg, i, j, st)
1334 *st = CFQ_RB_ROOT; 1073 *st = CFQ_RB_ROOT;
1335 RB_CLEAR_NODE(&cfqg->rb_node); 1074 RB_CLEAR_NODE(&cfqg->rb_node);
1336 1075
1337 cfqg->ttime.last_end_request = jiffies; 1076 cfqg->ttime.last_end_request = jiffies;
1077
1078 /*
1079 * Take the initial reference that will be released on destroy
1080 * This can be thought of a joint reference by cgroup and
1081 * elevator which will be dropped by either elevator exit
1082 * or cgroup deletion path depending on who is exiting first.
1083 */
1084 cfqg->ref = 1;
1085
1086 ret = blkio_alloc_blkg_stats(&cfqg->blkg);
1087 if (ret) {
1088 kfree(cfqg);
1089 return NULL;
1090 }
1091
1092 return cfqg;
1338} 1093}
1339 1094
1340#ifdef CONFIG_CFQ_GROUP_IOSCHED 1095static struct cfq_group *
1341static void cfq_pd_init(struct blkcg_gq *blkg) 1096cfq_find_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg)
1342{ 1097{
1343 struct cfq_group *cfqg = blkg_to_cfqg(blkg); 1098 struct cfq_group *cfqg = NULL;
1099 void *key = cfqd;
1100 struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
1101 unsigned int major, minor;
1344 1102
1345 cfq_init_cfqg_base(cfqg); 1103 /*
1346 cfqg->weight = blkg->blkcg->cfq_weight; 1104 * This is the common case when there are no blkio cgroups.
1105 * Avoid lookup in this case
1106 */
1107 if (blkcg == &blkio_root_cgroup)
1108 cfqg = &cfqd->root_group;
1109 else
1110 cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
1111
1112 if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
1113 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
1114 cfqg->blkg.dev = MKDEV(major, minor);
1115 }
1116
1117 return cfqg;
1347} 1118}
1348 1119
1349/* 1120/*
1350 * Search for the cfq group current task belongs to. request_queue lock must 1121 * Search for the cfq group current task belongs to. request_queue lock must
1351 * be held. 1122 * be held.
1352 */ 1123 */
1353static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, 1124static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd)
1354 struct blkcg *blkcg)
1355{ 1125{
1126 struct blkio_cgroup *blkcg;
1127 struct cfq_group *cfqg = NULL, *__cfqg = NULL;
1356 struct request_queue *q = cfqd->queue; 1128 struct request_queue *q = cfqd->queue;
1357 struct cfq_group *cfqg = NULL;
1358 1129
1359 /* avoid lookup for the common case where there's no blkcg */ 1130 rcu_read_lock();
1360 if (blkcg == &blkcg_root) { 1131 blkcg = task_blkio_cgroup(current);
1361 cfqg = cfqd->root_group; 1132 cfqg = cfq_find_cfqg(cfqd, blkcg);
1362 } else { 1133 if (cfqg) {
1363 struct blkcg_gq *blkg; 1134 rcu_read_unlock();
1135 return cfqg;
1136 }
1137
1138 /*
1139 * Need to allocate a group. Allocation of group also needs allocation
1140 * of per cpu stats which in-turn takes a mutex() and can block. Hence
1141 * we need to drop rcu lock and queue_lock before we call alloc.
1142 *
1143 * Not taking any queue reference here and assuming that queue is
1144 * around by the time we return. CFQ queue allocation code does
1145 * the same. It might be racy though.
1146 */
1147
1148 rcu_read_unlock();
1149 spin_unlock_irq(q->queue_lock);
1150
1151 cfqg = cfq_alloc_cfqg(cfqd);
1364 1152
1365 blkg = blkg_lookup_create(blkcg, q); 1153 spin_lock_irq(q->queue_lock);
1366 if (!IS_ERR(blkg)) 1154
1367 cfqg = blkg_to_cfqg(blkg); 1155 rcu_read_lock();
1156 blkcg = task_blkio_cgroup(current);
1157
1158 /*
1159 * If some other thread already allocated the group while we were
1160 * not holding queue lock, free up the group
1161 */
1162 __cfqg = cfq_find_cfqg(cfqd, blkcg);
1163
1164 if (__cfqg) {
1165 kfree(cfqg);
1166 rcu_read_unlock();
1167 return __cfqg;
1368 } 1168 }
1369 1169
1170 if (!cfqg)
1171 cfqg = &cfqd->root_group;
1172
1173 cfq_init_add_cfqg_lists(cfqd, cfqg, blkcg);
1174 rcu_read_unlock();
1175 return cfqg;
1176}
1177
1178static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
1179{
1180 cfqg->ref++;
1370 return cfqg; 1181 return cfqg;
1371} 1182}
1372 1183
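The restored cfq_get_cfqg() path spells out a common pattern in its comments: the per-cpu stats allocation can block, so the function drops the rcu and queue locks, allocates, retakes the locks, and then re-checks whether another thread created the group in the meantime, freeing the duplicate if so. A minimal pthread-based sketch of that drop-lock/allocate/re-check idiom, with invented names and no RCU:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
static struct group { int weight; } *the_group;   /* protected by queue_lock */

static struct group *get_group(void)
{
	struct group *g, *winner;

	pthread_mutex_lock(&queue_lock);
	if (the_group) {                       /* fast path: already created */
		g = the_group;
		pthread_mutex_unlock(&queue_lock);
		return g;
	}
	pthread_mutex_unlock(&queue_lock);     /* cannot allocate while holding it */

	g = malloc(sizeof(*g));                /* the potentially blocking step */
	if (!g)
		return NULL;
	g->weight = 500;

	pthread_mutex_lock(&queue_lock);
	winner = the_group;
	if (winner) {                          /* someone beat us to it: discard ours */
		free(g);
		g = winner;
	} else {
		the_group = g;
	}
	pthread_mutex_unlock(&queue_lock);
	return g;
}

int main(void)
{
	printf("weight = %d\n", get_group()->weight);
	return 0;
}

The kernel version additionally falls back to the root group when the allocation fails, which the sketch leaves out.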
@@ -1374,224 +1185,94 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
1374{ 1185{
1375 /* Currently, all async queues are mapped to root group */ 1186 /* Currently, all async queues are mapped to root group */
1376 if (!cfq_cfqq_sync(cfqq)) 1187 if (!cfq_cfqq_sync(cfqq))
1377 cfqg = cfqq->cfqd->root_group; 1188 cfqg = &cfqq->cfqd->root_group;
1378 1189
1379 cfqq->cfqg = cfqg; 1190 cfqq->cfqg = cfqg;
1380 /* cfqq reference on cfqg */ 1191 /* cfqq reference on cfqg */
1381 cfqg_get(cfqg); 1192 cfqq->cfqg->ref++;
1382} 1193}
1383 1194
1384static u64 cfqg_prfill_weight_device(struct seq_file *sf, 1195static void cfq_put_cfqg(struct cfq_group *cfqg)
1385 struct blkg_policy_data *pd, int off)
1386{ 1196{
1387 struct cfq_group *cfqg = pd_to_cfqg(pd); 1197 struct cfq_rb_root *st;
1388 1198 int i, j;
1389 if (!cfqg->dev_weight)
1390 return 0;
1391 return __blkg_prfill_u64(sf, pd, cfqg->dev_weight);
1392}
1393
1394static int cfqg_print_weight_device(struct cgroup *cgrp, struct cftype *cft,
1395 struct seq_file *sf)
1396{
1397 blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp),
1398 cfqg_prfill_weight_device, &blkcg_policy_cfq, 0,
1399 false);
1400 return 0;
1401}
1402 1199
1403static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft, 1200 BUG_ON(cfqg->ref <= 0);
1404 struct seq_file *sf) 1201 cfqg->ref--;
1405{ 1202 if (cfqg->ref)
1406 seq_printf(sf, "%u\n", cgroup_to_blkcg(cgrp)->cfq_weight); 1203 return;
1407 return 0; 1204 for_each_cfqg_st(cfqg, i, j, st)
1205 BUG_ON(!RB_EMPTY_ROOT(&st->rb));
1206 free_percpu(cfqg->blkg.stats_cpu);
1207 kfree(cfqg);
1408} 1208}
1409 1209
1410static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, 1210static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
1411 const char *buf)
1412{ 1211{
1413 struct blkcg *blkcg = cgroup_to_blkcg(cgrp); 1212 /* Something wrong if we are trying to remove same group twice */
1414 struct blkg_conf_ctx ctx; 1213 BUG_ON(hlist_unhashed(&cfqg->cfqd_node));
1415 struct cfq_group *cfqg;
1416 int ret;
1417 1214
1418 ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx); 1215 hlist_del_init(&cfqg->cfqd_node);
1419 if (ret)
1420 return ret;
1421 1216
1422 ret = -EINVAL; 1217 BUG_ON(cfqd->nr_blkcg_linked_grps <= 0);
1423 cfqg = blkg_to_cfqg(ctx.blkg); 1218 cfqd->nr_blkcg_linked_grps--;
1424 if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) {
1425 cfqg->dev_weight = ctx.v;
1426 cfqg->new_weight = cfqg->dev_weight ?: blkcg->cfq_weight;
1427 ret = 0;
1428 }
1429 1219
1430 blkg_conf_finish(&ctx); 1220 /*
1431 return ret; 1221 * Put the reference taken at the time of creation so that when all
1222 * queues are gone, group can be destroyed.
1223 */
1224 cfq_put_cfqg(cfqg);
1432} 1225}
1433 1226
1434static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val) 1227static void cfq_release_cfq_groups(struct cfq_data *cfqd)
1435{ 1228{
1436 struct blkcg *blkcg = cgroup_to_blkcg(cgrp); 1229 struct hlist_node *pos, *n;
1437 struct blkcg_gq *blkg; 1230 struct cfq_group *cfqg;
1438 struct hlist_node *n;
1439
1440 if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX)
1441 return -EINVAL;
1442
1443 spin_lock_irq(&blkcg->lock);
1444 blkcg->cfq_weight = (unsigned int)val;
1445
1446 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
1447 struct cfq_group *cfqg = blkg_to_cfqg(blkg);
1448 1231
1449 if (cfqg && !cfqg->dev_weight) 1232 hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) {
1450 cfqg->new_weight = blkcg->cfq_weight; 1233 /*
1234 * If cgroup removal path got to blk_group first and removed
1235 * it from cgroup list, then it will take care of destroying
1236 * cfqg also.
1237 */
1238 if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
1239 cfq_destroy_cfqg(cfqd, cfqg);
1451 } 1240 }
1452
1453 spin_unlock_irq(&blkcg->lock);
1454 return 0;
1455} 1241}
1456 1242
1457static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft, 1243/*
1458 struct seq_file *sf) 1244 * Blk cgroup controller notification saying that blkio_group object is being
1459{ 1245 * delinked as associated cgroup object is going away. That also means that
1460 struct blkcg *blkcg = cgroup_to_blkcg(cgrp); 1246 * no new IO will come in this group. So get rid of this group as soon as
1461 1247 * any pending IO in the group is finished.
1462 blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkcg_policy_cfq, 1248 *
1463 cft->private, false); 1249 * This function is called under rcu_read_lock(). key is the rcu protected
1464 return 0; 1250 * pointer. That means "key" is a valid cfq_data pointer as long as we are rcu
1465} 1251 * read lock.
1466 1252 *
1467static int cfqg_print_rwstat(struct cgroup *cgrp, struct cftype *cft, 1253 * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
 1468 struct seq_file *sf) 1254 * it should not be NULL as even if elevator was exiting, cgroup deletion

1469{ 1255 * path got to it first.
1470 struct blkcg *blkcg = cgroup_to_blkcg(cgrp); 1256 */
1471 1257static void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg)
1472 blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkcg_policy_cfq,
1473 cft->private, true);
1474 return 0;
1475}
1476
1477#ifdef CONFIG_DEBUG_BLK_CGROUP
1478static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
1479 struct blkg_policy_data *pd, int off)
1480{ 1258{
1481 struct cfq_group *cfqg = pd_to_cfqg(pd); 1259 unsigned long flags;
1482 u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples); 1260 struct cfq_data *cfqd = key;
1483 u64 v = 0;
1484 1261
1485 if (samples) { 1262 spin_lock_irqsave(cfqd->queue->queue_lock, flags);
1486 v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum); 1263 cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg));
1487 do_div(v, samples); 1264 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
1488 }
1489 __blkg_prfill_u64(sf, pd, v);
1490 return 0;
1491} 1265}
1492 1266
1493/* print avg_queue_size */ 1267#else /* GROUP_IOSCHED */
1494static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft, 1268static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd)
1495 struct seq_file *sf)
1496{ 1269{
1497 struct blkcg *blkcg = cgroup_to_blkcg(cgrp); 1270 return &cfqd->root_group;
1498
1499 blkcg_print_blkgs(sf, blkcg, cfqg_prfill_avg_queue_size,
1500 &blkcg_policy_cfq, 0, false);
1501 return 0;
1502} 1271}
1503#endif /* CONFIG_DEBUG_BLK_CGROUP */
1504 1272
1505static struct cftype cfq_blkcg_files[] = { 1273static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
1506 {
1507 .name = "weight_device",
1508 .read_seq_string = cfqg_print_weight_device,
1509 .write_string = cfqg_set_weight_device,
1510 .max_write_len = 256,
1511 },
1512 {
1513 .name = "weight",
1514 .read_seq_string = cfq_print_weight,
1515 .write_u64 = cfq_set_weight,
1516 },
1517 {
1518 .name = "time",
1519 .private = offsetof(struct cfq_group, stats.time),
1520 .read_seq_string = cfqg_print_stat,
1521 },
1522 {
1523 .name = "sectors",
1524 .private = offsetof(struct cfq_group, stats.sectors),
1525 .read_seq_string = cfqg_print_stat,
1526 },
1527 {
1528 .name = "io_service_bytes",
1529 .private = offsetof(struct cfq_group, stats.service_bytes),
1530 .read_seq_string = cfqg_print_rwstat,
1531 },
1532 {
1533 .name = "io_serviced",
1534 .private = offsetof(struct cfq_group, stats.serviced),
1535 .read_seq_string = cfqg_print_rwstat,
1536 },
1537 {
1538 .name = "io_service_time",
1539 .private = offsetof(struct cfq_group, stats.service_time),
1540 .read_seq_string = cfqg_print_rwstat,
1541 },
1542 {
1543 .name = "io_wait_time",
1544 .private = offsetof(struct cfq_group, stats.wait_time),
1545 .read_seq_string = cfqg_print_rwstat,
1546 },
1547 {
1548 .name = "io_merged",
1549 .private = offsetof(struct cfq_group, stats.merged),
1550 .read_seq_string = cfqg_print_rwstat,
1551 },
1552 {
1553 .name = "io_queued",
1554 .private = offsetof(struct cfq_group, stats.queued),
1555 .read_seq_string = cfqg_print_rwstat,
1556 },
1557#ifdef CONFIG_DEBUG_BLK_CGROUP
1558 {
1559 .name = "avg_queue_size",
1560 .read_seq_string = cfqg_print_avg_queue_size,
1561 },
1562 {
1563 .name = "group_wait_time",
1564 .private = offsetof(struct cfq_group, stats.group_wait_time),
1565 .read_seq_string = cfqg_print_stat,
1566 },
1567 {
1568 .name = "idle_time",
1569 .private = offsetof(struct cfq_group, stats.idle_time),
1570 .read_seq_string = cfqg_print_stat,
1571 },
1572 {
1573 .name = "empty_time",
1574 .private = offsetof(struct cfq_group, stats.empty_time),
1575 .read_seq_string = cfqg_print_stat,
1576 },
1577 {
1578 .name = "dequeue",
1579 .private = offsetof(struct cfq_group, stats.dequeue),
1580 .read_seq_string = cfqg_print_stat,
1581 },
1582 {
1583 .name = "unaccounted_time",
1584 .private = offsetof(struct cfq_group, stats.unaccounted_time),
1585 .read_seq_string = cfqg_print_stat,
1586 },
1587#endif /* CONFIG_DEBUG_BLK_CGROUP */
1588 { } /* terminate */
1589};
1590#else /* GROUP_IOSCHED */
1591static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
1592 struct blkcg *blkcg)
1593{ 1274{
1594 return cfqd->root_group; 1275 return cfqg;
1595} 1276}
1596 1277
1597static inline void 1278static inline void
@@ -1599,6 +1280,9 @@ cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
1599 cfqq->cfqg = cfqg; 1280 cfqq->cfqg = cfqg;
1600} 1281}
1601 1282
1283static void cfq_release_cfq_groups(struct cfq_data *cfqd) {}
1284static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}
1285
1602#endif /* GROUP_IOSCHED */ 1286#endif /* GROUP_IOSCHED */
1603 1287
1604/* 1288/*
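The restored cfq_put_cfqg()/cfq_destroy_cfqg() pair above uses a plain integer reference count: a group starts at ref = 1 (the joint reference shared by the cgroup and the elevator, as the allocation comment explains), every cfqq linking to it takes another reference, and whoever drops the count to zero frees it. A small non-kernel sketch of that lifetime rule, single-threaded for simplicity (the real code relies on the queue lock for serialization):

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

struct group {
	int ref;
};

static struct group *group_alloc(void)
{
	struct group *g = calloc(1, sizeof(*g));

	if (g)
		g->ref = 1;     /* initial "joint" reference, dropped at teardown */
	return g;
}

static void group_get(struct group *g)
{
	g->ref++;
}

static void group_put(struct group *g)
{
	assert(g->ref > 0);
	if (--g->ref == 0) {
		printf("last reference dropped, freeing group\n");
		free(g);
	}
}

int main(void)
{
	struct group *g = group_alloc();

	group_get(g);           /* a queue links itself to the group */
	group_put(g);           /* the queue goes away */
	group_put(g);           /* teardown drops the initial reference */
	return 0;
}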
@@ -1865,17 +1549,19 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
1865{ 1549{
1866 elv_rb_del(&cfqq->sort_list, rq); 1550 elv_rb_del(&cfqq->sort_list, rq);
1867 cfqq->queued[rq_is_sync(rq)]--; 1551 cfqq->queued[rq_is_sync(rq)]--;
1868 cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags); 1552 cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
1553 rq_data_dir(rq), rq_is_sync(rq));
1869 cfq_add_rq_rb(rq); 1554 cfq_add_rq_rb(rq);
1870 cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group, 1555 cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
1871 rq->cmd_flags); 1556 &cfqq->cfqd->serving_group->blkg, rq_data_dir(rq),
1557 rq_is_sync(rq));
1872} 1558}
1873 1559
1874static struct request * 1560static struct request *
1875cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio) 1561cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
1876{ 1562{
1877 struct task_struct *tsk = current; 1563 struct task_struct *tsk = current;
1878 struct cfq_io_cq *cic; 1564 struct cfq_io_context *cic;
1879 struct cfq_queue *cfqq; 1565 struct cfq_queue *cfqq;
1880 1566
1881 cic = cfq_cic_lookup(cfqd, tsk->io_context); 1567 cic = cfq_cic_lookup(cfqd, tsk->io_context);
@@ -1924,7 +1610,8 @@ static void cfq_remove_request(struct request *rq)
1924 cfq_del_rq_rb(rq); 1610 cfq_del_rq_rb(rq);
1925 1611
1926 cfqq->cfqd->rq_queued--; 1612 cfqq->cfqd->rq_queued--;
1927 cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags); 1613 cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
1614 rq_data_dir(rq), rq_is_sync(rq));
1928 if (rq->cmd_flags & REQ_PRIO) { 1615 if (rq->cmd_flags & REQ_PRIO) {
1929 WARN_ON(!cfqq->prio_pending); 1616 WARN_ON(!cfqq->prio_pending);
1930 cfqq->prio_pending--; 1617 cfqq->prio_pending--;
@@ -1959,7 +1646,8 @@ static void cfq_merged_request(struct request_queue *q, struct request *req,
1959static void cfq_bio_merged(struct request_queue *q, struct request *req, 1646static void cfq_bio_merged(struct request_queue *q, struct request *req,
1960 struct bio *bio) 1647 struct bio *bio)
1961{ 1648{
1962 cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_rw); 1649 cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg,
1650 bio_data_dir(bio), cfq_bio_sync(bio));
1963} 1651}
1964 1652
1965static void 1653static void
@@ -1967,14 +1655,11 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
1967 struct request *next) 1655 struct request *next)
1968{ 1656{
1969 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1657 struct cfq_queue *cfqq = RQ_CFQQ(rq);
1970 struct cfq_data *cfqd = q->elevator->elevator_data;
1971
1972 /* 1658 /*
1973 * reposition in fifo if next is older than rq 1659 * reposition in fifo if next is older than rq
1974 */ 1660 */
1975 if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && 1661 if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
1976 time_before(rq_fifo_time(next), rq_fifo_time(rq)) && 1662 time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
1977 cfqq == RQ_CFQQ(next)) {
1978 list_move(&rq->queuelist, &next->queuelist); 1663 list_move(&rq->queuelist, &next->queuelist);
1979 rq_set_fifo_time(rq, rq_fifo_time(next)); 1664 rq_set_fifo_time(rq, rq_fifo_time(next));
1980 } 1665 }
@@ -1982,24 +1667,15 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
1982 if (cfqq->next_rq == next) 1667 if (cfqq->next_rq == next)
1983 cfqq->next_rq = rq; 1668 cfqq->next_rq = rq;
1984 cfq_remove_request(next); 1669 cfq_remove_request(next);
1985 cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags); 1670 cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg,
1986 1671 rq_data_dir(next), rq_is_sync(next));
1987 cfqq = RQ_CFQQ(next);
1988 /*
1989 * all requests of this queue are merged to other queues, delete it
1990 * from the service tree. If it's the active_queue,
1991 * cfq_dispatch_requests() will choose to expire it or do idle
1992 */
1993 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) &&
1994 cfqq != cfqd->active_queue)
1995 cfq_del_cfqq_rr(cfqd, cfqq);
1996} 1672}
1997 1673
1998static int cfq_allow_merge(struct request_queue *q, struct request *rq, 1674static int cfq_allow_merge(struct request_queue *q, struct request *rq,
1999 struct bio *bio) 1675 struct bio *bio)
2000{ 1676{
2001 struct cfq_data *cfqd = q->elevator->elevator_data; 1677 struct cfq_data *cfqd = q->elevator->elevator_data;
2002 struct cfq_io_cq *cic; 1678 struct cfq_io_context *cic;
2003 struct cfq_queue *cfqq; 1679 struct cfq_queue *cfqq;
2004 1680
2005 /* 1681 /*
@@ -2009,7 +1685,7 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq,
2009 return false; 1685 return false;
2010 1686
2011 /* 1687 /*
2012 * Lookup the cfqq that this bio will be queued with and allow 1688 * Lookup the cfqq that this bio will be queued with. Allow
2013 * merge only if rq is queued there. 1689 * merge only if rq is queued there.
2014 */ 1690 */
2015 cic = cfq_cic_lookup(cfqd, current->io_context); 1691 cic = cfq_cic_lookup(cfqd, current->io_context);
@@ -2023,7 +1699,7 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq,
2023static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1699static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2024{ 1700{
2025 del_timer(&cfqd->idle_slice_timer); 1701 del_timer(&cfqd->idle_slice_timer);
2026 cfqg_stats_update_idle_time(cfqq->cfqg); 1702 cfq_blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg);
2027} 1703}
2028 1704
2029static void __cfq_set_active_queue(struct cfq_data *cfqd, 1705static void __cfq_set_active_queue(struct cfq_data *cfqd,
@@ -2032,7 +1708,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
2032 if (cfqq) { 1708 if (cfqq) {
2033 cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d", 1709 cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
2034 cfqd->serving_prio, cfqd->serving_type); 1710 cfqd->serving_prio, cfqd->serving_type);
2035 cfqg_stats_update_avg_queue_size(cfqq->cfqg); 1711 cfq_blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg);
2036 cfqq->slice_start = 0; 1712 cfqq->slice_start = 0;
2037 cfqq->dispatch_start = jiffies; 1713 cfqq->dispatch_start = jiffies;
2038 cfqq->allocated_slice = 0; 1714 cfqq->allocated_slice = 0;
@@ -2098,7 +1774,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2098 cfqd->active_queue = NULL; 1774 cfqd->active_queue = NULL;
2099 1775
2100 if (cfqd->active_cic) { 1776 if (cfqd->active_cic) {
2101 put_io_context(cfqd->active_cic->icq.ioc); 1777 put_io_context(cfqd->active_cic->ioc);
2102 cfqd->active_cic = NULL; 1778 cfqd->active_cic = NULL;
2103 } 1779 }
2104} 1780}
@@ -2318,7 +1994,7 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2318static void cfq_arm_slice_timer(struct cfq_data *cfqd) 1994static void cfq_arm_slice_timer(struct cfq_data *cfqd)
2319{ 1995{
2320 struct cfq_queue *cfqq = cfqd->active_queue; 1996 struct cfq_queue *cfqq = cfqd->active_queue;
2321 struct cfq_io_cq *cic; 1997 struct cfq_io_context *cic;
2322 unsigned long sl, group_idle = 0; 1998 unsigned long sl, group_idle = 0;
2323 1999
2324 /* 2000 /*
@@ -2353,7 +2029,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
2353 * task has exited, don't wait 2029 * task has exited, don't wait
2354 */ 2030 */
2355 cic = cfqd->active_cic; 2031 cic = cfqd->active_cic;
2356 if (!cic || !atomic_read(&cic->icq.ioc->active_ref)) 2032 if (!cic || !atomic_read(&cic->ioc->nr_tasks))
2357 return; 2033 return;
2358 2034
2359 /* 2035 /*
@@ -2380,7 +2056,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
2380 sl = cfqd->cfq_slice_idle; 2056 sl = cfqd->cfq_slice_idle;
2381 2057
2382 mod_timer(&cfqd->idle_slice_timer, jiffies + sl); 2058 mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
2383 cfqg_stats_set_start_idle_time(cfqq->cfqg); 2059 cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg);
2384 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl, 2060 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl,
2385 group_idle ? 1 : 0); 2061 group_idle ? 1 : 0);
2386} 2062}
@@ -2403,7 +2079,8 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
2403 2079
2404 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; 2080 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
2405 cfqq->nr_sectors += blk_rq_sectors(rq); 2081 cfqq->nr_sectors += blk_rq_sectors(rq);
2406 cfqg_stats_update_dispatch(cfqq->cfqg, blk_rq_bytes(rq), rq->cmd_flags); 2082 cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq),
2083 rq_data_dir(rq), rq_is_sync(rq));
2407} 2084}
2408 2085
2409/* 2086/*
@@ -2581,8 +2258,7 @@ new_workload:
 2581 * to have higher weight. A more accurate thing would be to 2258 * calculate system wide async/sync ratio.
 2582 * calculate system wide async/sync ratio. 2259 */
2583 */ 2260 */
2584 tmp = cfqd->cfq_target_latency * 2261 tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg);
2585 cfqg_busy_async_queues(cfqd, cfqg);
2586 tmp = tmp/cfqd->busy_queues; 2262 tmp = tmp/cfqd->busy_queues;
2587 slice = min_t(unsigned, slice, tmp); 2263 slice = min_t(unsigned, slice, tmp);
2588 2264
@@ -2904,9 +2580,9 @@ static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2904 cfq_dispatch_insert(cfqd->queue, rq); 2580 cfq_dispatch_insert(cfqd->queue, rq);
2905 2581
2906 if (!cfqd->active_cic) { 2582 if (!cfqd->active_cic) {
2907 struct cfq_io_cq *cic = RQ_CIC(rq); 2583 struct cfq_io_context *cic = RQ_CIC(rq);
2908 2584
2909 atomic_long_inc(&cic->icq.ioc->refcount); 2585 atomic_long_inc(&cic->ioc->refcount);
2910 cfqd->active_cic = cic; 2586 cfqd->active_cic = cic;
2911 } 2587 }
2912 2588
@@ -2986,7 +2662,85 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
2986 2662
2987 BUG_ON(cfq_cfqq_on_rr(cfqq)); 2663 BUG_ON(cfq_cfqq_on_rr(cfqq));
2988 kmem_cache_free(cfq_pool, cfqq); 2664 kmem_cache_free(cfq_pool, cfqq);
2989 cfqg_put(cfqg); 2665 cfq_put_cfqg(cfqg);
2666}
2667
2668/*
2669 * Call func for each cic attached to this ioc.
2670 */
2671static void
2672call_for_each_cic(struct io_context *ioc,
2673 void (*func)(struct io_context *, struct cfq_io_context *))
2674{
2675 struct cfq_io_context *cic;
2676 struct hlist_node *n;
2677
2678 rcu_read_lock();
2679
2680 hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list)
2681 func(ioc, cic);
2682
2683 rcu_read_unlock();
2684}
2685
2686static void cfq_cic_free_rcu(struct rcu_head *head)
2687{
2688 struct cfq_io_context *cic;
2689
2690 cic = container_of(head, struct cfq_io_context, rcu_head);
2691
2692 kmem_cache_free(cfq_ioc_pool, cic);
2693 elv_ioc_count_dec(cfq_ioc_count);
2694
2695 if (ioc_gone) {
2696 /*
2697 * CFQ scheduler is exiting, grab exit lock and check
2698 * the pending io context count. If it hits zero,
2699 * complete ioc_gone and set it back to NULL
2700 */
2701 spin_lock(&ioc_gone_lock);
2702 if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) {
2703 complete(ioc_gone);
2704 ioc_gone = NULL;
2705 }
2706 spin_unlock(&ioc_gone_lock);
2707 }
2708}
2709
2710static void cfq_cic_free(struct cfq_io_context *cic)
2711{
2712 call_rcu(&cic->rcu_head, cfq_cic_free_rcu);
2713}
2714
2715static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic)
2716{
2717 unsigned long flags;
2718 unsigned long dead_key = (unsigned long) cic->key;
2719
2720 BUG_ON(!(dead_key & CIC_DEAD_KEY));
2721
2722 spin_lock_irqsave(&ioc->lock, flags);
2723 radix_tree_delete(&ioc->radix_root, dead_key >> CIC_DEAD_INDEX_SHIFT);
2724 hlist_del_rcu(&cic->cic_list);
2725 spin_unlock_irqrestore(&ioc->lock, flags);
2726
2727 cfq_cic_free(cic);
2728}
2729
2730/*
2731 * Must be called with rcu_read_lock() held or preemption otherwise disabled.
2732 * Only two callers of this - ->dtor() which is called with the rcu_read_lock(),
2733 * and ->trim() which is called with the task lock held
2734 */
2735static void cfq_free_io_context(struct io_context *ioc)
2736{
2737 /*
2738 * ioc->refcount is zero here, or we are called from elv_unregister(),
2739 * so no more cic's are allowed to be linked into this ioc. So it
2740 * should be ok to iterate over the known list, we will see all cic's
2741 * since no new ones are added.
2742 */
2743 call_for_each_cic(ioc, cic_free_func);
2990} 2744}
2991 2745
2992static void cfq_put_cooperator(struct cfq_queue *cfqq) 2746static void cfq_put_cooperator(struct cfq_queue *cfqq)
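cfq_cic_free_rcu() above implements a module-exit drain: a global count of outstanding io contexts plus an ioc_gone completion that the exiting scheduler waits on until the last deferred free has run. A simplified userspace analogue, with a mutex and condition variable standing in for the per-cpu counter, spinlock, and completion (all names invented for the sketch):

#include <pthread.h>
#include <stdio.h>

static int outstanding;                 /* plays the role of the io context count */
static int exiting;                     /* plays the role of a non-NULL ioc_gone */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;

static void ctx_alloc(void)
{
	pthread_mutex_lock(&lock);
	outstanding++;
	pthread_mutex_unlock(&lock);
}

/* Deferred free: drop the count and wake the exit path if it is waiting. */
static void ctx_free(void)
{
	pthread_mutex_lock(&lock);
	if (--outstanding == 0 && exiting)
		pthread_cond_signal(&drained);
	pthread_mutex_unlock(&lock);
}

/* Exit path: announce we are leaving, then wait for the count to drain. */
static void scheduler_exit(void)
{
	pthread_mutex_lock(&lock);
	exiting = 1;
	while (outstanding)
		pthread_cond_wait(&drained, &lock);
	pthread_mutex_unlock(&lock);
}

static void *late_free(void *arg)
{
	ctx_free();
	return arg;
}

int main(void)
{
	pthread_t t;

	ctx_alloc();
	pthread_create(&t, NULL, late_free, NULL);
	scheduler_exit();                  /* returns only once the context is gone */
	pthread_join(t, NULL);
	printf("all contexts drained\n");
	return 0;
}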
@@ -3022,17 +2776,27 @@ static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3022 cfq_put_queue(cfqq); 2776 cfq_put_queue(cfqq);
3023} 2777}
3024 2778
3025static void cfq_init_icq(struct io_cq *icq) 2779static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
2780 struct cfq_io_context *cic)
3026{ 2781{
3027 struct cfq_io_cq *cic = icq_to_cic(icq); 2782 struct io_context *ioc = cic->ioc;
3028 2783
3029 cic->ttime.last_end_request = jiffies; 2784 list_del_init(&cic->queue_list);
3030}
3031 2785
3032static void cfq_exit_icq(struct io_cq *icq) 2786 /*
3033{ 2787 * Make sure dead mark is seen for dead queues
3034 struct cfq_io_cq *cic = icq_to_cic(icq); 2788 */
3035 struct cfq_data *cfqd = cic_to_cfqd(cic); 2789 smp_wmb();
2790 cic->key = cfqd_dead_key(cfqd);
2791
2792 rcu_read_lock();
2793 if (rcu_dereference(ioc->ioc_data) == cic) {
2794 rcu_read_unlock();
2795 spin_lock(&ioc->lock);
2796 rcu_assign_pointer(ioc->ioc_data, NULL);
2797 spin_unlock(&ioc->lock);
2798 } else
2799 rcu_read_unlock();
3036 2800
3037 if (cic->cfqq[BLK_RW_ASYNC]) { 2801 if (cic->cfqq[BLK_RW_ASYNC]) {
3038 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); 2802 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
@@ -3045,7 +2809,58 @@ static void cfq_exit_icq(struct io_cq *icq)
3045 } 2809 }
3046} 2810}
3047 2811
3048static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic) 2812static void cfq_exit_single_io_context(struct io_context *ioc,
2813 struct cfq_io_context *cic)
2814{
2815 struct cfq_data *cfqd = cic_to_cfqd(cic);
2816
2817 if (cfqd) {
2818 struct request_queue *q = cfqd->queue;
2819 unsigned long flags;
2820
2821 spin_lock_irqsave(q->queue_lock, flags);
2822
2823 /*
2824 * Ensure we get a fresh copy of the ->key to prevent
2825 * race between exiting task and queue
2826 */
2827 smp_read_barrier_depends();
2828 if (cic->key == cfqd)
2829 __cfq_exit_single_io_context(cfqd, cic);
2830
2831 spin_unlock_irqrestore(q->queue_lock, flags);
2832 }
2833}
2834
2835/*
 2836 * The process that ioc belongs to has exited; we need to clean up
 2837 * and put the internal structures we have that belong to that process.
2838 */
2839static void cfq_exit_io_context(struct io_context *ioc)
2840{
2841 call_for_each_cic(ioc, cfq_exit_single_io_context);
2842}
2843
2844static struct cfq_io_context *
2845cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
2846{
2847 struct cfq_io_context *cic;
2848
2849 cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO,
2850 cfqd->queue->node);
2851 if (cic) {
2852 cic->ttime.last_end_request = jiffies;
2853 INIT_LIST_HEAD(&cic->queue_list);
2854 INIT_HLIST_NODE(&cic->cic_list);
2855 cic->dtor = cfq_free_io_context;
2856 cic->exit = cfq_exit_io_context;
2857 elv_ioc_count_inc(cfq_ioc_count);
2858 }
2859
2860 return cic;
2861}
2862
2863static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
3049{ 2864{
3050 struct task_struct *tsk = current; 2865 struct task_struct *tsk = current;
3051 int ioprio_class; 2866 int ioprio_class;
@@ -3053,7 +2868,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic)
3053 if (!cfq_cfqq_prio_changed(cfqq)) 2868 if (!cfq_cfqq_prio_changed(cfqq))
3054 return; 2869 return;
3055 2870
3056 ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); 2871 ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio);
3057 switch (ioprio_class) { 2872 switch (ioprio_class) {
3058 default: 2873 default:
3059 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); 2874 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
@@ -3065,11 +2880,11 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic)
3065 cfqq->ioprio_class = task_nice_ioclass(tsk); 2880 cfqq->ioprio_class = task_nice_ioclass(tsk);
3066 break; 2881 break;
3067 case IOPRIO_CLASS_RT: 2882 case IOPRIO_CLASS_RT:
3068 cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio); 2883 cfqq->ioprio = task_ioprio(ioc);
3069 cfqq->ioprio_class = IOPRIO_CLASS_RT; 2884 cfqq->ioprio_class = IOPRIO_CLASS_RT;
3070 break; 2885 break;
3071 case IOPRIO_CLASS_BE: 2886 case IOPRIO_CLASS_BE:
3072 cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio); 2887 cfqq->ioprio = task_ioprio(ioc);
3073 cfqq->ioprio_class = IOPRIO_CLASS_BE; 2888 cfqq->ioprio_class = IOPRIO_CLASS_BE;
3074 break; 2889 break;
3075 case IOPRIO_CLASS_IDLE: 2890 case IOPRIO_CLASS_IDLE:
@@ -3087,24 +2902,22 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic)
3087 cfq_clear_cfqq_prio_changed(cfqq); 2902 cfq_clear_cfqq_prio_changed(cfqq);
3088} 2903}
3089 2904
3090static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio) 2905static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic)
3091{ 2906{
3092 int ioprio = cic->icq.ioc->ioprio;
3093 struct cfq_data *cfqd = cic_to_cfqd(cic); 2907 struct cfq_data *cfqd = cic_to_cfqd(cic);
3094 struct cfq_queue *cfqq; 2908 struct cfq_queue *cfqq;
2909 unsigned long flags;
3095 2910
3096 /* 2911 if (unlikely(!cfqd))
3097 * Check whether ioprio has changed. The condition may trigger
3098 * spuriously on a newly created cic but there's no harm.
3099 */
3100 if (unlikely(!cfqd) || likely(cic->ioprio == ioprio))
3101 return; 2912 return;
3102 2913
2914 spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2915
3103 cfqq = cic->cfqq[BLK_RW_ASYNC]; 2916 cfqq = cic->cfqq[BLK_RW_ASYNC];
3104 if (cfqq) { 2917 if (cfqq) {
3105 struct cfq_queue *new_cfqq; 2918 struct cfq_queue *new_cfqq;
3106 new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio, 2919 new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->ioc,
3107 GFP_ATOMIC); 2920 GFP_ATOMIC);
3108 if (new_cfqq) { 2921 if (new_cfqq) {
3109 cic->cfqq[BLK_RW_ASYNC] = new_cfqq; 2922 cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
3110 cfq_put_queue(cfqq); 2923 cfq_put_queue(cfqq);
@@ -3115,7 +2928,13 @@ static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio)
3115 if (cfqq) 2928 if (cfqq)
3116 cfq_mark_cfqq_prio_changed(cfqq); 2929 cfq_mark_cfqq_prio_changed(cfqq);
3117 2930
3118 cic->ioprio = ioprio; 2931 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2932}
2933
2934static void cfq_ioc_set_ioprio(struct io_context *ioc)
2935{
2936 call_for_each_cic(ioc, changed_ioprio);
2937 ioc->ioprio_changed = 0;
3119} 2938}
3120 2939
3121static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, 2940static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
@@ -3139,24 +2958,20 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3139} 2958}
3140 2959
3141#ifdef CONFIG_CFQ_GROUP_IOSCHED 2960#ifdef CONFIG_CFQ_GROUP_IOSCHED
3142static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) 2961static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic)
3143{ 2962{
2963 struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1);
3144 struct cfq_data *cfqd = cic_to_cfqd(cic); 2964 struct cfq_data *cfqd = cic_to_cfqd(cic);
3145 struct cfq_queue *sync_cfqq; 2965 unsigned long flags;
3146 uint64_t id; 2966 struct request_queue *q;
3147
3148 rcu_read_lock();
3149 id = bio_blkcg(bio)->id;
3150 rcu_read_unlock();
3151 2967
3152 /* 2968 if (unlikely(!cfqd))
3153 * Check whether blkcg has changed. The condition may trigger
3154 * spuriously on a newly created cic but there's no harm.
3155 */
3156 if (unlikely(!cfqd) || likely(cic->blkcg_id == id))
3157 return; 2969 return;
3158 2970
3159 sync_cfqq = cic_to_cfqq(cic, 1); 2971 q = cfqd->queue;
2972
2973 spin_lock_irqsave(q->queue_lock, flags);
2974
3160 if (sync_cfqq) { 2975 if (sync_cfqq) {
3161 /* 2976 /*
3162 * Drop reference to sync queue. A new sync queue will be 2977 * Drop reference to sync queue. A new sync queue will be
@@ -3167,25 +2982,28 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
3167 cfq_put_queue(sync_cfqq); 2982 cfq_put_queue(sync_cfqq);
3168 } 2983 }
3169 2984
3170 cic->blkcg_id = id; 2985 spin_unlock_irqrestore(q->queue_lock, flags);
2986}
2987
2988static void cfq_ioc_set_cgroup(struct io_context *ioc)
2989{
2990 call_for_each_cic(ioc, changed_cgroup);
2991 ioc->cgroup_changed = 0;
3171} 2992}
3172#else
3173static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { }
3174#endif /* CONFIG_CFQ_GROUP_IOSCHED */ 2993#endif /* CONFIG_CFQ_GROUP_IOSCHED */
3175 2994
3176static struct cfq_queue * 2995static struct cfq_queue *
3177cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, 2996cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync,
3178 struct bio *bio, gfp_t gfp_mask) 2997 struct io_context *ioc, gfp_t gfp_mask)
3179{ 2998{
3180 struct blkcg *blkcg;
3181 struct cfq_queue *cfqq, *new_cfqq = NULL; 2999 struct cfq_queue *cfqq, *new_cfqq = NULL;
3000 struct cfq_io_context *cic;
3182 struct cfq_group *cfqg; 3001 struct cfq_group *cfqg;
3183 3002
3184retry: 3003retry:
3185 rcu_read_lock(); 3004 cfqg = cfq_get_cfqg(cfqd);
3186 3005 cic = cfq_cic_lookup(cfqd, ioc);
3187 blkcg = bio_blkcg(bio); 3006 /* cic always exists here */
3188 cfqg = cfq_lookup_create_cfqg(cfqd, blkcg);
3189 cfqq = cic_to_cfqq(cic, is_sync); 3007 cfqq = cic_to_cfqq(cic, is_sync);
3190 3008
3191 /* 3009 /*
@@ -3198,7 +3016,6 @@ retry:
3198 cfqq = new_cfqq; 3016 cfqq = new_cfqq;
3199 new_cfqq = NULL; 3017 new_cfqq = NULL;
3200 } else if (gfp_mask & __GFP_WAIT) { 3018 } else if (gfp_mask & __GFP_WAIT) {
3201 rcu_read_unlock();
3202 spin_unlock_irq(cfqd->queue->queue_lock); 3019 spin_unlock_irq(cfqd->queue->queue_lock);
3203 new_cfqq = kmem_cache_alloc_node(cfq_pool, 3020 new_cfqq = kmem_cache_alloc_node(cfq_pool,
3204 gfp_mask | __GFP_ZERO, 3021 gfp_mask | __GFP_ZERO,
@@ -3214,7 +3031,7 @@ retry:
3214 3031
3215 if (cfqq) { 3032 if (cfqq) {
3216 cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); 3033 cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
3217 cfq_init_prio_data(cfqq, cic); 3034 cfq_init_prio_data(cfqq, ioc);
3218 cfq_link_cfqq_cfqg(cfqq, cfqg); 3035 cfq_link_cfqq_cfqg(cfqq, cfqg);
3219 cfq_log_cfqq(cfqd, cfqq, "alloced"); 3036 cfq_log_cfqq(cfqd, cfqq, "alloced");
3220 } else 3037 } else
@@ -3224,7 +3041,6 @@ retry:
3224 if (new_cfqq) 3041 if (new_cfqq)
3225 kmem_cache_free(cfq_pool, new_cfqq); 3042 kmem_cache_free(cfq_pool, new_cfqq);
3226 3043
3227 rcu_read_unlock();
3228 return cfqq; 3044 return cfqq;
3229} 3045}
3230 3046
@@ -3234,9 +3050,6 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
3234 switch (ioprio_class) { 3050 switch (ioprio_class) {
3235 case IOPRIO_CLASS_RT: 3051 case IOPRIO_CLASS_RT:
3236 return &cfqd->async_cfqq[0][ioprio]; 3052 return &cfqd->async_cfqq[0][ioprio];
3237 case IOPRIO_CLASS_NONE:
3238 ioprio = IOPRIO_NORM;
3239 /* fall through */
3240 case IOPRIO_CLASS_BE: 3053 case IOPRIO_CLASS_BE:
3241 return &cfqd->async_cfqq[1][ioprio]; 3054 return &cfqd->async_cfqq[1][ioprio];
3242 case IOPRIO_CLASS_IDLE: 3055 case IOPRIO_CLASS_IDLE:
@@ -3247,11 +3060,11 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
3247} 3060}
3248 3061
3249static struct cfq_queue * 3062static struct cfq_queue *
3250cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, 3063cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
3251 struct bio *bio, gfp_t gfp_mask) 3064 gfp_t gfp_mask)
3252{ 3065{
3253 const int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); 3066 const int ioprio = task_ioprio(ioc);
3254 const int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); 3067 const int ioprio_class = task_ioprio_class(ioc);
3255 struct cfq_queue **async_cfqq = NULL; 3068 struct cfq_queue **async_cfqq = NULL;
3256 struct cfq_queue *cfqq = NULL; 3069 struct cfq_queue *cfqq = NULL;
3257 3070
@@ -3261,7 +3074,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
3261 } 3074 }
3262 3075
3263 if (!cfqq) 3076 if (!cfqq)
3264 cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask); 3077 cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask);
3265 3078
3266 /* 3079 /*
3267 * pin the queue now that it's allocated, scheduler exit will prune it 3080 * pin the queue now that it's allocated, scheduler exit will prune it
@@ -3275,6 +3088,160 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
3275 return cfqq; 3088 return cfqq;
3276} 3089}
3277 3090
3091/*
3092 * We drop cfq io contexts lazily, so we may find a dead one.
3093 */
3094static void
3095cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc,
3096 struct cfq_io_context *cic)
3097{
3098 unsigned long flags;
3099
3100 WARN_ON(!list_empty(&cic->queue_list));
3101 BUG_ON(cic->key != cfqd_dead_key(cfqd));
3102
3103 spin_lock_irqsave(&ioc->lock, flags);
3104
3105 BUG_ON(rcu_dereference_check(ioc->ioc_data,
3106 lockdep_is_held(&ioc->lock)) == cic);
3107
3108 radix_tree_delete(&ioc->radix_root, cfqd->cic_index);
3109 hlist_del_rcu(&cic->cic_list);
3110 spin_unlock_irqrestore(&ioc->lock, flags);
3111
3112 cfq_cic_free(cic);
3113}
3114
3115static struct cfq_io_context *
3116cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
3117{
3118 struct cfq_io_context *cic;
3119 unsigned long flags;
3120
3121 if (unlikely(!ioc))
3122 return NULL;
3123
3124 rcu_read_lock();
3125
3126 /*
3127 * we maintain a last-hit cache, to avoid browsing over the tree
3128 */
3129 cic = rcu_dereference(ioc->ioc_data);
3130 if (cic && cic->key == cfqd) {
3131 rcu_read_unlock();
3132 return cic;
3133 }
3134
3135 do {
3136 cic = radix_tree_lookup(&ioc->radix_root, cfqd->cic_index);
3137 rcu_read_unlock();
3138 if (!cic)
3139 break;
3140 if (unlikely(cic->key != cfqd)) {
3141 cfq_drop_dead_cic(cfqd, ioc, cic);
3142 rcu_read_lock();
3143 continue;
3144 }
3145
3146 spin_lock_irqsave(&ioc->lock, flags);
3147 rcu_assign_pointer(ioc->ioc_data, cic);
3148 spin_unlock_irqrestore(&ioc->lock, flags);
3149 break;
3150 } while (1);
3151
3152 return cic;
3153}
3154
3155/*
3156 * Add cic into ioc, using cfqd as the search key. This enables us to lookup
3157 * the process specific cfq io context when entered from the block layer.
3158 * Also adds the cic to a per-cfqd list, used when this queue is removed.
3159 */
3160static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
3161 struct cfq_io_context *cic, gfp_t gfp_mask)
3162{
3163 unsigned long flags;
3164 int ret;
3165
3166 ret = radix_tree_preload(gfp_mask);
3167 if (!ret) {
3168 cic->ioc = ioc;
3169 cic->key = cfqd;
3170
3171 spin_lock_irqsave(&ioc->lock, flags);
3172 ret = radix_tree_insert(&ioc->radix_root,
3173 cfqd->cic_index, cic);
3174 if (!ret)
3175 hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list);
3176 spin_unlock_irqrestore(&ioc->lock, flags);
3177
3178 radix_tree_preload_end();
3179
3180 if (!ret) {
3181 spin_lock_irqsave(cfqd->queue->queue_lock, flags);
3182 list_add(&cic->queue_list, &cfqd->cic_list);
3183 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
3184 }
3185 }
3186
3187 if (ret && ret != -EEXIST)
3188 printk(KERN_ERR "cfq: cic link failed!\n");
3189
3190 return ret;
3191}
3192
3193/*
3194 * Setup general io context and cfq io context. There can be several cfq
3195 * io contexts per general io context, if this process is doing io to more
3196 * than one device managed by cfq.
3197 */
3198static struct cfq_io_context *
3199cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
3200{
3201 struct io_context *ioc = NULL;
3202 struct cfq_io_context *cic;
3203 int ret;
3204
3205 might_sleep_if(gfp_mask & __GFP_WAIT);
3206
3207 ioc = get_io_context(gfp_mask, cfqd->queue->node);
3208 if (!ioc)
3209 return NULL;
3210
3211retry:
3212 cic = cfq_cic_lookup(cfqd, ioc);
3213 if (cic)
3214 goto out;
3215
3216 cic = cfq_alloc_io_context(cfqd, gfp_mask);
3217 if (cic == NULL)
3218 goto err;
3219
3220 ret = cfq_cic_link(cfqd, ioc, cic, gfp_mask);
3221 if (ret == -EEXIST) {
3222 /* someone has linked cic to ioc already */
3223 cfq_cic_free(cic);
3224 goto retry;
3225 } else if (ret)
3226 goto err_free;
3227
3228out:
3229 smp_read_barrier_depends();
3230 if (unlikely(ioc->ioprio_changed))
3231 cfq_ioc_set_ioprio(ioc);
3232
3233#ifdef CONFIG_CFQ_GROUP_IOSCHED
3234 if (unlikely(ioc->cgroup_changed))
3235 cfq_ioc_set_cgroup(ioc);
3236#endif
3237 return cic;
3238err_free:
3239 cfq_cic_free(cic);
3240err:
3241 put_io_context(ioc);
3242 return NULL;
3243}
3244
3278static void 3245static void
3279__cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle) 3246__cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle)
3280{ 3247{
@@ -3288,7 +3255,7 @@ __cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle)
3288 3255
3289static void 3256static void
3290cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3257cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3291 struct cfq_io_cq *cic) 3258 struct cfq_io_context *cic)
3292{ 3259{
3293 if (cfq_cfqq_sync(cfqq)) { 3260 if (cfq_cfqq_sync(cfqq)) {
3294 __cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle); 3261 __cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle);
@@ -3326,7 +3293,7 @@ cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3326 */ 3293 */
3327static void 3294static void
3328cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3295cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3329 struct cfq_io_cq *cic) 3296 struct cfq_io_context *cic)
3330{ 3297{
3331 int old_idle, enable_idle; 3298 int old_idle, enable_idle;
3332 3299
@@ -3343,9 +3310,8 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3343 3310
3344 if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE)) 3311 if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))
3345 enable_idle = 0; 3312 enable_idle = 0;
3346 else if (!atomic_read(&cic->icq.ioc->active_ref) || 3313 else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
3347 !cfqd->cfq_slice_idle || 3314 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
3348 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
3349 enable_idle = 0; 3315 enable_idle = 0;
3350 else if (sample_valid(cic->ttime.ttime_samples)) { 3316 else if (sample_valid(cic->ttime.ttime_samples)) {
3351 if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle) 3317 if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle)
@@ -3445,7 +3411,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
3445 */ 3411 */
3446static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) 3412static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3447{ 3413{
3448 enum wl_type_t old_type = cfqq_type(cfqd->active_queue); 3414 struct cfq_queue *old_cfqq = cfqd->active_queue;
3449 3415
3450 cfq_log_cfqq(cfqd, cfqq, "preempt"); 3416 cfq_log_cfqq(cfqd, cfqq, "preempt");
3451 cfq_slice_expired(cfqd, 1); 3417 cfq_slice_expired(cfqd, 1);
@@ -3454,7 +3420,7 @@ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3454 * workload type is changed, don't save slice, otherwise preempt 3420 * workload type is changed, don't save slice, otherwise preempt
3455 * doesn't happen 3421 * doesn't happen
3456 */ 3422 */
3457 if (old_type != cfqq_type(cfqq)) 3423 if (cfqq_type(old_cfqq) != cfqq_type(cfqq))
3458 cfqq->cfqg->saved_workload_slice = 0; 3424 cfqq->cfqg->saved_workload_slice = 0;
3459 3425
3460 /* 3426 /*
@@ -3477,7 +3443,7 @@ static void
3477cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3443cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3478 struct request *rq) 3444 struct request *rq)
3479{ 3445{
3480 struct cfq_io_cq *cic = RQ_CIC(rq); 3446 struct cfq_io_context *cic = RQ_CIC(rq);
3481 3447
3482 cfqd->rq_queued++; 3448 cfqd->rq_queued++;
3483 if (rq->cmd_flags & REQ_PRIO) 3449 if (rq->cmd_flags & REQ_PRIO)
@@ -3507,7 +3473,8 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3507 cfq_clear_cfqq_wait_request(cfqq); 3473 cfq_clear_cfqq_wait_request(cfqq);
3508 __blk_run_queue(cfqd->queue); 3474 __blk_run_queue(cfqd->queue);
3509 } else { 3475 } else {
3510 cfqg_stats_update_idle_time(cfqq->cfqg); 3476 cfq_blkiocg_update_idle_time_stats(
3477 &cfqq->cfqg->blkg);
3511 cfq_mark_cfqq_must_dispatch(cfqq); 3478 cfq_mark_cfqq_must_dispatch(cfqq);
3512 } 3479 }
3513 } 3480 }
@@ -3529,13 +3496,14 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
3529 struct cfq_queue *cfqq = RQ_CFQQ(rq); 3496 struct cfq_queue *cfqq = RQ_CFQQ(rq);
3530 3497
3531 cfq_log_cfqq(cfqd, cfqq, "insert_request"); 3498 cfq_log_cfqq(cfqd, cfqq, "insert_request");
3532 cfq_init_prio_data(cfqq, RQ_CIC(rq)); 3499 cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc);
3533 3500
3534 rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); 3501 rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
3535 list_add_tail(&rq->queuelist, &cfqq->fifo); 3502 list_add_tail(&rq->queuelist, &cfqq->fifo);
3536 cfq_add_rq_rb(rq); 3503 cfq_add_rq_rb(rq);
3537 cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group, 3504 cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
3538 rq->cmd_flags); 3505 &cfqd->serving_group->blkg, rq_data_dir(rq),
3506 rq_is_sync(rq));
3539 cfq_rq_enqueued(cfqd, cfqq, rq); 3507 cfq_rq_enqueued(cfqd, cfqq, rq);
3540} 3508}
3541 3509
@@ -3578,7 +3546,7 @@ static void cfq_update_hw_tag(struct cfq_data *cfqd)
3578 3546
3579static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq) 3547static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3580{ 3548{
3581 struct cfq_io_cq *cic = cfqd->active_cic; 3549 struct cfq_io_context *cic = cfqd->active_cic;
3582 3550
3583 /* If the queue already has requests, don't wait */ 3551 /* If the queue already has requests, don't wait */
3584 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) 3552 if (!RB_EMPTY_ROOT(&cfqq->sort_list))
@@ -3631,8 +3599,9 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
3631 cfqd->rq_in_driver--; 3599 cfqd->rq_in_driver--;
3632 cfqq->dispatched--; 3600 cfqq->dispatched--;
3633 (RQ_CFQG(rq))->dispatched--; 3601 (RQ_CFQG(rq))->dispatched--;
3634 cfqg_stats_update_completion(cfqq->cfqg, rq_start_time_ns(rq), 3602 cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg,
3635 rq_io_start_time_ns(rq), rq->cmd_flags); 3603 rq_start_time_ns(rq), rq_io_start_time_ns(rq),
3604 rq_data_dir(rq), rq_is_sync(rq));
3636 3605
3637 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; 3606 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
3638 3607
@@ -3714,7 +3683,7 @@ static int cfq_may_queue(struct request_queue *q, int rw)
3714{ 3683{
3715 struct cfq_data *cfqd = q->elevator->elevator_data; 3684 struct cfq_data *cfqd = q->elevator->elevator_data;
3716 struct task_struct *tsk = current; 3685 struct task_struct *tsk = current;
3717 struct cfq_io_cq *cic; 3686 struct cfq_io_context *cic;
3718 struct cfq_queue *cfqq; 3687 struct cfq_queue *cfqq;
3719 3688
3720 /* 3689 /*
@@ -3729,7 +3698,7 @@ static int cfq_may_queue(struct request_queue *q, int rw)
3729 3698
3730 cfqq = cic_to_cfqq(cic, rw_is_sync(rw)); 3699 cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
3731 if (cfqq) { 3700 if (cfqq) {
3732 cfq_init_prio_data(cfqq, cic); 3701 cfq_init_prio_data(cfqq, cic->ioc);
3733 3702
3734 return __cfq_may_queue(cfqq); 3703 return __cfq_may_queue(cfqq);
3735 } 3704 }
@@ -3750,17 +3719,21 @@ static void cfq_put_request(struct request *rq)
3750 BUG_ON(!cfqq->allocated[rw]); 3719 BUG_ON(!cfqq->allocated[rw]);
3751 cfqq->allocated[rw]--; 3720 cfqq->allocated[rw]--;
3752 3721
3722 put_io_context(RQ_CIC(rq)->ioc);
3723
3724 rq->elevator_private[0] = NULL;
3725 rq->elevator_private[1] = NULL;
3726
3753 /* Put down rq reference on cfqg */ 3727 /* Put down rq reference on cfqg */
3754 cfqg_put(RQ_CFQG(rq)); 3728 cfq_put_cfqg(RQ_CFQG(rq));
3755 rq->elv.priv[0] = NULL; 3729 rq->elevator_private[2] = NULL;
3756 rq->elv.priv[1] = NULL;
3757 3730
3758 cfq_put_queue(cfqq); 3731 cfq_put_queue(cfqq);
3759 } 3732 }
3760} 3733}
3761 3734
3762static struct cfq_queue * 3735static struct cfq_queue *
3763cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic, 3736cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic,
3764 struct cfq_queue *cfqq) 3737 struct cfq_queue *cfqq)
3765{ 3738{
3766 cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq); 3739 cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
@@ -3775,7 +3748,7 @@ cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic,
3775 * was the last process referring to said cfqq. 3748 * was the last process referring to said cfqq.
3776 */ 3749 */
3777static struct cfq_queue * 3750static struct cfq_queue *
3778split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq) 3751split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq)
3779{ 3752{
3780 if (cfqq_process_refs(cfqq) == 1) { 3753 if (cfqq_process_refs(cfqq) == 1) {
3781 cfqq->pid = current->pid; 3754 cfqq->pid = current->pid;
@@ -3795,25 +3768,28 @@ split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq)
3795 * Allocate cfq data structures associated with this request. 3768 * Allocate cfq data structures associated with this request.
3796 */ 3769 */
3797static int 3770static int
3798cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, 3771cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
3799 gfp_t gfp_mask)
3800{ 3772{
3801 struct cfq_data *cfqd = q->elevator->elevator_data; 3773 struct cfq_data *cfqd = q->elevator->elevator_data;
3802 struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq); 3774 struct cfq_io_context *cic;
3803 const int rw = rq_data_dir(rq); 3775 const int rw = rq_data_dir(rq);
3804 const bool is_sync = rq_is_sync(rq); 3776 const bool is_sync = rq_is_sync(rq);
3805 struct cfq_queue *cfqq; 3777 struct cfq_queue *cfqq;
3778 unsigned long flags;
3806 3779
3807 might_sleep_if(gfp_mask & __GFP_WAIT); 3780 might_sleep_if(gfp_mask & __GFP_WAIT);
3808 3781
3809 spin_lock_irq(q->queue_lock); 3782 cic = cfq_get_io_context(cfqd, gfp_mask);
3783
3784 spin_lock_irqsave(q->queue_lock, flags);
3785
3786 if (!cic)
3787 goto queue_fail;
3810 3788
3811 check_ioprio_changed(cic, bio);
3812 check_blkcg_changed(cic, bio);
3813new_queue: 3789new_queue:
3814 cfqq = cic_to_cfqq(cic, is_sync); 3790 cfqq = cic_to_cfqq(cic, is_sync);
3815 if (!cfqq || cfqq == &cfqd->oom_cfqq) { 3791 if (!cfqq || cfqq == &cfqd->oom_cfqq) {
3816 cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask); 3792 cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask);
3817 cic_set_cfqq(cic, cfqq, is_sync); 3793 cic_set_cfqq(cic, cfqq, is_sync);
3818 } else { 3794 } else {
3819 /* 3795 /*
@@ -3839,11 +3815,17 @@ new_queue:
3839 cfqq->allocated[rw]++; 3815 cfqq->allocated[rw]++;
3840 3816
3841 cfqq->ref++; 3817 cfqq->ref++;
3842 cfqg_get(cfqq->cfqg); 3818 rq->elevator_private[0] = cic;
3843 rq->elv.priv[0] = cfqq; 3819 rq->elevator_private[1] = cfqq;
3844 rq->elv.priv[1] = cfqq->cfqg; 3820 rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg);
3845 spin_unlock_irq(q->queue_lock); 3821 spin_unlock_irqrestore(q->queue_lock, flags);
3846 return 0; 3822 return 0;
3823
3824queue_fail:
3825 cfq_schedule_dispatch(cfqd);
3826 spin_unlock_irqrestore(q->queue_lock, flags);
3827 cfq_log(cfqd, "set_request fail");
3828 return 1;
3847} 3829}
3848 3830
3849static void cfq_kick_queue(struct work_struct *work) 3831static void cfq_kick_queue(struct work_struct *work)
@@ -3938,6 +3920,7 @@ static void cfq_exit_queue(struct elevator_queue *e)
3938{ 3920{
3939 struct cfq_data *cfqd = e->elevator_data; 3921 struct cfq_data *cfqd = e->elevator_data;
3940 struct request_queue *q = cfqd->queue; 3922 struct request_queue *q = cfqd->queue;
3923 bool wait = false;
3941 3924
3942 cfq_shutdown_timer_wq(cfqd); 3925 cfq_shutdown_timer_wq(cfqd);
3943 3926
@@ -3946,54 +3929,139 @@ static void cfq_exit_queue(struct elevator_queue *e)
3946 if (cfqd->active_queue) 3929 if (cfqd->active_queue)
3947 __cfq_slice_expired(cfqd, cfqd->active_queue, 0); 3930 __cfq_slice_expired(cfqd, cfqd->active_queue, 0);
3948 3931
3932 while (!list_empty(&cfqd->cic_list)) {
3933 struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
3934 struct cfq_io_context,
3935 queue_list);
3936
3937 __cfq_exit_single_io_context(cfqd, cic);
3938 }
3939
3949 cfq_put_async_queues(cfqd); 3940 cfq_put_async_queues(cfqd);
3941 cfq_release_cfq_groups(cfqd);
3942
3943 /*
3944 * If there are groups which we could not unlink from blkcg list,
3945 * wait for a rcu period for them to be freed.
3946 */
3947 if (cfqd->nr_blkcg_linked_grps)
3948 wait = true;
3950 3949
3951 spin_unlock_irq(q->queue_lock); 3950 spin_unlock_irq(q->queue_lock);
3952 3951
3953 cfq_shutdown_timer_wq(cfqd); 3952 cfq_shutdown_timer_wq(cfqd);
3954 3953
3954 spin_lock(&cic_index_lock);
3955 ida_remove(&cic_index_ida, cfqd->cic_index);
3956 spin_unlock(&cic_index_lock);
3957
3958 /*
3959 * Wait for cfqg->blkg->key accessors to exit their grace periods.
3960 * Do this wait only if there are other unlinked groups out
3961 * there. This can happen if cgroup deletion path claimed the
3962 * responsibility of cleaning up a group before queue cleanup code
3963 * get to the group.
3964 *
3965 * Do not call synchronize_rcu() unconditionally as there are drivers
3966 * which create/delete request queue hundreds of times during scan/boot
3967 * and synchronize_rcu() can take significant time and slow down boot.
3968 */
3969 if (wait)
3970 synchronize_rcu();
3971
3955#ifdef CONFIG_CFQ_GROUP_IOSCHED 3972#ifdef CONFIG_CFQ_GROUP_IOSCHED
3956 blkcg_deactivate_policy(q, &blkcg_policy_cfq); 3973 /* Free up per cpu stats for root group */
3957#else 3974 free_percpu(cfqd->root_group.blkg.stats_cpu);
3958 kfree(cfqd->root_group);
3959#endif 3975#endif
3960 kfree(cfqd); 3976 kfree(cfqd);
3961} 3977}
3962 3978
3963static int cfq_init_queue(struct request_queue *q) 3979static int cfq_alloc_cic_index(void)
3980{
3981 int index, error;
3982
3983 do {
3984 if (!ida_pre_get(&cic_index_ida, GFP_KERNEL))
3985 return -ENOMEM;
3986
3987 spin_lock(&cic_index_lock);
3988 error = ida_get_new(&cic_index_ida, &index);
3989 spin_unlock(&cic_index_lock);
3990 if (error && error != -EAGAIN)
3991 return error;
3992 } while (error);
3993
3994 return index;
3995}
3996
3997static void *cfq_init_queue(struct request_queue *q)
3964{ 3998{
3965 struct cfq_data *cfqd; 3999 struct cfq_data *cfqd;
3966 struct blkcg_gq *blkg __maybe_unused; 4000 int i, j;
3967 int i, ret; 4001 struct cfq_group *cfqg;
4002 struct cfq_rb_root *st;
4003
4004 i = cfq_alloc_cic_index();
4005 if (i < 0)
4006 return NULL;
3968 4007
3969 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); 4008 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
3970 if (!cfqd) 4009 if (!cfqd) {
3971 return -ENOMEM; 4010 spin_lock(&cic_index_lock);
4011 ida_remove(&cic_index_ida, i);
4012 spin_unlock(&cic_index_lock);
4013 return NULL;
4014 }
3972 4015
3973 cfqd->queue = q; 4016 /*
3974 q->elevator->elevator_data = cfqd; 4017 * Don't need take queue_lock in the routine, since we are
4018 * initializing the ioscheduler, and nobody is using cfqd
4019 */
4020 cfqd->cic_index = i;
3975 4021
3976 /* Init root service tree */ 4022 /* Init root service tree */
3977 cfqd->grp_service_tree = CFQ_RB_ROOT; 4023 cfqd->grp_service_tree = CFQ_RB_ROOT;
3978 4024
3979 /* Init root group and prefer root group over other groups by default */ 4025 /* Init root group */
4026 cfqg = &cfqd->root_group;
4027 for_each_cfqg_st(cfqg, i, j, st)
4028 *st = CFQ_RB_ROOT;
4029 RB_CLEAR_NODE(&cfqg->rb_node);
4030
4031 /* Give preference to root group over other groups */
4032 cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT;
4033
3980#ifdef CONFIG_CFQ_GROUP_IOSCHED 4034#ifdef CONFIG_CFQ_GROUP_IOSCHED
3981 ret = blkcg_activate_policy(q, &blkcg_policy_cfq); 4035 /*
3982 if (ret) 4036 * Set root group reference to 2. One reference will be dropped when
3983 goto out_free; 4037 * all groups on cfqd->cfqg_list are being deleted during queue exit.
4038 * Other reference will remain there as we don't want to delete this
4039 * group as it is statically allocated and gets destroyed when
4040 * throtl_data goes away.
4041 */
4042 cfqg->ref = 2;
3984 4043
3985 cfqd->root_group = blkg_to_cfqg(q->root_blkg); 4044 if (blkio_alloc_blkg_stats(&cfqg->blkg)) {
3986#else 4045 kfree(cfqg);
3987 ret = -ENOMEM;
3988 cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group),
3989 GFP_KERNEL, cfqd->queue->node);
3990 if (!cfqd->root_group)
3991 goto out_free;
3992 4046
3993 cfq_init_cfqg_base(cfqd->root_group); 4047 spin_lock(&cic_index_lock);
3994#endif 4048 ida_remove(&cic_index_ida, cfqd->cic_index);
3995 cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT; 4049 spin_unlock(&cic_index_lock);
4050
4051 kfree(cfqd);
4052 return NULL;
4053 }
4054
4055 rcu_read_lock();
4056
4057 cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg,
4058 (void *)cfqd, 0);
4059 rcu_read_unlock();
4060 cfqd->nr_blkcg_linked_grps++;
3996 4061
4062 /* Add group on cfqd->cfqg_list */
4063 hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
4064#endif
3997 /* 4065 /*
3998 * Not strictly needed (since RB_ROOT just clears the node and we 4066 * Not strictly needed (since RB_ROOT just clears the node and we
3999 * zeroed cfqd on alloc), but better be safe in case someone decides 4067 * zeroed cfqd on alloc), but better be safe in case someone decides
@@ -4005,17 +4073,15 @@ static int cfq_init_queue(struct request_queue *q)
4005 /* 4073 /*
4006 * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues. 4074 * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues.
4007 * Grab a permanent reference to it, so that the normal code flow 4075 * Grab a permanent reference to it, so that the normal code flow
4008 * will not attempt to free it. oom_cfqq is linked to root_group 4076 * will not attempt to free it.
4009 * but shouldn't hold a reference as it'll never be unlinked. Lose
4010 * the reference from linking right away.
4011 */ 4077 */
4012 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); 4078 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
4013 cfqd->oom_cfqq.ref++; 4079 cfqd->oom_cfqq.ref++;
4080 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
4014 4081
4015 spin_lock_irq(q->queue_lock); 4082 INIT_LIST_HEAD(&cfqd->cic_list);
4016 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group); 4083
4017 cfqg_put(cfqd->root_group); 4084 cfqd->queue = q;
4018 spin_unlock_irq(q->queue_lock);
4019 4085
4020 init_timer(&cfqd->idle_slice_timer); 4086 init_timer(&cfqd->idle_slice_timer);
4021 cfqd->idle_slice_timer.function = cfq_idle_slice_timer; 4087 cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
@@ -4030,7 +4096,6 @@ static int cfq_init_queue(struct request_queue *q)
4030 cfqd->cfq_back_penalty = cfq_back_penalty; 4096 cfqd->cfq_back_penalty = cfq_back_penalty;
4031 cfqd->cfq_slice[0] = cfq_slice_async; 4097 cfqd->cfq_slice[0] = cfq_slice_async;
4032 cfqd->cfq_slice[1] = cfq_slice_sync; 4098 cfqd->cfq_slice[1] = cfq_slice_sync;
4033 cfqd->cfq_target_latency = cfq_target_latency;
4034 cfqd->cfq_slice_async_rq = cfq_slice_async_rq; 4099 cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
4035 cfqd->cfq_slice_idle = cfq_slice_idle; 4100 cfqd->cfq_slice_idle = cfq_slice_idle;
4036 cfqd->cfq_group_idle = cfq_group_idle; 4101 cfqd->cfq_group_idle = cfq_group_idle;
@@ -4041,11 +4106,35 @@ static int cfq_init_queue(struct request_queue *q)
4041 * second, in order to have larger depth for async operations. 4106 * second, in order to have larger depth for async operations.
4042 */ 4107 */
4043 cfqd->last_delayed_sync = jiffies - HZ; 4108 cfqd->last_delayed_sync = jiffies - HZ;
4044 return 0; 4109 return cfqd;
4110}
4045 4111
4046out_free: 4112static void cfq_slab_kill(void)
4047 kfree(cfqd); 4113{
4048 return ret; 4114 /*
4115 * Caller already ensured that pending RCU callbacks are completed,
4116 * so we should have no busy allocations at this point.
4117 */
4118 if (cfq_pool)
4119 kmem_cache_destroy(cfq_pool);
4120 if (cfq_ioc_pool)
4121 kmem_cache_destroy(cfq_ioc_pool);
4122}
4123
4124static int __init cfq_slab_setup(void)
4125{
4126 cfq_pool = KMEM_CACHE(cfq_queue, 0);
4127 if (!cfq_pool)
4128 goto fail;
4129
4130 cfq_ioc_pool = KMEM_CACHE(cfq_io_context, 0);
4131 if (!cfq_ioc_pool)
4132 goto fail;
4133
4134 return 0;
4135fail:
4136 cfq_slab_kill();
4137 return -ENOMEM;
4049} 4138}
4050 4139
4051/* 4140/*
@@ -4086,7 +4175,6 @@ SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
4086SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); 4175SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
4087SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); 4176SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
4088SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); 4177SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
4089SHOW_FUNCTION(cfq_target_latency_show, cfqd->cfq_target_latency, 1);
4090#undef SHOW_FUNCTION 4178#undef SHOW_FUNCTION
4091 4179
4092#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ 4180#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
@@ -4120,7 +4208,6 @@ STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
4120STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, 4208STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
4121 UINT_MAX, 0); 4209 UINT_MAX, 0);
4122STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); 4210STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
4123STORE_FUNCTION(cfq_target_latency_store, &cfqd->cfq_target_latency, 1, UINT_MAX, 1);
4124#undef STORE_FUNCTION 4211#undef STORE_FUNCTION
4125 4212
4126#define CFQ_ATTR(name) \ 4213#define CFQ_ATTR(name) \
@@ -4138,7 +4225,6 @@ static struct elv_fs_entry cfq_attrs[] = {
4138 CFQ_ATTR(slice_idle), 4225 CFQ_ATTR(slice_idle),
4139 CFQ_ATTR(group_idle), 4226 CFQ_ATTR(group_idle),
4140 CFQ_ATTR(low_latency), 4227 CFQ_ATTR(low_latency),
4141 CFQ_ATTR(target_latency),
4142 __ATTR_NULL 4228 __ATTR_NULL
4143}; 4229};
4144 4230
@@ -4156,35 +4242,32 @@ static struct elevator_type iosched_cfq = {
4156 .elevator_completed_req_fn = cfq_completed_request, 4242 .elevator_completed_req_fn = cfq_completed_request,
4157 .elevator_former_req_fn = elv_rb_former_request, 4243 .elevator_former_req_fn = elv_rb_former_request,
4158 .elevator_latter_req_fn = elv_rb_latter_request, 4244 .elevator_latter_req_fn = elv_rb_latter_request,
4159 .elevator_init_icq_fn = cfq_init_icq,
4160 .elevator_exit_icq_fn = cfq_exit_icq,
4161 .elevator_set_req_fn = cfq_set_request, 4245 .elevator_set_req_fn = cfq_set_request,
4162 .elevator_put_req_fn = cfq_put_request, 4246 .elevator_put_req_fn = cfq_put_request,
4163 .elevator_may_queue_fn = cfq_may_queue, 4247 .elevator_may_queue_fn = cfq_may_queue,
4164 .elevator_init_fn = cfq_init_queue, 4248 .elevator_init_fn = cfq_init_queue,
4165 .elevator_exit_fn = cfq_exit_queue, 4249 .elevator_exit_fn = cfq_exit_queue,
4250 .trim = cfq_free_io_context,
4166 }, 4251 },
4167 .icq_size = sizeof(struct cfq_io_cq),
4168 .icq_align = __alignof__(struct cfq_io_cq),
4169 .elevator_attrs = cfq_attrs, 4252 .elevator_attrs = cfq_attrs,
4170 .elevator_name = "cfq", 4253 .elevator_name = "cfq",
4171 .elevator_owner = THIS_MODULE, 4254 .elevator_owner = THIS_MODULE,
4172}; 4255};
4173 4256
4174#ifdef CONFIG_CFQ_GROUP_IOSCHED 4257#ifdef CONFIG_CFQ_GROUP_IOSCHED
4175static struct blkcg_policy blkcg_policy_cfq = { 4258static struct blkio_policy_type blkio_policy_cfq = {
4176 .pd_size = sizeof(struct cfq_group), 4259 .ops = {
4177 .cftypes = cfq_blkcg_files, 4260 .blkio_unlink_group_fn = cfq_unlink_blkio_group,
4178 4261 .blkio_update_group_weight_fn = cfq_update_blkio_group_weight,
4179 .pd_init_fn = cfq_pd_init, 4262 },
4180 .pd_reset_stats_fn = cfq_pd_reset_stats, 4263 .plid = BLKIO_POLICY_PROP,
4181}; 4264};
4265#else
4266static struct blkio_policy_type blkio_policy_cfq;
4182#endif 4267#endif
4183 4268
4184static int __init cfq_init(void) 4269static int __init cfq_init(void)
4185{ 4270{
4186 int ret;
4187
4188 /* 4271 /*
4189 * could be 0 on HZ < 1000 setups 4272 * could be 0 on HZ < 1000 setups
4190 */ 4273 */
@@ -4196,41 +4279,35 @@ static int __init cfq_init(void)
4196#ifdef CONFIG_CFQ_GROUP_IOSCHED 4279#ifdef CONFIG_CFQ_GROUP_IOSCHED
4197 if (!cfq_group_idle) 4280 if (!cfq_group_idle)
4198 cfq_group_idle = 1; 4281 cfq_group_idle = 1;
4199
4200 ret = blkcg_policy_register(&blkcg_policy_cfq);
4201 if (ret)
4202 return ret;
4203#else 4282#else
4204 cfq_group_idle = 0; 4283 cfq_group_idle = 0;
4205#endif 4284#endif
4285 if (cfq_slab_setup())
4286 return -ENOMEM;
4206 4287
4207 ret = -ENOMEM; 4288 elv_register(&iosched_cfq);
4208 cfq_pool = KMEM_CACHE(cfq_queue, 0); 4289 blkio_policy_register(&blkio_policy_cfq);
4209 if (!cfq_pool)
4210 goto err_pol_unreg;
4211
4212 ret = elv_register(&iosched_cfq);
4213 if (ret)
4214 goto err_free_pool;
4215 4290
4216 return 0; 4291 return 0;
4217
4218err_free_pool:
4219 kmem_cache_destroy(cfq_pool);
4220err_pol_unreg:
4221#ifdef CONFIG_CFQ_GROUP_IOSCHED
4222 blkcg_policy_unregister(&blkcg_policy_cfq);
4223#endif
4224 return ret;
4225} 4292}
4226 4293
4227static void __exit cfq_exit(void) 4294static void __exit cfq_exit(void)
4228{ 4295{
4229#ifdef CONFIG_CFQ_GROUP_IOSCHED 4296 DECLARE_COMPLETION_ONSTACK(all_gone);
4230 blkcg_policy_unregister(&blkcg_policy_cfq); 4297 blkio_policy_unregister(&blkio_policy_cfq);
4231#endif
4232 elv_unregister(&iosched_cfq); 4298 elv_unregister(&iosched_cfq);
4233 kmem_cache_destroy(cfq_pool); 4299 ioc_gone = &all_gone;
4300 /* ioc_gone's update must be visible before reading ioc_count */
4301 smp_wmb();
4302
4303 /*
4304 * this also protects us from entering cfq_slab_kill() with
4305 * pending RCU callbacks
4306 */
4307 if (elv_ioc_count_read(cfq_ioc_count))
4308 wait_for_completion(&all_gone);
4309 ida_destroy(&cic_index_ida);
4310 cfq_slab_kill();
4234} 4311}
4235 4312
4236module_init(cfq_init); 4313module_init(cfq_init);
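Editor's note: the cfq-iosched.c hunks above restore the old per-process cfq_io_context machinery, including cfq_cic_lookup(), which consults a one-entry last-hit cache (ioc->ioc_data) before walking the radix tree, as its own comment says. Below is a minimal userspace sketch of that lookup-cache pattern, assuming nothing beyond standard C: it is not kernel code, every name in it is made up, and a sorted array with bsearch() stands in for the radix tree.

/*
 * Editor's sketch (not kernel code): a one-entry "last hit" cache in
 * front of a slower lookup, the pattern the restored cfq_cic_lookup()
 * uses via ioc->ioc_data.  All names here are hypothetical.
 */
#include <stdio.h>
#include <stdlib.h>

struct entry {
        int key;
        const char *val;
};

struct table {
        struct entry *slots;    /* sorted by key */
        size_t nr;
        struct entry *last_hit; /* one-entry cache, may be NULL */
};

static int cmp_key(const void *a, const void *b)
{
        const int *key = a;
        const struct entry *e = b;

        return (*key > e->key) - (*key < e->key);
}

static struct entry *table_lookup(struct table *t, int key)
{
        struct entry *e;

        /* fast path: the last lookup hit the same key */
        if (t->last_hit && t->last_hit->key == key)
                return t->last_hit;

        /* slow path: binary search stands in for the radix-tree walk */
        e = bsearch(&key, t->slots, t->nr, sizeof(*t->slots), cmp_key);
        if (e)
                t->last_hit = e;        /* remember for next time */
        return e;
}

int main(void)
{
        struct entry slots[] = { {1, "one"}, {3, "three"}, {7, "seven"} };
        struct table t = { slots, 3, NULL };

        printf("%s\n", table_lookup(&t, 3)->val);       /* slow path */
        printf("%s\n", table_lookup(&t, 3)->val);       /* cached fast path */
        return 0;
}

The point of the cache is simply that repeated lookups for the same key skip the slower search entirely, which is the common case when one process keeps issuing I/O to the same device.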
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index 7c668c8a6f9..7b725020823 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -719,9 +719,6 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
719 case BLKSECTGET: 719 case BLKSECTGET:
720 return compat_put_ushort(arg, 720 return compat_put_ushort(arg,
721 queue_max_sectors(bdev_get_queue(bdev))); 721 queue_max_sectors(bdev_get_queue(bdev)));
722 case BLKROTATIONAL:
723 return compat_put_ushort(arg,
724 !blk_queue_nonrot(bdev_get_queue(bdev)));
725 case BLKRASET: /* compatible, but no compat_ptr (!) */ 722 case BLKRASET: /* compatible, but no compat_ptr (!) */
726 case BLKFRASET: 723 case BLKFRASET:
727 if (!capable(CAP_SYS_ADMIN)) 724 if (!capable(CAP_SYS_ADMIN))
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index 90037b5eb17..c644137d9cd 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -230,7 +230,7 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
230 /* 230 /*
231 * rq is expired! 231 * rq is expired!
232 */ 232 */
233 if (time_after_eq(jiffies, rq_fifo_time(rq))) 233 if (time_after(jiffies, rq_fifo_time(rq)))
234 return 1; 234 return 1;
235 235
236 return 0; 236 return 0;
@@ -337,13 +337,13 @@ static void deadline_exit_queue(struct elevator_queue *e)
337/* 337/*
338 * initialize elevator private data (deadline_data). 338 * initialize elevator private data (deadline_data).
339 */ 339 */
340static int deadline_init_queue(struct request_queue *q) 340static void *deadline_init_queue(struct request_queue *q)
341{ 341{
342 struct deadline_data *dd; 342 struct deadline_data *dd;
343 343
344 dd = kmalloc_node(sizeof(*dd), GFP_KERNEL | __GFP_ZERO, q->node); 344 dd = kmalloc_node(sizeof(*dd), GFP_KERNEL | __GFP_ZERO, q->node);
345 if (!dd) 345 if (!dd)
346 return -ENOMEM; 346 return NULL;
347 347
348 INIT_LIST_HEAD(&dd->fifo_list[READ]); 348 INIT_LIST_HEAD(&dd->fifo_list[READ]);
349 INIT_LIST_HEAD(&dd->fifo_list[WRITE]); 349 INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
@@ -354,9 +354,7 @@ static int deadline_init_queue(struct request_queue *q)
354 dd->writes_starved = writes_starved; 354 dd->writes_starved = writes_starved;
355 dd->front_merges = 1; 355 dd->front_merges = 1;
356 dd->fifo_batch = fifo_batch; 356 dd->fifo_batch = fifo_batch;
357 357 return dd;
358 q->elevator->elevator_data = dd;
359 return 0;
360} 358}
361 359
362/* 360/*
@@ -450,7 +448,9 @@ static struct elevator_type iosched_deadline = {
450 448
451static int __init deadline_init(void) 449static int __init deadline_init(void)
452{ 450{
453 return elv_register(&iosched_deadline); 451 elv_register(&iosched_deadline);
452
453 return 0;
454} 454}
455 455
456static void __exit deadline_exit(void) 456static void __exit deadline_exit(void)
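Editor's note: the deadline-iosched.c hunk above swaps time_after_eq() for time_after() in the FIFO expiry check. Both macros compare jiffies-style free-running counters in a wraparound-safe way by testing the sign of their difference rather than comparing magnitudes. A small userspace sketch of that idea follows; counter_after() is a hypothetical name and the real kernel macros differ in detail.

/*
 * Editor's sketch (not kernel code): wraparound-safe "is a later than b?"
 * for free-running counters, the idea behind time_after()/time_after_eq().
 */
#include <stdio.h>
#include <limits.h>

/* true if counter value a is later than b, even across wraparound;
 * relies on the usual two's-complement conversion of the unsigned
 * difference, the same trick the kernel macros depend on */
static int counter_after(unsigned long a, unsigned long b)
{
        return (long)(b - a) < 0;
}

int main(void)
{
        unsigned long before_wrap = ULONG_MAX - 5;
        unsigned long after_wrap = 10;  /* counter has wrapped past 0 */

        /* a naive magnitude compare gets this wrong; the signed
         * difference does not */
        printf("naive:  %d\n", after_wrap > before_wrap);               /* 0 */
        printf("robust: %d\n", counter_after(after_wrap, before_wrap)); /* 1 */
        return 0;
}

With a plain "a > b" the expiry test would misfire for a full counter period after jiffies wraps; the signed-difference form keeps working as long as the two timestamps are less than half the counter range apart.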
diff --git a/block/elevator.c b/block/elevator.c
index 9edba1b8323..a3b64bc71d8 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -31,6 +31,7 @@
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/init.h> 32#include <linux/init.h>
33#include <linux/compiler.h> 33#include <linux/compiler.h>
34#include <linux/delay.h>
34#include <linux/blktrace_api.h> 35#include <linux/blktrace_api.h>
35#include <linux/hash.h> 36#include <linux/hash.h>
36#include <linux/uaccess.h> 37#include <linux/uaccess.h>
@@ -38,7 +39,6 @@
38#include <trace/events/block.h> 39#include <trace/events/block.h>
39 40
40#include "blk.h" 41#include "blk.h"
41#include "blk-cgroup.h"
42 42
43static DEFINE_SPINLOCK(elv_list_lock); 43static DEFINE_SPINLOCK(elv_list_lock);
44static LIST_HEAD(elv_list); 44static LIST_HEAD(elv_list);
@@ -62,8 +62,8 @@ static int elv_iosched_allow_merge(struct request *rq, struct bio *bio)
62 struct request_queue *q = rq->q; 62 struct request_queue *q = rq->q;
63 struct elevator_queue *e = q->elevator; 63 struct elevator_queue *e = q->elevator;
64 64
65 if (e->type->ops.elevator_allow_merge_fn) 65 if (e->ops->elevator_allow_merge_fn)
66 return e->type->ops.elevator_allow_merge_fn(q, rq, bio); 66 return e->ops->elevator_allow_merge_fn(q, rq, bio);
67 67
68 return 1; 68 return 1;
69} 69}
@@ -71,9 +71,39 @@ static int elv_iosched_allow_merge(struct request *rq, struct bio *bio)
71/* 71/*
72 * can we safely merge with this request? 72 * can we safely merge with this request?
73 */ 73 */
74bool elv_rq_merge_ok(struct request *rq, struct bio *bio) 74int elv_rq_merge_ok(struct request *rq, struct bio *bio)
75{ 75{
76 if (!blk_rq_merge_ok(rq, bio)) 76 if (!rq_mergeable(rq))
77 return 0;
78
79 /*
80 * Don't merge file system requests and discard requests
81 */
82 if ((bio->bi_rw & REQ_DISCARD) != (rq->bio->bi_rw & REQ_DISCARD))
83 return 0;
84
85 /*
86 * Don't merge discard requests and secure discard requests
87 */
88 if ((bio->bi_rw & REQ_SECURE) != (rq->bio->bi_rw & REQ_SECURE))
89 return 0;
90
91 /*
92 * different data direction or already started, don't merge
93 */
94 if (bio_data_dir(bio) != rq_data_dir(rq))
95 return 0;
96
97 /*
98 * must be same device and not a special request
99 */
100 if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special)
101 return 0;
102
103 /*
104 * only merge integrity protected bio into ditto rq
105 */
106 if (bio_integrity(bio) != blk_integrity_rq(rq))
77 return 0; 107 return 0;
78 108
79 if (!elv_iosched_allow_merge(rq, bio)) 109 if (!elv_iosched_allow_merge(rq, bio))
@@ -83,6 +113,23 @@ bool elv_rq_merge_ok(struct request *rq, struct bio *bio)
83} 113}
84EXPORT_SYMBOL(elv_rq_merge_ok); 114EXPORT_SYMBOL(elv_rq_merge_ok);
85 115
116int elv_try_merge(struct request *__rq, struct bio *bio)
117{
118 int ret = ELEVATOR_NO_MERGE;
119
120 /*
121 * we can merge and sequence is ok, check if it's possible
122 */
123 if (elv_rq_merge_ok(__rq, bio)) {
124 if (blk_rq_pos(__rq) + blk_rq_sectors(__rq) == bio->bi_sector)
125 ret = ELEVATOR_BACK_MERGE;
126 else if (blk_rq_pos(__rq) - bio_sectors(bio) == bio->bi_sector)
127 ret = ELEVATOR_FRONT_MERGE;
128 }
129
130 return ret;
131}
132
86static struct elevator_type *elevator_find(const char *name) 133static struct elevator_type *elevator_find(const char *name)
87{ 134{
88 struct elevator_type *e; 135 struct elevator_type *e;
@@ -122,7 +169,20 @@ static struct elevator_type *elevator_get(const char *name)
122 return e; 169 return e;
123} 170}
124 171
125static char chosen_elevator[ELV_NAME_MAX]; 172static void *elevator_init_queue(struct request_queue *q,
173 struct elevator_queue *eq)
174{
175 return eq->ops->elevator_init_fn(q);
176}
177
178static void elevator_attach(struct request_queue *q, struct elevator_queue *eq,
179 void *data)
180{
181 q->elevator = eq;
182 eq->elevator_data = data;
183}
184
185static char chosen_elevator[16];
126 186
127static int __init elevator_setup(char *str) 187static int __init elevator_setup(char *str)
128{ 188{
@@ -148,7 +208,8 @@ static struct elevator_queue *elevator_alloc(struct request_queue *q,
148 if (unlikely(!eq)) 208 if (unlikely(!eq))
149 goto err; 209 goto err;
150 210
151 eq->type = e; 211 eq->ops = &e->ops;
212 eq->elevator_type = e;
152 kobject_init(&eq->kobj, &elv_ktype); 213 kobject_init(&eq->kobj, &elv_ktype);
153 mutex_init(&eq->sysfs_lock); 214 mutex_init(&eq->sysfs_lock);
154 215
@@ -172,7 +233,7 @@ static void elevator_release(struct kobject *kobj)
172 struct elevator_queue *e; 233 struct elevator_queue *e;
173 234
174 e = container_of(kobj, struct elevator_queue, kobj); 235 e = container_of(kobj, struct elevator_queue, kobj);
175 elevator_put(e->type); 236 elevator_put(e->elevator_type);
176 kfree(e->hash); 237 kfree(e->hash);
177 kfree(e); 238 kfree(e);
178} 239}
@@ -180,7 +241,8 @@ static void elevator_release(struct kobject *kobj)
180int elevator_init(struct request_queue *q, char *name) 241int elevator_init(struct request_queue *q, char *name)
181{ 242{
182 struct elevator_type *e = NULL; 243 struct elevator_type *e = NULL;
183 int err; 244 struct elevator_queue *eq;
245 void *data;
184 246
185 if (unlikely(q->elevator)) 247 if (unlikely(q->elevator))
186 return 0; 248 return 0;
@@ -213,16 +275,17 @@ int elevator_init(struct request_queue *q, char *name)
213 } 275 }
214 } 276 }
215 277
216 q->elevator = elevator_alloc(q, e); 278 eq = elevator_alloc(q, e);
217 if (!q->elevator) 279 if (!eq)
218 return -ENOMEM; 280 return -ENOMEM;
219 281
220 err = e->ops.elevator_init_fn(q); 282 data = elevator_init_queue(q, eq);
221 if (err) { 283 if (!data) {
222 kobject_put(&q->elevator->kobj); 284 kobject_put(&eq->kobj);
223 return err; 285 return -ENOMEM;
224 } 286 }
225 287
288 elevator_attach(q, eq, data);
226 return 0; 289 return 0;
227} 290}
228EXPORT_SYMBOL(elevator_init); 291EXPORT_SYMBOL(elevator_init);
@@ -230,8 +293,9 @@ EXPORT_SYMBOL(elevator_init);
230void elevator_exit(struct elevator_queue *e) 293void elevator_exit(struct elevator_queue *e)
231{ 294{
232 mutex_lock(&e->sysfs_lock); 295 mutex_lock(&e->sysfs_lock);
233 if (e->type->ops.elevator_exit_fn) 296 if (e->ops->elevator_exit_fn)
234 e->type->ops.elevator_exit_fn(e); 297 e->ops->elevator_exit_fn(e);
298 e->ops = NULL;
235 mutex_unlock(&e->sysfs_lock); 299 mutex_unlock(&e->sysfs_lock);
236 300
237 kobject_put(&e->kobj); 301 kobject_put(&e->kobj);
@@ -421,8 +485,8 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
421 /* 485 /*
422 * First try one-hit cache. 486 * First try one-hit cache.
423 */ 487 */
424 if (q->last_merge && elv_rq_merge_ok(q->last_merge, bio)) { 488 if (q->last_merge) {
425 ret = blk_try_merge(q->last_merge, bio); 489 ret = elv_try_merge(q->last_merge, bio);
426 if (ret != ELEVATOR_NO_MERGE) { 490 if (ret != ELEVATOR_NO_MERGE) {
427 *req = q->last_merge; 491 *req = q->last_merge;
428 return ret; 492 return ret;
@@ -441,8 +505,8 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
441 return ELEVATOR_BACK_MERGE; 505 return ELEVATOR_BACK_MERGE;
442 } 506 }
443 507
444 if (e->type->ops.elevator_merge_fn) 508 if (e->ops->elevator_merge_fn)
445 return e->type->ops.elevator_merge_fn(q, req, bio); 509 return e->ops->elevator_merge_fn(q, req, bio);
446 510
447 return ELEVATOR_NO_MERGE; 511 return ELEVATOR_NO_MERGE;
448} 512}
@@ -458,7 +522,6 @@ static bool elv_attempt_insert_merge(struct request_queue *q,
458 struct request *rq) 522 struct request *rq)
459{ 523{
460 struct request *__rq; 524 struct request *__rq;
461 bool ret;
462 525
463 if (blk_queue_nomerges(q)) 526 if (blk_queue_nomerges(q))
464 return false; 527 return false;
@@ -472,29 +535,22 @@ static bool elv_attempt_insert_merge(struct request_queue *q,
472 if (blk_queue_noxmerges(q)) 535 if (blk_queue_noxmerges(q))
473 return false; 536 return false;
474 537
475 ret = false;
476 /* 538 /*
477 * See if our hash lookup can find a potential backmerge. 539 * See if our hash lookup can find a potential backmerge.
478 */ 540 */
479 while (1) { 541 __rq = elv_rqhash_find(q, blk_rq_pos(rq));
480 __rq = elv_rqhash_find(q, blk_rq_pos(rq)); 542 if (__rq && blk_attempt_req_merge(q, __rq, rq))
481 if (!__rq || !blk_attempt_req_merge(q, __rq, rq)) 543 return true;
482 break;
483
484 /* The merged request could be merged with others, try again */
485 ret = true;
486 rq = __rq;
487 }
488 544
489 return ret; 545 return false;
490} 546}
491 547
492void elv_merged_request(struct request_queue *q, struct request *rq, int type) 548void elv_merged_request(struct request_queue *q, struct request *rq, int type)
493{ 549{
494 struct elevator_queue *e = q->elevator; 550 struct elevator_queue *e = q->elevator;
495 551
496 if (e->type->ops.elevator_merged_fn) 552 if (e->ops->elevator_merged_fn)
497 e->type->ops.elevator_merged_fn(q, rq, type); 553 e->ops->elevator_merged_fn(q, rq, type);
498 554
499 if (type == ELEVATOR_BACK_MERGE) 555 if (type == ELEVATOR_BACK_MERGE)
500 elv_rqhash_reposition(q, rq); 556 elv_rqhash_reposition(q, rq);
@@ -508,8 +564,8 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
508 struct elevator_queue *e = q->elevator; 564 struct elevator_queue *e = q->elevator;
509 const int next_sorted = next->cmd_flags & REQ_SORTED; 565 const int next_sorted = next->cmd_flags & REQ_SORTED;
510 566
511 if (next_sorted && e->type->ops.elevator_merge_req_fn) 567 if (next_sorted && e->ops->elevator_merge_req_fn)
512 e->type->ops.elevator_merge_req_fn(q, rq, next); 568 e->ops->elevator_merge_req_fn(q, rq, next);
513 569
514 elv_rqhash_reposition(q, rq); 570 elv_rqhash_reposition(q, rq);
515 571
@@ -526,8 +582,8 @@ void elv_bio_merged(struct request_queue *q, struct request *rq,
526{ 582{
527 struct elevator_queue *e = q->elevator; 583 struct elevator_queue *e = q->elevator;
528 584
529 if (e->type->ops.elevator_bio_merged_fn) 585 if (e->ops->elevator_bio_merged_fn)
530 e->type->ops.elevator_bio_merged_fn(q, rq, bio); 586 e->ops->elevator_bio_merged_fn(q, rq, bio);
531} 587}
532 588
533void elv_requeue_request(struct request_queue *q, struct request *rq) 589void elv_requeue_request(struct request_queue *q, struct request *rq)
@@ -550,18 +606,45 @@ void elv_requeue_request(struct request_queue *q, struct request *rq)
550void elv_drain_elevator(struct request_queue *q) 606void elv_drain_elevator(struct request_queue *q)
551{ 607{
552 static int printed; 608 static int printed;
553 609 while (q->elevator->ops->elevator_dispatch_fn(q, 1))
554 lockdep_assert_held(q->queue_lock);
555
556 while (q->elevator->type->ops.elevator_dispatch_fn(q, 1))
557 ; 610 ;
558 if (q->nr_sorted && printed++ < 10) { 611 if (q->nr_sorted == 0)
612 return;
613 if (printed++ < 10) {
559 printk(KERN_ERR "%s: forced dispatching is broken " 614 printk(KERN_ERR "%s: forced dispatching is broken "
560 "(nr_sorted=%u), please report this\n", 615 "(nr_sorted=%u), please report this\n",
561 q->elevator->type->elevator_name, q->nr_sorted); 616 q->elevator->elevator_type->elevator_name, q->nr_sorted);
617 }
618}
619
620/*
621 * Call with queue lock held, interrupts disabled
622 */
623void elv_quiesce_start(struct request_queue *q)
624{
625 if (!q->elevator)
626 return;
627
628 queue_flag_set(QUEUE_FLAG_ELVSWITCH, q);
629
630 /*
631 * make sure we don't have any requests in flight
632 */
633 elv_drain_elevator(q);
634 while (q->rq.elvpriv) {
635 __blk_run_queue(q);
636 spin_unlock_irq(q->queue_lock);
637 msleep(10);
638 spin_lock_irq(q->queue_lock);
639 elv_drain_elevator(q);
562 } 640 }
563} 641}
564 642
643void elv_quiesce_end(struct request_queue *q)
644{
645 queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q);
646}
647
565void __elv_add_request(struct request_queue *q, struct request *rq, int where) 648void __elv_add_request(struct request_queue *q, struct request *rq, int where)
566{ 649{
567 trace_block_rq_insert(q, rq); 650 trace_block_rq_insert(q, rq);
@@ -570,7 +653,8 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where)
570 653
571 if (rq->cmd_flags & REQ_SOFTBARRIER) { 654 if (rq->cmd_flags & REQ_SOFTBARRIER) {
572 /* barriers are scheduling boundary, update end_sector */ 655 /* barriers are scheduling boundary, update end_sector */
573 if (rq->cmd_type == REQ_TYPE_FS) { 656 if (rq->cmd_type == REQ_TYPE_FS ||
657 (rq->cmd_flags & REQ_DISCARD)) {
574 q->end_sector = rq_end_sector(rq); 658 q->end_sector = rq_end_sector(rq);
575 q->boundary_rq = rq; 659 q->boundary_rq = rq;
576 } 660 }
@@ -612,7 +696,8 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where)
612 if (elv_attempt_insert_merge(q, rq)) 696 if (elv_attempt_insert_merge(q, rq))
613 break; 697 break;
614 case ELEVATOR_INSERT_SORT: 698 case ELEVATOR_INSERT_SORT:
615 BUG_ON(rq->cmd_type != REQ_TYPE_FS); 699 BUG_ON(rq->cmd_type != REQ_TYPE_FS &&
700 !(rq->cmd_flags & REQ_DISCARD));
616 rq->cmd_flags |= REQ_SORTED; 701 rq->cmd_flags |= REQ_SORTED;
617 q->nr_sorted++; 702 q->nr_sorted++;
618 if (rq_mergeable(rq)) { 703 if (rq_mergeable(rq)) {
@@ -626,7 +711,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where)
626 * rq cannot be accessed after calling 711 * rq cannot be accessed after calling
627 * elevator_add_req_fn. 712 * elevator_add_req_fn.
628 */ 713 */
629 q->elevator->type->ops.elevator_add_req_fn(q, rq); 714 q->elevator->ops->elevator_add_req_fn(q, rq);
630 break; 715 break;
631 716
632 case ELEVATOR_INSERT_FLUSH: 717 case ELEVATOR_INSERT_FLUSH:
@@ -655,8 +740,8 @@ struct request *elv_latter_request(struct request_queue *q, struct request *rq)
655{ 740{
656 struct elevator_queue *e = q->elevator; 741 struct elevator_queue *e = q->elevator;
657 742
658 if (e->type->ops.elevator_latter_req_fn) 743 if (e->ops->elevator_latter_req_fn)
659 return e->type->ops.elevator_latter_req_fn(q, rq); 744 return e->ops->elevator_latter_req_fn(q, rq);
660 return NULL; 745 return NULL;
661} 746}
662 747
@@ -664,18 +749,19 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq)
664{ 749{
665 struct elevator_queue *e = q->elevator; 750 struct elevator_queue *e = q->elevator;
666 751
667 if (e->type->ops.elevator_former_req_fn) 752 if (e->ops->elevator_former_req_fn)
668 return e->type->ops.elevator_former_req_fn(q, rq); 753 return e->ops->elevator_former_req_fn(q, rq);
669 return NULL; 754 return NULL;
670} 755}
671 756
672int elv_set_request(struct request_queue *q, struct request *rq, 757int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
673 struct bio *bio, gfp_t gfp_mask)
674{ 758{
675 struct elevator_queue *e = q->elevator; 759 struct elevator_queue *e = q->elevator;
676 760
677 if (e->type->ops.elevator_set_req_fn) 761 if (e->ops->elevator_set_req_fn)
678 return e->type->ops.elevator_set_req_fn(q, rq, bio, gfp_mask); 762 return e->ops->elevator_set_req_fn(q, rq, gfp_mask);
763
764 rq->elevator_private[0] = NULL;
679 return 0; 765 return 0;
680} 766}
681 767
@@ -683,16 +769,16 @@ void elv_put_request(struct request_queue *q, struct request *rq)
683{ 769{
684 struct elevator_queue *e = q->elevator; 770 struct elevator_queue *e = q->elevator;
685 771
686 if (e->type->ops.elevator_put_req_fn) 772 if (e->ops->elevator_put_req_fn)
687 e->type->ops.elevator_put_req_fn(rq); 773 e->ops->elevator_put_req_fn(rq);
688} 774}
689 775
690int elv_may_queue(struct request_queue *q, int rw) 776int elv_may_queue(struct request_queue *q, int rw)
691{ 777{
692 struct elevator_queue *e = q->elevator; 778 struct elevator_queue *e = q->elevator;
693 779
694 if (e->type->ops.elevator_may_queue_fn) 780 if (e->ops->elevator_may_queue_fn)
695 return e->type->ops.elevator_may_queue_fn(q, rw); 781 return e->ops->elevator_may_queue_fn(q, rw);
696 782
697 return ELV_MQUEUE_MAY; 783 return ELV_MQUEUE_MAY;
698} 784}
@@ -727,8 +813,8 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
727 if (blk_account_rq(rq)) { 813 if (blk_account_rq(rq)) {
728 q->in_flight[rq_is_sync(rq)]--; 814 q->in_flight[rq_is_sync(rq)]--;
729 if ((rq->cmd_flags & REQ_SORTED) && 815 if ((rq->cmd_flags & REQ_SORTED) &&
730 e->type->ops.elevator_completed_req_fn) 816 e->ops->elevator_completed_req_fn)
731 e->type->ops.elevator_completed_req_fn(q, rq); 817 e->ops->elevator_completed_req_fn(q, rq);
732 } 818 }
733} 819}
734 820
@@ -746,7 +832,7 @@ elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
746 832
747 e = container_of(kobj, struct elevator_queue, kobj); 833 e = container_of(kobj, struct elevator_queue, kobj);
748 mutex_lock(&e->sysfs_lock); 834 mutex_lock(&e->sysfs_lock);
749 error = e->type ? entry->show(e, page) : -ENOENT; 835 error = e->ops ? entry->show(e, page) : -ENOENT;
750 mutex_unlock(&e->sysfs_lock); 836 mutex_unlock(&e->sysfs_lock);
751 return error; 837 return error;
752} 838}
@@ -764,7 +850,7 @@ elv_attr_store(struct kobject *kobj, struct attribute *attr,
764 850
765 e = container_of(kobj, struct elevator_queue, kobj); 851 e = container_of(kobj, struct elevator_queue, kobj);
766 mutex_lock(&e->sysfs_lock); 852 mutex_lock(&e->sysfs_lock);
767 error = e->type ? entry->store(e, page, length) : -ENOENT; 853 error = e->ops ? entry->store(e, page, length) : -ENOENT;
768 mutex_unlock(&e->sysfs_lock); 854 mutex_unlock(&e->sysfs_lock);
769 return error; 855 return error;
770} 856}
@@ -786,7 +872,7 @@ int elv_register_queue(struct request_queue *q)
786 872
787 error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched"); 873 error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
788 if (!error) { 874 if (!error) {
789 struct elv_fs_entry *attr = e->type->elevator_attrs; 875 struct elv_fs_entry *attr = e->elevator_type->elevator_attrs;
790 if (attr) { 876 if (attr) {
791 while (attr->attr.name) { 877 while (attr->attr.name) {
792 if (sysfs_create_file(&e->kobj, &attr->attr)) 878 if (sysfs_create_file(&e->kobj, &attr->attr))
@@ -801,48 +887,29 @@ int elv_register_queue(struct request_queue *q)
801} 887}
802EXPORT_SYMBOL(elv_register_queue); 888EXPORT_SYMBOL(elv_register_queue);
803 889
804void elv_unregister_queue(struct request_queue *q) 890static void __elv_unregister_queue(struct elevator_queue *e)
805{ 891{
806 if (q) { 892 kobject_uevent(&e->kobj, KOBJ_REMOVE);
807 struct elevator_queue *e = q->elevator; 893 kobject_del(&e->kobj);
894 e->registered = 0;
895}
808 896
809 kobject_uevent(&e->kobj, KOBJ_REMOVE); 897void elv_unregister_queue(struct request_queue *q)
810 kobject_del(&e->kobj); 898{
811 e->registered = 0; 899 if (q)
812 } 900 __elv_unregister_queue(q->elevator);
813} 901}
814EXPORT_SYMBOL(elv_unregister_queue); 902EXPORT_SYMBOL(elv_unregister_queue);
815 903
816int elv_register(struct elevator_type *e) 904void elv_register(struct elevator_type *e)
817{ 905{
818 char *def = ""; 906 char *def = "";
819 907
820 /* create icq_cache if requested */
821 if (e->icq_size) {
822 if (WARN_ON(e->icq_size < sizeof(struct io_cq)) ||
823 WARN_ON(e->icq_align < __alignof__(struct io_cq)))
824 return -EINVAL;
825
826 snprintf(e->icq_cache_name, sizeof(e->icq_cache_name),
827 "%s_io_cq", e->elevator_name);
828 e->icq_cache = kmem_cache_create(e->icq_cache_name, e->icq_size,
829 e->icq_align, 0, NULL);
830 if (!e->icq_cache)
831 return -ENOMEM;
832 }
833
834 /* register, don't allow duplicate names */
835 spin_lock(&elv_list_lock); 908 spin_lock(&elv_list_lock);
836 if (elevator_find(e->elevator_name)) { 909 BUG_ON(elevator_find(e->elevator_name));
837 spin_unlock(&elv_list_lock);
838 if (e->icq_cache)
839 kmem_cache_destroy(e->icq_cache);
840 return -EBUSY;
841 }
842 list_add_tail(&e->list, &elv_list); 910 list_add_tail(&e->list, &elv_list);
843 spin_unlock(&elv_list_lock); 911 spin_unlock(&elv_list_lock);
844 912
845 /* print pretty message */
846 if (!strcmp(e->elevator_name, chosen_elevator) || 913 if (!strcmp(e->elevator_name, chosen_elevator) ||
847 (!*chosen_elevator && 914 (!*chosen_elevator &&
848 !strcmp(e->elevator_name, CONFIG_DEFAULT_IOSCHED))) 915 !strcmp(e->elevator_name, CONFIG_DEFAULT_IOSCHED)))
@@ -850,26 +917,30 @@ int elv_register(struct elevator_type *e)
850 917
851 printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name, 918 printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name,
852 def); 919 def);
853 return 0;
854} 920}
855EXPORT_SYMBOL_GPL(elv_register); 921EXPORT_SYMBOL_GPL(elv_register);
856 922
857void elv_unregister(struct elevator_type *e) 923void elv_unregister(struct elevator_type *e)
858{ 924{
859 /* unregister */ 925 struct task_struct *g, *p;
860 spin_lock(&elv_list_lock);
861 list_del_init(&e->list);
862 spin_unlock(&elv_list_lock);
863 926
864 /* 927 /*
865 * Destroy icq_cache if it exists. icq's are RCU managed. Make 928 * Iterate every thread in the process to remove the io contexts.
866 * sure all RCU operations are complete before proceeding.
867 */ 929 */
868 if (e->icq_cache) { 930 if (e->ops.trim) {
869 rcu_barrier(); 931 read_lock(&tasklist_lock);
870 kmem_cache_destroy(e->icq_cache); 932 do_each_thread(g, p) {
871 e->icq_cache = NULL; 933 task_lock(p);
934 if (p->io_context)
935 e->ops.trim(p->io_context);
936 task_unlock(p);
937 } while_each_thread(g, p);
938 read_unlock(&tasklist_lock);
872 } 939 }
940
941 spin_lock(&elv_list_lock);
942 list_del_init(&e->list);
943 spin_unlock(&elv_list_lock);
873} 944}
874EXPORT_SYMBOL_GPL(elv_unregister); 945EXPORT_SYMBOL_GPL(elv_unregister);
875 946
@@ -881,60 +952,73 @@ EXPORT_SYMBOL_GPL(elv_unregister);
881 */ 952 */
882static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) 953static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
883{ 954{
884 struct elevator_queue *old = q->elevator; 955 struct elevator_queue *old_elevator, *e;
885 bool registered = old->registered; 956 void *data;
886 int err; 957 int err;
887 958
888 /* 959 /*
889 * Turn on BYPASS and drain all requests w/ elevator private data. 960 * Allocate new elevator
890 * Block layer doesn't call into a quiesced elevator - all requests
891 * are directly put on the dispatch list without elevator data
892 * using INSERT_BACK. All requests have SOFTBARRIER set and no
893 * merge happens either.
894 */ 961 */
895 blk_queue_bypass_start(q); 962 e = elevator_alloc(q, new_e);
963 if (!e)
964 return -ENOMEM;
896 965
897 /* unregister and clear all auxiliary data of the old elevator */ 966 data = elevator_init_queue(q, e);
898 if (registered) 967 if (!data) {
899 elv_unregister_queue(q); 968 kobject_put(&e->kobj);
969 return -ENOMEM;
970 }
900 971
972 /*
973 * Turn on BYPASS and drain all requests w/ elevator private data
974 */
901 spin_lock_irq(q->queue_lock); 975 spin_lock_irq(q->queue_lock);
902 ioc_clear_queue(q); 976 elv_quiesce_start(q);
903 spin_unlock_irq(q->queue_lock);
904 977
905 /* allocate, init and register new elevator */ 978 /*
906 err = -ENOMEM; 979 * Remember old elevator.
907 q->elevator = elevator_alloc(q, new_e); 980 */
908 if (!q->elevator) 981 old_elevator = q->elevator;
909 goto fail_init;
910 982
911 err = new_e->ops.elevator_init_fn(q); 983 /*
912 if (err) { 984 * attach and start new elevator
913 kobject_put(&q->elevator->kobj); 985 */
914 goto fail_init; 986 elevator_attach(q, e, data);
915 } 987
988 spin_unlock_irq(q->queue_lock);
989
990 if (old_elevator->registered) {
991 __elv_unregister_queue(old_elevator);
916 992
917 if (registered) {
918 err = elv_register_queue(q); 993 err = elv_register_queue(q);
919 if (err) 994 if (err)
920 goto fail_register; 995 goto fail_register;
921 } 996 }
922 997
923 /* done, kill the old one and finish */ 998 /*
924 elevator_exit(old); 999 * finally exit old elevator and turn off BYPASS.
925 blk_queue_bypass_end(q); 1000 */
1001 elevator_exit(old_elevator);
1002 spin_lock_irq(q->queue_lock);
1003 elv_quiesce_end(q);
1004 spin_unlock_irq(q->queue_lock);
926 1005
927 blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); 1006 blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name);
928 1007
929 return 0; 1008 return 0;
930 1009
931fail_register: 1010fail_register:
932 elevator_exit(q->elevator); 1011 /*
933fail_init: 1012 * switch failed, exit the new io scheduler and reattach the old
934 /* switch failed, restore and re-register old elevator */ 1013 * one again (along with re-adding the sysfs dir)
935 q->elevator = old; 1014 */
1015 elevator_exit(e);
1016 q->elevator = old_elevator;
936 elv_register_queue(q); 1017 elv_register_queue(q);
937 blk_queue_bypass_end(q); 1018
1019 spin_lock_irq(q->queue_lock);
1020 queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q);
1021 spin_unlock_irq(q->queue_lock);
938 1022
939 return err; 1023 return err;
940} 1024}
@@ -957,7 +1041,7 @@ int elevator_change(struct request_queue *q, const char *name)
957 return -EINVAL; 1041 return -EINVAL;
958 } 1042 }
959 1043
960 if (!strcmp(elevator_name, q->elevator->type->elevator_name)) { 1044 if (!strcmp(elevator_name, q->elevator->elevator_type->elevator_name)) {
961 elevator_put(e); 1045 elevator_put(e);
962 return 0; 1046 return 0;
963 } 1047 }
@@ -992,7 +1076,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name)
992 if (!q->elevator || !blk_queue_stackable(q)) 1076 if (!q->elevator || !blk_queue_stackable(q))
993 return sprintf(name, "none\n"); 1077 return sprintf(name, "none\n");
994 1078
995 elv = e->type; 1079 elv = e->elevator_type;
996 1080
997 spin_lock(&elv_list_lock); 1081 spin_lock(&elv_list_lock);
998 list_for_each_entry(__e, &elv_list, list) { 1082 list_for_each_entry(__e, &elv_list, list) {
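The elv_iosched_show()/elevator_change() pair above is what backs the per-queue scheduler attribute in sysfs, so the registered elevators and the active one are visible from user space. A minimal C sketch of the reader side, assuming a disk named sda and sysfs mounted at /sys (both assumptions; pass another device name as argv[1]):

/* Print the I/O schedulers a queue offers; the active one is shown in brackets. */
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
        const char *dev = argc > 1 ? argv[1] : "sda";   /* assumed default */
        char path[128], line[256];
        FILE *f;

        snprintf(path, sizeof(path), "/sys/block/%s/queue/scheduler", dev);
        f = fopen(path, "r");
        if (!f) {
                perror(path);
                return EXIT_FAILURE;
        }
        if (fgets(line, sizeof(line), f))
                printf("%s: %s", dev, line);    /* e.g. "noop deadline [cfq]" */
        fclose(f);
        return EXIT_SUCCESS;
}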
diff --git a/block/genhd.c b/block/genhd.c
index 9a289d7c84b..d3834710b95 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -15,6 +15,7 @@
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/kmod.h> 16#include <linux/kmod.h>
17#include <linux/kobj_map.h> 17#include <linux/kobj_map.h>
18#include <linux/buffer_head.h>
18#include <linux/mutex.h> 19#include <linux/mutex.h>
19#include <linux/idr.h> 20#include <linux/idr.h>
20#include <linux/log2.h> 21#include <linux/log2.h>
@@ -35,7 +36,6 @@ static DEFINE_IDR(ext_devt_idr);
35 36
36static struct device_type disk_type; 37static struct device_type disk_type;
37 38
38static void disk_alloc_events(struct gendisk *disk);
39static void disk_add_events(struct gendisk *disk); 39static void disk_add_events(struct gendisk *disk);
40static void disk_del_events(struct gendisk *disk); 40static void disk_del_events(struct gendisk *disk);
41static void disk_release_events(struct gendisk *disk); 41static void disk_release_events(struct gendisk *disk);
@@ -154,7 +154,7 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter)
154 part = rcu_dereference(ptbl->part[piter->idx]); 154 part = rcu_dereference(ptbl->part[piter->idx]);
155 if (!part) 155 if (!part)
156 continue; 156 continue;
157 if (!part_nr_sects_read(part) && 157 if (!part->nr_sects &&
158 !(piter->flags & DISK_PITER_INCL_EMPTY) && 158 !(piter->flags & DISK_PITER_INCL_EMPTY) &&
159 !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 && 159 !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 &&
160 piter->idx == 0)) 160 piter->idx == 0))
@@ -191,7 +191,7 @@ EXPORT_SYMBOL_GPL(disk_part_iter_exit);
191static inline int sector_in_part(struct hd_struct *part, sector_t sector) 191static inline int sector_in_part(struct hd_struct *part, sector_t sector)
192{ 192{
193 return part->start_sect <= sector && 193 return part->start_sect <= sector &&
194 sector < part->start_sect + part_nr_sects_read(part); 194 sector < part->start_sect + part->nr_sects;
195} 195}
196 196
197/** 197/**
@@ -507,7 +507,7 @@ static int exact_lock(dev_t devt, void *data)
507 return 0; 507 return 0;
508} 508}
509 509
510static void register_disk(struct gendisk *disk) 510void register_disk(struct gendisk *disk)
511{ 511{
512 struct device *ddev = disk_to_dev(disk); 512 struct device *ddev = disk_to_dev(disk);
513 struct block_device *bdev; 513 struct block_device *bdev;
@@ -536,7 +536,7 @@ static void register_disk(struct gendisk *disk)
536 disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); 536 disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
537 537
538 /* No minors to use for partitions */ 538 /* No minors to use for partitions */
539 if (!disk_part_scan_enabled(disk)) 539 if (!disk_partitionable(disk))
540 goto exit; 540 goto exit;
541 541
542 /* No such device (e.g., media were just removed) */ 542 /* No such device (e.g., media were just removed) */
@@ -602,8 +602,6 @@ void add_disk(struct gendisk *disk)
602 disk->major = MAJOR(devt); 602 disk->major = MAJOR(devt);
603 disk->first_minor = MINOR(devt); 603 disk->first_minor = MINOR(devt);
604 604
605 disk_alloc_events(disk);
606
607 /* Register BDI before referencing it from bdev */ 605 /* Register BDI before referencing it from bdev */
608 bdi = &disk->queue->backing_dev_info; 606 bdi = &disk->queue->backing_dev_info;
609 bdi_register_dev(bdi, disk_devt(disk)); 607 bdi_register_dev(bdi, disk_devt(disk));
@@ -617,7 +615,7 @@ void add_disk(struct gendisk *disk)
617 * Take an extra ref on queue which will be put on disk_release() 615 * Take an extra ref on queue which will be put on disk_release()
618 * so that it sticks around as long as @disk is there. 616 * so that it sticks around as long as @disk is there.
619 */ 617 */
620 WARN_ON_ONCE(!blk_get_queue(disk->queue)); 618 WARN_ON_ONCE(blk_get_queue(disk->queue));
621 619
622 retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, 620 retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
623 "bdi"); 621 "bdi");
@@ -743,6 +741,7 @@ void __init printk_all_partitions(void)
743 struct hd_struct *part; 741 struct hd_struct *part;
744 char name_buf[BDEVNAME_SIZE]; 742 char name_buf[BDEVNAME_SIZE];
745 char devt_buf[BDEVT_SIZE]; 743 char devt_buf[BDEVT_SIZE];
744 u8 uuid[PARTITION_META_INFO_UUIDLTH * 2 + 1];
746 745
747 /* 746 /*
748 * Don't show empty devices or things that have been 747 * Don't show empty devices or things that have been
@@ -761,11 +760,14 @@ void __init printk_all_partitions(void)
761 while ((part = disk_part_iter_next(&piter))) { 760 while ((part = disk_part_iter_next(&piter))) {
762 bool is_part0 = part == &disk->part0; 761 bool is_part0 = part == &disk->part0;
763 762
763 uuid[0] = 0;
764 if (part->info)
765 part_unpack_uuid(part->info->uuid, uuid);
766
764 printk("%s%s %10llu %s %s", is_part0 ? "" : " ", 767 printk("%s%s %10llu %s %s", is_part0 ? "" : " ",
765 bdevt_str(part_devt(part), devt_buf), 768 bdevt_str(part_devt(part), devt_buf),
766 (unsigned long long)part_nr_sects_read(part) >> 1 769 (unsigned long long)part->nr_sects >> 1,
767 , disk_name(disk, part->partno, name_buf), 770 disk_name(disk, part->partno, name_buf), uuid);
768 part->info ? part->info->uuid : "");
769 if (is_part0) { 771 if (is_part0) {
770 if (disk->driverfs_dev != NULL && 772 if (disk->driverfs_dev != NULL &&
771 disk->driverfs_dev->driver != NULL) 773 disk->driverfs_dev->driver != NULL)
@@ -829,7 +831,7 @@ static void disk_seqf_stop(struct seq_file *seqf, void *v)
829 831
830static void *show_partition_start(struct seq_file *seqf, loff_t *pos) 832static void *show_partition_start(struct seq_file *seqf, loff_t *pos)
831{ 833{
832 void *p; 834 static void *p;
833 835
834 p = disk_seqf_start(seqf, pos); 836 p = disk_seqf_start(seqf, pos);
835 if (!IS_ERR_OR_NULL(p) && !*pos) 837 if (!IS_ERR_OR_NULL(p) && !*pos)
@@ -845,7 +847,7 @@ static int show_partition(struct seq_file *seqf, void *v)
845 char buf[BDEVNAME_SIZE]; 847 char buf[BDEVNAME_SIZE];
846 848
847 /* Don't show non-partitionable removable devices or empty devices */ 849
848 if (!get_capacity(sgp) || (!disk_max_parts(sgp) && 850 if (!get_capacity(sgp) || (!disk_partitionable(sgp) &&
849 (sgp->flags & GENHD_FL_REMOVABLE))) 851 (sgp->flags & GENHD_FL_REMOVABLE)))
850 return 0; 852 return 0;
851 if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) 853 if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
@@ -856,7 +858,7 @@ static int show_partition(struct seq_file *seqf, void *v)
856 while ((part = disk_part_iter_next(&piter))) 858 while ((part = disk_part_iter_next(&piter)))
857 seq_printf(seqf, "%4d %7d %10llu %s\n", 859 seq_printf(seqf, "%4d %7d %10llu %s\n",
858 MAJOR(part_devt(part)), MINOR(part_devt(part)), 860 MAJOR(part_devt(part)), MINOR(part_devt(part)),
859 (unsigned long long)part_nr_sects_read(part) >> 1, 861 (unsigned long long)part->nr_sects >> 1,
860 disk_name(sgp, part->partno, buf)); 862 disk_name(sgp, part->partno, buf));
861 disk_part_iter_exit(&piter); 863 disk_part_iter_exit(&piter);
862 864
@@ -1103,11 +1105,27 @@ static void disk_release(struct device *dev)
1103 blk_put_queue(disk->queue); 1105 blk_put_queue(disk->queue);
1104 kfree(disk); 1106 kfree(disk);
1105} 1107}
1108
1109static int disk_uevent(struct device *dev, struct kobj_uevent_env *env)
1110{
1111 struct gendisk *disk = dev_to_disk(dev);
1112 struct disk_part_iter piter;
1113 struct hd_struct *part;
1114 int cnt = 0;
1115
1116 disk_part_iter_init(&piter, disk, 0);
1117 while((part = disk_part_iter_next(&piter)))
1118 cnt++;
1119 disk_part_iter_exit(&piter);
1120 add_uevent_var(env, "NPARTS=%u", cnt);
1121 return 0;
1122}
1123
1106struct class block_class = { 1124struct class block_class = {
1107 .name = "block", 1125 .name = "block",
1108}; 1126};
1109 1127
1110static char *block_devnode(struct device *dev, umode_t *mode) 1128static char *block_devnode(struct device *dev, mode_t *mode)
1111{ 1129{
1112 struct gendisk *disk = dev_to_disk(dev); 1130 struct gendisk *disk = dev_to_disk(dev);
1113 1131
@@ -1121,6 +1139,7 @@ static struct device_type disk_type = {
1121 .groups = disk_attr_groups, 1139 .groups = disk_attr_groups,
1122 .release = disk_release, 1140 .release = disk_release,
1123 .devnode = block_devnode, 1141 .devnode = block_devnode,
1142 .uevent = disk_uevent,
1124}; 1143};
1125 1144
1126#ifdef CONFIG_PROC_FS 1145#ifdef CONFIG_PROC_FS
@@ -1239,7 +1258,7 @@ EXPORT_SYMBOL(blk_lookup_devt);
1239 1258
1240struct gendisk *alloc_disk(int minors) 1259struct gendisk *alloc_disk(int minors)
1241{ 1260{
1242 return alloc_disk_node(minors, NUMA_NO_NODE); 1261 return alloc_disk_node(minors, -1);
1243} 1262}
1244EXPORT_SYMBOL(alloc_disk); 1263EXPORT_SYMBOL(alloc_disk);
1245 1264
@@ -1262,16 +1281,6 @@ struct gendisk *alloc_disk_node(int minors, int node_id)
1262 } 1281 }
1263 disk->part_tbl->part[0] = &disk->part0; 1282 disk->part_tbl->part[0] = &disk->part0;
1264 1283
1265 /*
1266 * set_capacity() and get_capacity() currently don't use
1267 * seqcounter to read/update the part0->nr_sects. Still init
1268 * the counter as we can read the sectors in IO submission
1269 * path using sequence counters.
1270 *
1271 * TODO: Ideally set_capacity() and get_capacity() should be
1272 * converted to make use of bd_mutex and sequence counters.
1273 */
1274 seqcount_init(&disk->part0.nr_sects_seq);
1275 hd_ref_init(&disk->part0); 1284 hd_ref_init(&disk->part0);
1276 1285
1277 disk->minors = minors; 1286 disk->minors = minors;
@@ -1484,9 +1493,9 @@ static void __disk_unblock_events(struct gendisk *disk, bool check_now)
1484 intv = disk_events_poll_jiffies(disk); 1493 intv = disk_events_poll_jiffies(disk);
1485 set_timer_slack(&ev->dwork.timer, intv / 4); 1494 set_timer_slack(&ev->dwork.timer, intv / 4);
1486 if (check_now) 1495 if (check_now)
1487 queue_delayed_work(system_freezable_wq, &ev->dwork, 0); 1496 queue_delayed_work(system_nrt_wq, &ev->dwork, 0);
1488 else if (intv) 1497 else if (intv)
1489 queue_delayed_work(system_freezable_wq, &ev->dwork, intv); 1498 queue_delayed_work(system_nrt_wq, &ev->dwork, intv);
1490out_unlock: 1499out_unlock:
1491 spin_unlock_irqrestore(&ev->lock, flags); 1500 spin_unlock_irqrestore(&ev->lock, flags);
1492} 1501}
@@ -1528,8 +1537,10 @@ void disk_flush_events(struct gendisk *disk, unsigned int mask)
1528 1537
1529 spin_lock_irq(&ev->lock); 1538 spin_lock_irq(&ev->lock);
1530 ev->clearing |= mask; 1539 ev->clearing |= mask;
1531 if (!ev->block) 1540 if (!ev->block) {
1532 mod_delayed_work(system_freezable_wq, &ev->dwork, 0); 1541 cancel_delayed_work(&ev->dwork);
1542 queue_delayed_work(system_nrt_wq, &ev->dwork, 0);
1543 }
1533 spin_unlock_irq(&ev->lock); 1544 spin_unlock_irq(&ev->lock);
1534} 1545}
1535 1546
@@ -1565,7 +1576,7 @@ unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
1565 1576
1566 /* unconditionally schedule event check and wait for it to finish */ 1577
1567 disk_block_events(disk); 1578 disk_block_events(disk);
1568 queue_delayed_work(system_freezable_wq, &ev->dwork, 0); 1579 queue_delayed_work(system_nrt_wq, &ev->dwork, 0);
1569 flush_delayed_work(&ev->dwork); 1580 flush_delayed_work(&ev->dwork);
1570 __disk_unblock_events(disk, false); 1581 __disk_unblock_events(disk, false);
1571 1582
@@ -1602,7 +1613,7 @@ static void disk_events_workfn(struct work_struct *work)
1602 1613
1603 intv = disk_events_poll_jiffies(disk); 1614 intv = disk_events_poll_jiffies(disk);
1604 if (!ev->block && intv) 1615 if (!ev->block && intv)
1605 queue_delayed_work(system_freezable_wq, &ev->dwork, intv); 1616 queue_delayed_work(system_nrt_wq, &ev->dwork, intv);
1606 1617
1607 spin_unlock_irq(&ev->lock); 1618 spin_unlock_irq(&ev->lock);
1608 1619
@@ -1740,9 +1751,9 @@ module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops,
1740 &disk_events_dfl_poll_msecs, 0644); 1751 &disk_events_dfl_poll_msecs, 0644);
1741 1752
1742/* 1753/*
1743 * disk_{alloc|add|del|release}_events - initialize and destroy disk_events. 1754 * disk_{add|del|release}_events - initialize and destroy disk_events.
1744 */ 1755 */
1745static void disk_alloc_events(struct gendisk *disk) 1756static void disk_add_events(struct gendisk *disk)
1746{ 1757{
1747 struct disk_events *ev; 1758 struct disk_events *ev;
1748 1759
@@ -1755,6 +1766,16 @@ static void disk_alloc_events(struct gendisk *disk)
1755 return; 1766 return;
1756 } 1767 }
1757 1768
1769 if (sysfs_create_files(&disk_to_dev(disk)->kobj,
1770 disk_events_attrs) < 0) {
1771 pr_warn("%s: failed to create sysfs files for events\n",
1772 disk->disk_name);
1773 kfree(ev);
1774 return;
1775 }
1776
1777 disk->ev = ev;
1778
1758 INIT_LIST_HEAD(&ev->node); 1779 INIT_LIST_HEAD(&ev->node);
1759 ev->disk = disk; 1780 ev->disk = disk;
1760 spin_lock_init(&ev->lock); 1781 spin_lock_init(&ev->lock);
@@ -1763,21 +1784,8 @@ static void disk_alloc_events(struct gendisk *disk)
1763 ev->poll_msecs = -1; 1784 ev->poll_msecs = -1;
1764 INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn); 1785 INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn);
1765 1786
1766 disk->ev = ev;
1767}
1768
1769static void disk_add_events(struct gendisk *disk)
1770{
1771 if (!disk->ev)
1772 return;
1773
1774 /* FIXME: error handling */
1775 if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_events_attrs) < 0)
1776 pr_warn("%s: failed to create sysfs files for events\n",
1777 disk->disk_name);
1778
1779 mutex_lock(&disk_events_mutex); 1787 mutex_lock(&disk_events_mutex);
1780 list_add_tail(&disk->ev->node, &disk_events); 1788 list_add_tail(&ev->node, &disk_events);
1781 mutex_unlock(&disk_events_mutex); 1789 mutex_unlock(&disk_events_mutex);
1782 1790
1783 /* 1791 /*
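In genhd.c, show_partition() above is the generator behind /proc/partitions, and the new disk_uevent() callback adds an NPARTS= count to the disk's uevent. A small user-space sketch of the consumer side of show_partition(), parsing the four columns it emits (major, minor, 1 KiB blocks, name); the header row simply fails the sscanf and is skipped:

/* Parse /proc/partitions: "major minor #blocks name", as emitted by show_partition(). */
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/partitions", "r");
        char line[256], name[64];
        unsigned int major, minor;
        unsigned long long blocks;

        if (!f) {
                perror("/proc/partitions");
                return 1;
        }
        while (fgets(line, sizeof(line), f)) {
                /* only data rows match all four fields; header and blank lines don't */
                if (sscanf(line, "%u %u %llu %63s",
                           &major, &minor, &blocks, name) == 4)
                        printf("%s: dev %u:%u, %llu KiB\n",
                               name, major, minor, blocks);
        }
        fclose(f);
        return 0;
}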
diff --git a/block/ioctl.c b/block/ioctl.c
index a31d91d9bc5..1124cd29726 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -1,11 +1,10 @@
1#include <linux/capability.h> 1#include <linux/capability.h>
2#include <linux/blkdev.h> 2#include <linux/blkdev.h>
3#include <linux/export.h>
4#include <linux/gfp.h> 3#include <linux/gfp.h>
5#include <linux/blkpg.h> 4#include <linux/blkpg.h>
6#include <linux/hdreg.h> 5#include <linux/hdreg.h>
7#include <linux/backing-dev.h> 6#include <linux/backing-dev.h>
8#include <linux/fs.h> 7#include <linux/buffer_head.h>
9#include <linux/blktrace_api.h> 8#include <linux/blktrace_api.h>
10#include <asm/uaccess.h> 9#include <asm/uaccess.h>
11 10
@@ -13,7 +12,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
13{ 12{
14 struct block_device *bdevp; 13 struct block_device *bdevp;
15 struct gendisk *disk; 14 struct gendisk *disk;
16 struct hd_struct *part, *lpart; 15 struct hd_struct *part;
17 struct blkpg_ioctl_arg a; 16 struct blkpg_ioctl_arg a;
18 struct blkpg_partition p; 17 struct blkpg_partition p;
19 struct disk_part_iter piter; 18 struct disk_part_iter piter;
@@ -36,12 +35,12 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
36 case BLKPG_ADD_PARTITION: 35 case BLKPG_ADD_PARTITION:
37 start = p.start >> 9; 36 start = p.start >> 9;
38 length = p.length >> 9; 37 length = p.length >> 9;
39 /* check for fit in a hd_struct */ 38 /* check for fit in a hd_struct */
40 if (sizeof(sector_t) == sizeof(long) && 39 if (sizeof(sector_t) == sizeof(long) &&
41 sizeof(long long) > sizeof(long)) { 40 sizeof(long long) > sizeof(long)) {
42 long pstart = start, plength = length; 41 long pstart = start, plength = length;
43 if (pstart != start || plength != length 42 if (pstart != start || plength != length
44 || pstart < 0 || plength < 0 || partno > 65535) 43 || pstart < 0 || plength < 0)
45 return -EINVAL; 44 return -EINVAL;
46 } 45 }
47 46
@@ -92,59 +91,6 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
92 bdput(bdevp); 91 bdput(bdevp);
93 92
94 return 0; 93 return 0;
95 case BLKPG_RESIZE_PARTITION:
96 start = p.start >> 9;
97 /* new length of partition in bytes */
98 length = p.length >> 9;
99 /* check for fit in a hd_struct */
100 if (sizeof(sector_t) == sizeof(long) &&
101 sizeof(long long) > sizeof(long)) {
102 long pstart = start, plength = length;
103 if (pstart != start || plength != length
104 || pstart < 0 || plength < 0)
105 return -EINVAL;
106 }
107 part = disk_get_part(disk, partno);
108 if (!part)
109 return -ENXIO;
110 bdevp = bdget(part_devt(part));
111 if (!bdevp) {
112 disk_put_part(part);
113 return -ENOMEM;
114 }
115 mutex_lock(&bdevp->bd_mutex);
116 mutex_lock_nested(&bdev->bd_mutex, 1);
117 if (start != part->start_sect) {
118 mutex_unlock(&bdevp->bd_mutex);
119 mutex_unlock(&bdev->bd_mutex);
120 bdput(bdevp);
121 disk_put_part(part);
122 return -EINVAL;
123 }
124 /* overlap? */
125 disk_part_iter_init(&piter, disk,
126 DISK_PITER_INCL_EMPTY);
127 while ((lpart = disk_part_iter_next(&piter))) {
128 if (lpart->partno != partno &&
129 !(start + length <= lpart->start_sect ||
130 start >= lpart->start_sect + lpart->nr_sects)
131 ) {
132 disk_part_iter_exit(&piter);
133 mutex_unlock(&bdevp->bd_mutex);
134 mutex_unlock(&bdev->bd_mutex);
135 bdput(bdevp);
136 disk_put_part(part);
137 return -EBUSY;
138 }
139 }
140 disk_part_iter_exit(&piter);
141 part_nr_sects_write(part, (sector_t)length);
142 i_size_write(bdevp->bd_inode, p.length);
143 mutex_unlock(&bdevp->bd_mutex);
144 mutex_unlock(&bdev->bd_mutex);
145 bdput(bdevp);
146 disk_put_part(part);
147 return 0;
148 default: 94 default:
149 return -EINVAL; 95 return -EINVAL;
150 } 96 }
@@ -155,7 +101,7 @@ static int blkdev_reread_part(struct block_device *bdev)
155 struct gendisk *disk = bdev->bd_disk; 101 struct gendisk *disk = bdev->bd_disk;
156 int res; 102 int res;
157 103
158 if (!disk_part_scan_enabled(disk) || bdev != bdev->bd_contains) 104 if (!disk_partitionable(disk) || bdev != bdev->bd_contains)
159 return -EINVAL; 105 return -EINVAL;
160 if (!capable(CAP_SYS_ADMIN)) 106 if (!capable(CAP_SYS_ADMIN))
161 return -EACCES; 107 return -EACCES;
@@ -185,22 +131,6 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
185 return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags); 131 return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags);
186} 132}
187 133
188static int blk_ioctl_zeroout(struct block_device *bdev, uint64_t start,
189 uint64_t len)
190{
191 if (start & 511)
192 return -EINVAL;
193 if (len & 511)
194 return -EINVAL;
195 start >>= 9;
196 len >>= 9;
197
198 if (start + len > (i_size_read(bdev->bd_inode) >> 9))
199 return -EINVAL;
200
201 return blkdev_issue_zeroout(bdev, start, len, GFP_KERNEL);
202}
203
204static int put_ushort(unsigned long arg, unsigned short val) 134static int put_ushort(unsigned long arg, unsigned short val)
205{ 135{
206 return put_user(val, (unsigned short __user *)arg); 136 return put_user(val, (unsigned short __user *)arg);
@@ -249,26 +179,6 @@ int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode,
249EXPORT_SYMBOL_GPL(__blkdev_driver_ioctl); 179EXPORT_SYMBOL_GPL(__blkdev_driver_ioctl);
250 180
251/* 181/*
252 * Is it an unrecognized ioctl? The correct returns are either
253 * ENOTTY (final) or ENOIOCTLCMD ("I don't know this one, try a
254 * fallback"). ENOIOCTLCMD gets turned into ENOTTY by the ioctl
255 * code before returning.
256 *
257 * Confused drivers sometimes return EINVAL, which is wrong. It
258 * means "I understood the ioctl command, but the parameters to
259 * it were wrong".
260 *
261 * We should aim to just fix the broken drivers, the EINVAL case
262 * should go away.
263 */
264static inline int is_unrecognized_ioctl(int ret)
265{
266 return ret == -EINVAL ||
267 ret == -ENOTTY ||
268 ret == -ENOIOCTLCMD;
269}
270
271/*
272 * always keep this in sync with compat_blkdev_ioctl() 182 * always keep this in sync with compat_blkdev_ioctl()
273 */ 183 */
274int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, 184int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
@@ -285,7 +195,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
285 return -EACCES; 195 return -EACCES;
286 196
287 ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); 197 ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
288 if (!is_unrecognized_ioctl(ret)) 198 /* -EINVAL to handle old uncorrected drivers */
199 if (ret != -EINVAL && ret != -ENOTTY)
289 return ret; 200 return ret;
290 201
291 fsync_bdev(bdev); 202 fsync_bdev(bdev);
@@ -294,7 +205,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
294 205
295 case BLKROSET: 206 case BLKROSET:
296 ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); 207 ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
297 if (!is_unrecognized_ioctl(ret)) 208 /* -EINVAL to handle old uncorrected drivers */
209 if (ret != -EINVAL && ret != -ENOTTY)
298 return ret; 210 return ret;
299 if (!capable(CAP_SYS_ADMIN)) 211 if (!capable(CAP_SYS_ADMIN))
300 return -EACCES; 212 return -EACCES;
@@ -316,17 +228,6 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
316 return blk_ioctl_discard(bdev, range[0], range[1], 228 return blk_ioctl_discard(bdev, range[0], range[1],
317 cmd == BLKSECDISCARD); 229 cmd == BLKSECDISCARD);
318 } 230 }
319 case BLKZEROOUT: {
320 uint64_t range[2];
321
322 if (!(mode & FMODE_WRITE))
323 return -EBADF;
324
325 if (copy_from_user(range, (void __user *)arg, sizeof(range)))
326 return -EFAULT;
327
328 return blk_ioctl_zeroout(bdev, range[0], range[1]);
329 }
330 231
331 case HDIO_GETGEO: { 232 case HDIO_GETGEO: {
332 struct hd_geometry geo; 233 struct hd_geometry geo;
@@ -376,8 +277,6 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
376 return put_uint(arg, bdev_discard_zeroes_data(bdev)); 277 return put_uint(arg, bdev_discard_zeroes_data(bdev));
377 case BLKSECTGET: 278 case BLKSECTGET:
378 return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev))); 279 return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev)));
379 case BLKROTATIONAL:
380 return put_ushort(arg, !blk_queue_nonrot(bdev_get_queue(bdev)));
381 case BLKRASET: 280 case BLKRASET:
382 case BLKFRASET: 281 case BLKFRASET:
383 if(!capable(CAP_SYS_ADMIN)) 282 if(!capable(CAP_SYS_ADMIN))
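blkdev_ioctl() above dispatches the legacy geometry query: HDIO_GETGEO fills a struct hd_geometry through the driver's getgeo method. A hedged user-space sketch of the calling side, assuming /dev/sda exists and is readable:

/* Query legacy CHS geometry via HDIO_GETGEO, as dispatched in blkdev_ioctl(). */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/hdreg.h>

int main(int argc, char **argv)
{
        const char *dev = argc > 1 ? argv[1] : "/dev/sda";      /* assumed default */
        struct hd_geometry geo;
        int fd = open(dev, O_RDONLY);

        if (fd < 0) {
                perror(dev);
                return 1;
        }
        if (ioctl(fd, HDIO_GETGEO, &geo) < 0) {
                perror("HDIO_GETGEO");
                close(fd);
                return 1;
        }
        printf("%s: heads=%u sectors=%u cylinders=%u start=%lu\n",
               dev, geo.heads, geo.sectors, geo.cylinders, geo.start);
        close(fd);
        return 0;
}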
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
index 5d1bf70e33d..06389e9ef96 100644
--- a/block/noop-iosched.c
+++ b/block/noop-iosched.c
@@ -59,17 +59,15 @@ noop_latter_request(struct request_queue *q, struct request *rq)
59 return list_entry(rq->queuelist.next, struct request, queuelist); 59 return list_entry(rq->queuelist.next, struct request, queuelist);
60} 60}
61 61
62static int noop_init_queue(struct request_queue *q) 62static void *noop_init_queue(struct request_queue *q)
63{ 63{
64 struct noop_data *nd; 64 struct noop_data *nd;
65 65
66 nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node); 66 nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node);
67 if (!nd) 67 if (!nd)
68 return -ENOMEM; 68 return NULL;
69
70 INIT_LIST_HEAD(&nd->queue); 69 INIT_LIST_HEAD(&nd->queue);
71 q->elevator->elevator_data = nd; 70 return nd;
72 return 0;
73} 71}
74 72
75static void noop_exit_queue(struct elevator_queue *e) 73static void noop_exit_queue(struct elevator_queue *e)
@@ -96,7 +94,9 @@ static struct elevator_type elevator_noop = {
96 94
97static int __init noop_init(void) 95static int __init noop_init(void)
98{ 96{
99 return elv_register(&elevator_noop); 97 elv_register(&elevator_noop);
98
99 return 0;
100} 100}
101 101
102static void __exit noop_exit(void) 102static void __exit noop_exit(void)
diff --git a/block/partition-generic.c b/block/partition-generic.c
deleted file mode 100644
index f1d14519cc0..00000000000
--- a/block/partition-generic.c
+++ /dev/null
@@ -1,571 +0,0 @@
1/*
2 * Code extracted from drivers/block/genhd.c
3 * Copyright (C) 1991-1998 Linus Torvalds
4 * Re-organised Feb 1998 Russell King
5 *
6 * We now have independent partition support from the
7 * block drivers, which allows all the partition code to
8 * be grouped in one location, and it to be mostly self
9 * contained.
10 */
11
12#include <linux/init.h>
13#include <linux/module.h>
14#include <linux/fs.h>
15#include <linux/slab.h>
16#include <linux/kmod.h>
17#include <linux/ctype.h>
18#include <linux/genhd.h>
19#include <linux/blktrace_api.h>
20
21#include "partitions/check.h"
22
23#ifdef CONFIG_BLK_DEV_MD
24extern void md_autodetect_dev(dev_t dev);
25#endif
26
27/*
28 * disk_name() is used by partition check code and the genhd driver.
29 * It formats the devicename of the indicated disk into
30 * the supplied buffer (of size at least 32), and returns
31 * a pointer to that same buffer (for convenience).
32 */
33
34char *disk_name(struct gendisk *hd, int partno, char *buf)
35{
36 if (!partno)
37 snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name);
38 else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1]))
39 snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno);
40 else
41 snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno);
42
43 return buf;
44}
45
46const char *bdevname(struct block_device *bdev, char *buf)
47{
48 return disk_name(bdev->bd_disk, bdev->bd_part->partno, buf);
49}
50
51EXPORT_SYMBOL(bdevname);
52
53/*
54 * There's very little reason to use this, you should really
55 * have a struct block_device just about everywhere and use
56 * bdevname() instead.
57 */
58const char *__bdevname(dev_t dev, char *buffer)
59{
60 scnprintf(buffer, BDEVNAME_SIZE, "unknown-block(%u,%u)",
61 MAJOR(dev), MINOR(dev));
62 return buffer;
63}
64
65EXPORT_SYMBOL(__bdevname);
66
67static ssize_t part_partition_show(struct device *dev,
68 struct device_attribute *attr, char *buf)
69{
70 struct hd_struct *p = dev_to_part(dev);
71
72 return sprintf(buf, "%d\n", p->partno);
73}
74
75static ssize_t part_start_show(struct device *dev,
76 struct device_attribute *attr, char *buf)
77{
78 struct hd_struct *p = dev_to_part(dev);
79
80 return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect);
81}
82
83ssize_t part_size_show(struct device *dev,
84 struct device_attribute *attr, char *buf)
85{
86 struct hd_struct *p = dev_to_part(dev);
87 return sprintf(buf, "%llu\n",(unsigned long long)part_nr_sects_read(p));
88}
89
90static ssize_t part_ro_show(struct device *dev,
91 struct device_attribute *attr, char *buf)
92{
93 struct hd_struct *p = dev_to_part(dev);
94 return sprintf(buf, "%d\n", p->policy ? 1 : 0);
95}
96
97static ssize_t part_alignment_offset_show(struct device *dev,
98 struct device_attribute *attr, char *buf)
99{
100 struct hd_struct *p = dev_to_part(dev);
101 return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset);
102}
103
104static ssize_t part_discard_alignment_show(struct device *dev,
105 struct device_attribute *attr, char *buf)
106{
107 struct hd_struct *p = dev_to_part(dev);
108 return sprintf(buf, "%u\n", p->discard_alignment);
109}
110
111ssize_t part_stat_show(struct device *dev,
112 struct device_attribute *attr, char *buf)
113{
114 struct hd_struct *p = dev_to_part(dev);
115 int cpu;
116
117 cpu = part_stat_lock();
118 part_round_stats(cpu, p);
119 part_stat_unlock();
120 return sprintf(buf,
121 "%8lu %8lu %8llu %8u "
122 "%8lu %8lu %8llu %8u "
123 "%8u %8u %8u"
124 "\n",
125 part_stat_read(p, ios[READ]),
126 part_stat_read(p, merges[READ]),
127 (unsigned long long)part_stat_read(p, sectors[READ]),
128 jiffies_to_msecs(part_stat_read(p, ticks[READ])),
129 part_stat_read(p, ios[WRITE]),
130 part_stat_read(p, merges[WRITE]),
131 (unsigned long long)part_stat_read(p, sectors[WRITE]),
132 jiffies_to_msecs(part_stat_read(p, ticks[WRITE])),
133 part_in_flight(p),
134 jiffies_to_msecs(part_stat_read(p, io_ticks)),
135 jiffies_to_msecs(part_stat_read(p, time_in_queue)));
136}
137
138ssize_t part_inflight_show(struct device *dev,
139 struct device_attribute *attr, char *buf)
140{
141 struct hd_struct *p = dev_to_part(dev);
142
143 return sprintf(buf, "%8u %8u\n", atomic_read(&p->in_flight[0]),
144 atomic_read(&p->in_flight[1]));
145}
146
147#ifdef CONFIG_FAIL_MAKE_REQUEST
148ssize_t part_fail_show(struct device *dev,
149 struct device_attribute *attr, char *buf)
150{
151 struct hd_struct *p = dev_to_part(dev);
152
153 return sprintf(buf, "%d\n", p->make_it_fail);
154}
155
156ssize_t part_fail_store(struct device *dev,
157 struct device_attribute *attr,
158 const char *buf, size_t count)
159{
160 struct hd_struct *p = dev_to_part(dev);
161 int i;
162
163 if (count > 0 && sscanf(buf, "%d", &i) > 0)
164 p->make_it_fail = (i == 0) ? 0 : 1;
165
166 return count;
167}
168#endif
169
170static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
171static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
172static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
173static DEVICE_ATTR(ro, S_IRUGO, part_ro_show, NULL);
174static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
175static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show,
176 NULL);
177static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
178static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
179#ifdef CONFIG_FAIL_MAKE_REQUEST
180static struct device_attribute dev_attr_fail =
181 __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
182#endif
183
184static struct attribute *part_attrs[] = {
185 &dev_attr_partition.attr,
186 &dev_attr_start.attr,
187 &dev_attr_size.attr,
188 &dev_attr_ro.attr,
189 &dev_attr_alignment_offset.attr,
190 &dev_attr_discard_alignment.attr,
191 &dev_attr_stat.attr,
192 &dev_attr_inflight.attr,
193#ifdef CONFIG_FAIL_MAKE_REQUEST
194 &dev_attr_fail.attr,
195#endif
196 NULL
197};
198
199static struct attribute_group part_attr_group = {
200 .attrs = part_attrs,
201};
202
203static const struct attribute_group *part_attr_groups[] = {
204 &part_attr_group,
205#ifdef CONFIG_BLK_DEV_IO_TRACE
206 &blk_trace_attr_group,
207#endif
208 NULL
209};
210
211static void part_release(struct device *dev)
212{
213 struct hd_struct *p = dev_to_part(dev);
214 free_part_stats(p);
215 free_part_info(p);
216 kfree(p);
217}
218
219struct device_type part_type = {
220 .name = "partition",
221 .groups = part_attr_groups,
222 .release = part_release,
223};
224
225static void delete_partition_rcu_cb(struct rcu_head *head)
226{
227 struct hd_struct *part = container_of(head, struct hd_struct, rcu_head);
228
229 part->start_sect = 0;
230 part->nr_sects = 0;
231 part_stat_set_all(part, 0);
232 put_device(part_to_dev(part));
233}
234
235void __delete_partition(struct hd_struct *part)
236{
237 call_rcu(&part->rcu_head, delete_partition_rcu_cb);
238}
239
240void delete_partition(struct gendisk *disk, int partno)
241{
242 struct disk_part_tbl *ptbl = disk->part_tbl;
243 struct hd_struct *part;
244
245 if (partno >= ptbl->len)
246 return;
247
248 part = ptbl->part[partno];
249 if (!part)
250 return;
251
252 blk_free_devt(part_devt(part));
253 rcu_assign_pointer(ptbl->part[partno], NULL);
254 rcu_assign_pointer(ptbl->last_lookup, NULL);
255 kobject_put(part->holder_dir);
256 device_del(part_to_dev(part));
257
258 hd_struct_put(part);
259}
260
261static ssize_t whole_disk_show(struct device *dev,
262 struct device_attribute *attr, char *buf)
263{
264 return 0;
265}
266static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH,
267 whole_disk_show, NULL);
268
269struct hd_struct *add_partition(struct gendisk *disk, int partno,
270 sector_t start, sector_t len, int flags,
271 struct partition_meta_info *info)
272{
273 struct hd_struct *p;
274 dev_t devt = MKDEV(0, 0);
275 struct device *ddev = disk_to_dev(disk);
276 struct device *pdev;
277 struct disk_part_tbl *ptbl;
278 const char *dname;
279 int err;
280
281 err = disk_expand_part_tbl(disk, partno);
282 if (err)
283 return ERR_PTR(err);
284 ptbl = disk->part_tbl;
285
286 if (ptbl->part[partno])
287 return ERR_PTR(-EBUSY);
288
289 p = kzalloc(sizeof(*p), GFP_KERNEL);
290 if (!p)
291 return ERR_PTR(-EBUSY);
292
293 if (!init_part_stats(p)) {
294 err = -ENOMEM;
295 goto out_free;
296 }
297
298 seqcount_init(&p->nr_sects_seq);
299 pdev = part_to_dev(p);
300
301 p->start_sect = start;
302 p->alignment_offset =
303 queue_limit_alignment_offset(&disk->queue->limits, start);
304 p->discard_alignment =
305 queue_limit_discard_alignment(&disk->queue->limits, start);
306 p->nr_sects = len;
307 p->partno = partno;
308 p->policy = get_disk_ro(disk);
309
310 if (info) {
311 struct partition_meta_info *pinfo = alloc_part_info(disk);
312 if (!pinfo)
313 goto out_free_stats;
314 memcpy(pinfo, info, sizeof(*info));
315 p->info = pinfo;
316 }
317
318 dname = dev_name(ddev);
319 if (isdigit(dname[strlen(dname) - 1]))
320 dev_set_name(pdev, "%sp%d", dname, partno);
321 else
322 dev_set_name(pdev, "%s%d", dname, partno);
323
324 device_initialize(pdev);
325 pdev->class = &block_class;
326 pdev->type = &part_type;
327 pdev->parent = ddev;
328
329 err = blk_alloc_devt(p, &devt);
330 if (err)
331 goto out_free_info;
332 pdev->devt = devt;
333
334 /* delay uevent until 'holders' subdir is created */
335 dev_set_uevent_suppress(pdev, 1);
336 err = device_add(pdev);
337 if (err)
338 goto out_put;
339
340 err = -ENOMEM;
341 p->holder_dir = kobject_create_and_add("holders", &pdev->kobj);
342 if (!p->holder_dir)
343 goto out_del;
344
345 dev_set_uevent_suppress(pdev, 0);
346 if (flags & ADDPART_FLAG_WHOLEDISK) {
347 err = device_create_file(pdev, &dev_attr_whole_disk);
348 if (err)
349 goto out_del;
350 }
351
352 /* everything is up and running, commence */
353 rcu_assign_pointer(ptbl->part[partno], p);
354
355 /* suppress uevent if the disk suppresses it */
356 if (!dev_get_uevent_suppress(ddev))
357 kobject_uevent(&pdev->kobj, KOBJ_ADD);
358
359 hd_ref_init(p);
360 return p;
361
362out_free_info:
363 free_part_info(p);
364out_free_stats:
365 free_part_stats(p);
366out_free:
367 kfree(p);
368 return ERR_PTR(err);
369out_del:
370 kobject_put(p->holder_dir);
371 device_del(pdev);
372out_put:
373 put_device(pdev);
374 blk_free_devt(devt);
375 return ERR_PTR(err);
376}
377
378static bool disk_unlock_native_capacity(struct gendisk *disk)
379{
380 const struct block_device_operations *bdops = disk->fops;
381
382 if (bdops->unlock_native_capacity &&
383 !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) {
384 printk(KERN_CONT "enabling native capacity\n");
385 bdops->unlock_native_capacity(disk);
386 disk->flags |= GENHD_FL_NATIVE_CAPACITY;
387 return true;
388 } else {
389 printk(KERN_CONT "truncated\n");
390 return false;
391 }
392}
393
394static int drop_partitions(struct gendisk *disk, struct block_device *bdev)
395{
396 struct disk_part_iter piter;
397 struct hd_struct *part;
398 int res;
399
400 if (bdev->bd_part_count)
401 return -EBUSY;
402 res = invalidate_partition(disk, 0);
403 if (res)
404 return res;
405
406 disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
407 while ((part = disk_part_iter_next(&piter)))
408 delete_partition(disk, part->partno);
409 disk_part_iter_exit(&piter);
410
411 return 0;
412}
413
414int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
415{
416 struct parsed_partitions *state = NULL;
417 struct hd_struct *part;
418 int p, highest, res;
419rescan:
420 if (state && !IS_ERR(state)) {
421 kfree(state);
422 state = NULL;
423 }
424
425 res = drop_partitions(disk, bdev);
426 if (res)
427 return res;
428
429 if (disk->fops->revalidate_disk)
430 disk->fops->revalidate_disk(disk);
431 check_disk_size_change(disk, bdev);
432 bdev->bd_invalidated = 0;
433 if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
434 return 0;
435 if (IS_ERR(state)) {
436 /*
437 * I/O error reading the partition table. If any
438 * partition code tried to read beyond EOD, retry
439 * after unlocking native capacity.
440 */
441 if (PTR_ERR(state) == -ENOSPC) {
442 printk(KERN_WARNING "%s: partition table beyond EOD, ",
443 disk->disk_name);
444 if (disk_unlock_native_capacity(disk))
445 goto rescan;
446 }
447 return -EIO;
448 }
449 /*
450 * If any partition code tried to read beyond EOD, try
451 * unlocking native capacity even if partition table is
452 * successfully read as we could be missing some partitions.
453 */
454 if (state->access_beyond_eod) {
455 printk(KERN_WARNING
456 "%s: partition table partially beyond EOD, ",
457 disk->disk_name);
458 if (disk_unlock_native_capacity(disk))
459 goto rescan;
460 }
461
462 /* tell userspace that the media / partition table may have changed */
463 kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
464
465 /* Detect the highest partition number and preallocate
466 * disk->part_tbl. This is an optimization and not strictly
467 * necessary.
468 */
469 for (p = 1, highest = 0; p < state->limit; p++)
470 if (state->parts[p].size)
471 highest = p;
472
473 disk_expand_part_tbl(disk, highest);
474
475 /* add partitions */
476 for (p = 1; p < state->limit; p++) {
477 sector_t size, from;
478 struct partition_meta_info *info = NULL;
479
480 size = state->parts[p].size;
481 if (!size)
482 continue;
483
484 from = state->parts[p].from;
485 if (from >= get_capacity(disk)) {
486 printk(KERN_WARNING
487 "%s: p%d start %llu is beyond EOD, ",
488 disk->disk_name, p, (unsigned long long) from);
489 if (disk_unlock_native_capacity(disk))
490 goto rescan;
491 continue;
492 }
493
494 if (from + size > get_capacity(disk)) {
495 printk(KERN_WARNING
496 "%s: p%d size %llu extends beyond EOD, ",
497 disk->disk_name, p, (unsigned long long) size);
498
499 if (disk_unlock_native_capacity(disk)) {
500 /* free state and restart */
501 goto rescan;
502 } else {
503 /*
504 * we can not ignore partitions of broken tables
505 * created by for example camera firmware, but
506 * we limit them to the end of the disk to avoid
507 * creating invalid block devices
508 */
509 size = get_capacity(disk) - from;
510 }
511 }
512
513 if (state->parts[p].has_info)
514 info = &state->parts[p].info;
515 part = add_partition(disk, p, from, size,
516 state->parts[p].flags,
517 &state->parts[p].info);
518 if (IS_ERR(part)) {
519 printk(KERN_ERR " %s: p%d could not be added: %ld\n",
520 disk->disk_name, p, -PTR_ERR(part));
521 continue;
522 }
523#ifdef CONFIG_BLK_DEV_MD
524 if (state->parts[p].flags & ADDPART_FLAG_RAID)
525 md_autodetect_dev(part_to_dev(part)->devt);
526#endif
527 }
528 kfree(state);
529 return 0;
530}
531
532int invalidate_partitions(struct gendisk *disk, struct block_device *bdev)
533{
534 int res;
535
536 if (!bdev->bd_invalidated)
537 return 0;
538
539 res = drop_partitions(disk, bdev);
540 if (res)
541 return res;
542
543 set_capacity(disk, 0);
544 check_disk_size_change(disk, bdev);
545 bdev->bd_invalidated = 0;
546 /* tell userspace that the media / partition table may have changed */
547 kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
548
549 return 0;
550}
551
552unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p)
553{
554 struct address_space *mapping = bdev->bd_inode->i_mapping;
555 struct page *page;
556
557 page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)),
558 NULL);
559 if (!IS_ERR(page)) {
560 if (PageError(page))
561 goto fail;
562 p->v = page;
563 return (unsigned char *)page_address(page) + ((n & ((1 << (PAGE_CACHE_SHIFT - 9)) - 1)) << 9);
564fail:
565 page_cache_release(page);
566 }
567 p->v = NULL;
568 return NULL;
569}
570
571EXPORT_SYMBOL(read_dev_sector);
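add_partition() and delete_partition() in the file above are the kernel half of the BLKPG ioctl handled earlier in blkpg_ioctl(); note that start and length arrive in bytes and are shifted down to 512-byte sectors. A user-space sketch of the caller side, where the device node and the offsets are made-up example values and the call requires CAP_SYS_ADMIN:

/* Add partition 1 on a whole-disk device via BLKPG, matching blkpg_ioctl(). */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/blkpg.h>

int main(void)
{
        struct blkpg_partition part;
        struct blkpg_ioctl_arg arg;
        int fd = open("/dev/sdb", O_RDONLY);    /* example device, adjust */

        if (fd < 0) {
                perror("/dev/sdb");
                return 1;
        }
        memset(&part, 0, sizeof(part));
        part.pno = 1;                           /* partition number */
        part.start = 1ULL * 1024 * 1024;        /* byte offset, example value */
        part.length = 64ULL * 1024 * 1024;      /* byte length, example value */

        memset(&arg, 0, sizeof(arg));
        arg.op = BLKPG_ADD_PARTITION;
        arg.datalen = sizeof(part);
        arg.data = &part;

        if (ioctl(fd, BLKPG, &arg) < 0)
                perror("BLKPG_ADD_PARTITION");
        close(fd);
        return 0;
}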
diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig
deleted file mode 100644
index 75a54e1adbb..00000000000
--- a/block/partitions/Kconfig
+++ /dev/null
@@ -1,251 +0,0 @@
1#
2# Partition configuration
3#
4config PARTITION_ADVANCED
5 bool "Advanced partition selection"
6 help
7 Say Y here if you would like to use hard disks under Linux which
8 were partitioned under an operating system running on a different
9 architecture than your Linux system.
10
11 Note that the answer to this question won't directly affect the
12 kernel: saying N will just cause the configurator to skip all
13 the questions about foreign partitioning schemes.
14
15 If unsure, say N.
16
17config ACORN_PARTITION
18 bool "Acorn partition support" if PARTITION_ADVANCED
19 default y if ARCH_ACORN
20 help
21 Support hard disks partitioned under Acorn operating systems.
22
23config ACORN_PARTITION_CUMANA
24 bool "Cumana partition support" if PARTITION_ADVANCED
25 default y if ARCH_ACORN
26 depends on ACORN_PARTITION
27 help
28 Say Y here if you would like to use hard disks under Linux which
29 were partitioned using the Cumana interface on Acorn machines.
30
31config ACORN_PARTITION_EESOX
32 bool "EESOX partition support" if PARTITION_ADVANCED
33 default y if ARCH_ACORN
34 depends on ACORN_PARTITION
35
36config ACORN_PARTITION_ICS
37 bool "ICS partition support" if PARTITION_ADVANCED
38 default y if ARCH_ACORN
39 depends on ACORN_PARTITION
40 help
41 Say Y here if you would like to use hard disks under Linux which
42 were partitioned using the ICS interface on Acorn machines.
43
44config ACORN_PARTITION_ADFS
45 bool "Native filecore partition support" if PARTITION_ADVANCED
46 default y if ARCH_ACORN
47 depends on ACORN_PARTITION
48 help
49 The Acorn Disc Filing System is the standard file system of the
50 RiscOS operating system which runs on Acorn's ARM-based Risc PC
51 systems and the Acorn Archimedes range of machines. If you say
52 `Y' here, Linux will support disk partitions created under ADFS.
53
54config ACORN_PARTITION_POWERTEC
55 bool "PowerTec partition support" if PARTITION_ADVANCED
56 default y if ARCH_ACORN
57 depends on ACORN_PARTITION
58 help
59 Support reading partition tables created on Acorn machines using
60 the PowerTec SCSI drive.
61
62config ACORN_PARTITION_RISCIX
63 bool "RISCiX partition support" if PARTITION_ADVANCED
64 default y if ARCH_ACORN
65 depends on ACORN_PARTITION
66 help
67 Once upon a time, there was a native Unix port for the Acorn series
68 of machines called RISCiX. If you say 'Y' here, Linux will be able
69 to read disks partitioned under RISCiX.
70
71config OSF_PARTITION
72 bool "Alpha OSF partition support" if PARTITION_ADVANCED
73 default y if ALPHA
74 help
75 Say Y here if you would like to use hard disks under Linux which
76 were partitioned on an Alpha machine.
77
78config AMIGA_PARTITION
79 bool "Amiga partition table support" if PARTITION_ADVANCED
80 default y if (AMIGA || AFFS_FS=y)
81 help
82 Say Y here if you would like to use hard disks under Linux which
83 were partitioned under AmigaOS.
84
85config ATARI_PARTITION
86 bool "Atari partition table support" if PARTITION_ADVANCED
87 default y if ATARI
88 help
89 Say Y here if you would like to use hard disks under Linux which
90 were partitioned under the Atari OS.
91
92config IBM_PARTITION
93 bool "IBM disk label and partition support"
94 depends on PARTITION_ADVANCED && S390
95 help
96 Say Y here if you would like to be able to read the hard disk
97 partition table format used by IBM DASD disks operating under CMS.
98 Otherwise, say N.
99
100config MAC_PARTITION
101 bool "Macintosh partition map support" if PARTITION_ADVANCED
102 default y if (MAC || PPC_PMAC)
103 help
104 Say Y here if you would like to use hard disks under Linux which
105 were partitioned on a Macintosh.
106
107config MSDOS_PARTITION
108 bool "PC BIOS (MSDOS partition tables) support" if PARTITION_ADVANCED
109 default y
110 help
111 Say Y here.
112
113config BSD_DISKLABEL
114 bool "BSD disklabel (FreeBSD partition tables) support"
115 depends on PARTITION_ADVANCED && MSDOS_PARTITION
116 help
117 FreeBSD uses its own hard disk partition scheme on your PC. It
118 requires only one entry in the primary partition table of your disk
119 and manages it similarly to DOS extended partitions, putting in its
120 first sector a new partition table in BSD disklabel format. Saying Y
121 here allows you to read these disklabels and further mount FreeBSD
122 partitions from within Linux if you have also said Y to "UFS
123 file system support", above. If you don't know what all this is
124 about, say N.
125
126config MINIX_SUBPARTITION
127 bool "Minix subpartition support"
128 depends on PARTITION_ADVANCED && MSDOS_PARTITION
129 help
130 Minix 2.0.0/2.0.2 subpartition table support for Linux.
131 Say Y here if you want to mount and use Minix 2.0.0/2.0.2
132 subpartitions.
133
134config SOLARIS_X86_PARTITION
135 bool "Solaris (x86) partition table support"
136 depends on PARTITION_ADVANCED && MSDOS_PARTITION
137 help
138 Like most systems, Solaris x86 uses its own hard disk partition
139 table format, incompatible with all others. Saying Y here allows you
140 to read these partition tables and further mount Solaris x86
141 partitions from within Linux if you have also said Y to "UFS
142 file system support", above.
143
144config UNIXWARE_DISKLABEL
145 bool "Unixware slices support"
146 depends on PARTITION_ADVANCED && MSDOS_PARTITION
147 ---help---
148 Like some systems, UnixWare uses its own slice table inside a
149 partition (VTOC - Virtual Table of Contents). Its format is
150 incompatible with all other OSes. Saying Y here allows you to read
151 VTOC and further mount UnixWare partitions read-only from within
152 Linux if you have also said Y to "UFS file system support" or
153 "System V and Coherent file system support", above.
154
155 This is mainly used to carry data from a UnixWare box to your
156 Linux box via a removable medium like magneto-optical, ZIP or
157 removable IDE drives. Note, however, that a good portable way to
158 transport files and directories between unixes (and even other
159 operating systems) is given by the tar program ("man tar" or
160 preferably "info tar").
161
162 If you don't know what all this is about, say N.
163
164config LDM_PARTITION
165 bool "Windows Logical Disk Manager (Dynamic Disk) support"
166 depends on PARTITION_ADVANCED
167 ---help---
168 Say Y here if you would like to use hard disks under Linux which
169 were partitioned using Windows 2000's/XP's or Vista's Logical Disk
170 Manager. They are also known as "Dynamic Disks".
171
172 Note this driver only supports Dynamic Disks with a protective MBR
173 label, i.e. DOS partition table. It does not support GPT labelled
174 Dynamic Disks yet as can be created with Vista.
175
176 Windows 2000 introduced the concept of Dynamic Disks to get around
177 the limitations of the PC's partitioning scheme. The Logical Disk
178 Manager allows the user to repartition a disk and create spanned,
179 mirrored, striped or RAID volumes, all without the need for
180 rebooting.
181
182 Normal partitions are now called Basic Disks under Windows 2000, XP,
183 and Vista.
184
185 For a fuller description read <file:Documentation/ldm.txt>.
186
187 If unsure, say N.
188
189config LDM_DEBUG
190 bool "Windows LDM extra logging"
191 depends on LDM_PARTITION
192 help
193 Say Y here if you would like LDM to log verbosely. This could be
194 helpful if the driver doesn't work as expected and you'd like to
195 report a bug.
196
197 If unsure, say N.
198
199config SGI_PARTITION
200 bool "SGI partition support" if PARTITION_ADVANCED
201 default y if DEFAULT_SGI_PARTITION
202 help
203 Say Y here if you would like to be able to read the hard disk
204 partition table format used by SGI machines.
205
206config ULTRIX_PARTITION
207 bool "Ultrix partition table support" if PARTITION_ADVANCED
208 default y if MACH_DECSTATION
209 help
210 Say Y here if you would like to be able to read the hard disk
211 partition table format used by DEC (now Compaq) Ultrix machines.
212 Otherwise, say N.
213
214config SUN_PARTITION
215 bool "Sun partition tables support" if PARTITION_ADVANCED
216 default y if (SPARC || SUN3 || SUN3X)
217 ---help---
218 Like most systems, SunOS uses its own hard disk partition table
219 format, incompatible with all others. Saying Y here allows you to
220 read these partition tables and further mount SunOS partitions from
221 within Linux if you have also said Y to "UFS file system support",
222 above. This is mainly used to carry data from a SPARC under SunOS to
223 your Linux box via a removable medium like magneto-optical or ZIP
224 drives; note however that a good portable way to transport files and
225 directories between unixes (and even other operating systems) is
226 given by the tar program ("man tar" or preferably "info tar"). If
227 you don't know what all this is about, say N.
228
229config KARMA_PARTITION
230 bool "Karma Partition support"
231 depends on PARTITION_ADVANCED
232 help
233 Say Y here if you would like to mount the Rio Karma MP3 player, as it
234 uses a proprietary partition table.
235
236config EFI_PARTITION
237 bool "EFI GUID Partition support" if PARTITION_ADVANCED
238 default y
239 select CRC32
240 help
241 Say Y here if you would like to use hard disks under Linux which
242 were partitioned using EFI GPT.
243
244config SYSV68_PARTITION
245 bool "SYSV68 partition table support" if PARTITION_ADVANCED
246 default y if VME
247 help
248 Say Y here if you would like to be able to read the hard disk
249 partition table format used by Motorola Delta machines (using
250 sysv68).
251 Otherwise, say N.
diff --git a/block/partitions/Makefile b/block/partitions/Makefile
deleted file mode 100644
index 03af8eac51d..00000000000
--- a/block/partitions/Makefile
+++ /dev/null
@@ -1,20 +0,0 @@
1#
2# Makefile for the linux kernel.
3#
4
5obj-$(CONFIG_BLOCK) := check.o
6
7obj-$(CONFIG_ACORN_PARTITION) += acorn.o
8obj-$(CONFIG_AMIGA_PARTITION) += amiga.o
9obj-$(CONFIG_ATARI_PARTITION) += atari.o
10obj-$(CONFIG_MAC_PARTITION) += mac.o
11obj-$(CONFIG_LDM_PARTITION) += ldm.o
12obj-$(CONFIG_MSDOS_PARTITION) += msdos.o
13obj-$(CONFIG_OSF_PARTITION) += osf.o
14obj-$(CONFIG_SGI_PARTITION) += sgi.o
15obj-$(CONFIG_SUN_PARTITION) += sun.o
16obj-$(CONFIG_ULTRIX_PARTITION) += ultrix.o
17obj-$(CONFIG_IBM_PARTITION) += ibm.o
18obj-$(CONFIG_EFI_PARTITION) += efi.o
19obj-$(CONFIG_KARMA_PARTITION) += karma.o
20obj-$(CONFIG_SYSV68_PARTITION) += sysv68.o
diff --git a/block/partitions/acorn.c b/block/partitions/acorn.c
deleted file mode 100644
index fbeb697374d..00000000000
--- a/block/partitions/acorn.c
+++ /dev/null
@@ -1,556 +0,0 @@
1/*
2 * linux/fs/partitions/acorn.c
3 *
4 * Copyright (c) 1996-2000 Russell King.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 * Scan ADFS partitions on hard disk drives. Unfortunately, there
11 * isn't a standard for partitioning drives on Acorn machines, so
12 * every single manufacturer of SCSI and IDE cards created their own
13 * method.
14 */
15#include <linux/buffer_head.h>
16#include <linux/adfs_fs.h>
17
18#include "check.h"
19#include "acorn.h"
20
21/*
22 * Partition types. (Oh for reusability)
23 */
24#define PARTITION_RISCIX_MFM 1
25#define PARTITION_RISCIX_SCSI 2
26#define PARTITION_LINUX 9
27
28#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
29 defined(CONFIG_ACORN_PARTITION_ADFS)
30static struct adfs_discrecord *
31adfs_partition(struct parsed_partitions *state, char *name, char *data,
32 unsigned long first_sector, int slot)
33{
34 struct adfs_discrecord *dr;
35 unsigned int nr_sects;
36
37 if (adfs_checkbblk(data))
38 return NULL;
39
40 dr = (struct adfs_discrecord *)(data + 0x1c0);
41
42 if (dr->disc_size == 0 && dr->disc_size_high == 0)
43 return NULL;
44
45 nr_sects = (le32_to_cpu(dr->disc_size_high) << 23) |
46 (le32_to_cpu(dr->disc_size) >> 9);
47
48 if (name) {
49 strlcat(state->pp_buf, " [", PAGE_SIZE);
50 strlcat(state->pp_buf, name, PAGE_SIZE);
51 strlcat(state->pp_buf, "]", PAGE_SIZE);
52 }
53 put_partition(state, slot, first_sector, nr_sects);
54 return dr;
55}
56#endif
57
58#ifdef CONFIG_ACORN_PARTITION_RISCIX
59
60struct riscix_part {
61 __le32 start;
62 __le32 length;
63 __le32 one;
64 char name[16];
65};
66
67struct riscix_record {
68 __le32 magic;
69#define RISCIX_MAGIC cpu_to_le32(0x4a657320)
70 __le32 date;
71 struct riscix_part part[8];
72};
73
74#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
75 defined(CONFIG_ACORN_PARTITION_ADFS)
76static int riscix_partition(struct parsed_partitions *state,
77 unsigned long first_sect, int slot,
78 unsigned long nr_sects)
79{
80 Sector sect;
81 struct riscix_record *rr;
82
83 rr = read_part_sector(state, first_sect, &sect);
84 if (!rr)
85 return -1;
86
87 strlcat(state->pp_buf, " [RISCiX]", PAGE_SIZE);
88
89
90 if (rr->magic == RISCIX_MAGIC) {
91 unsigned long size = nr_sects > 2 ? 2 : nr_sects;
92 int part;
93
94 strlcat(state->pp_buf, " <", PAGE_SIZE);
95
96 put_partition(state, slot++, first_sect, size);
97 for (part = 0; part < 8; part++) {
98 if (rr->part[part].one &&
99 memcmp(rr->part[part].name, "All\0", 4)) {
100 put_partition(state, slot++,
101 le32_to_cpu(rr->part[part].start),
102 le32_to_cpu(rr->part[part].length));
103 strlcat(state->pp_buf, "(", PAGE_SIZE);
104 strlcat(state->pp_buf, rr->part[part].name, PAGE_SIZE);
105 strlcat(state->pp_buf, ")", PAGE_SIZE);
106 }
107 }
108
109 strlcat(state->pp_buf, " >\n", PAGE_SIZE);
110 } else {
111 put_partition(state, slot++, first_sect, nr_sects);
112 }
113
114 put_dev_sector(sect);
115 return slot;
116}
117#endif
118#endif
119
120#define LINUX_NATIVE_MAGIC 0xdeafa1de
121#define LINUX_SWAP_MAGIC 0xdeafab1e
122
123struct linux_part {
124 __le32 magic;
125 __le32 start_sect;
126 __le32 nr_sects;
127};
128
129#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
130 defined(CONFIG_ACORN_PARTITION_ADFS)
131static int linux_partition(struct parsed_partitions *state,
132 unsigned long first_sect, int slot,
133 unsigned long nr_sects)
134{
135 Sector sect;
136 struct linux_part *linuxp;
137 unsigned long size = nr_sects > 2 ? 2 : nr_sects;
138
139 strlcat(state->pp_buf, " [Linux]", PAGE_SIZE);
140
141 put_partition(state, slot++, first_sect, size);
142
143 linuxp = read_part_sector(state, first_sect, &sect);
144 if (!linuxp)
145 return -1;
146
147 strlcat(state->pp_buf, " <", PAGE_SIZE);
148 while (linuxp->magic == cpu_to_le32(LINUX_NATIVE_MAGIC) ||
149 linuxp->magic == cpu_to_le32(LINUX_SWAP_MAGIC)) {
150 if (slot == state->limit)
151 break;
152 put_partition(state, slot++, first_sect +
153 le32_to_cpu(linuxp->start_sect),
154 le32_to_cpu(linuxp->nr_sects));
155 linuxp ++;
156 }
157 strlcat(state->pp_buf, " >", PAGE_SIZE);
158
159 put_dev_sector(sect);
160 return slot;
161}
162#endif
163
164#ifdef CONFIG_ACORN_PARTITION_CUMANA
165int adfspart_check_CUMANA(struct parsed_partitions *state)
166{
167 unsigned long first_sector = 0;
168 unsigned int start_blk = 0;
169 Sector sect;
170 unsigned char *data;
171 char *name = "CUMANA/ADFS";
172 int first = 1;
173 int slot = 1;
174
175 /*
176 * Try Cumana style partitions - sector 6 contains ADFS boot block
177 * with pointer to next 'drive'.
178 *
179 * There are unknowns in this code - is the 'cylinder number' of the
180 * next partition relative to the start of this one? I'm assuming
181 * it is.
182 *
183 * Also, which ID did Cumana use?
184 *
185 * This is totally unfinished, and will require more work to get it
186 * going. Hence it is totally untested.
187 */
188 do {
189 struct adfs_discrecord *dr;
190 unsigned int nr_sects;
191
192 data = read_part_sector(state, start_blk * 2 + 6, &sect);
193 if (!data)
194 return -1;
195
196 if (slot == state->limit)
197 break;
198
199 dr = adfs_partition(state, name, data, first_sector, slot++);
200 if (!dr)
201 break;
202
203 name = NULL;
204
205 nr_sects = (data[0x1fd] + (data[0x1fe] << 8)) *
206 (dr->heads + (dr->lowsector & 0x40 ? 1 : 0)) *
207 dr->secspertrack;
208
209 if (!nr_sects)
210 break;
211
212 first = 0;
213 first_sector += nr_sects;
214 start_blk += nr_sects >> (BLOCK_SIZE_BITS - 9);
215 nr_sects = 0; /* hmm - should be partition size */
216
217 switch (data[0x1fc] & 15) {
218 case 0: /* No partition / ADFS? */
219 break;
220
221#ifdef CONFIG_ACORN_PARTITION_RISCIX
222 case PARTITION_RISCIX_SCSI:
223 /* RISCiX - we don't know how to find the next one. */
224 slot = riscix_partition(state, first_sector, slot,
225 nr_sects);
226 break;
227#endif
228
229 case PARTITION_LINUX:
230 slot = linux_partition(state, first_sector, slot,
231 nr_sects);
232 break;
233 }
234 put_dev_sector(sect);
235 if (slot == -1)
236 return -1;
237 } while (1);
238 put_dev_sector(sect);
239 return first ? 0 : 1;
240}
241#endif
242
243#ifdef CONFIG_ACORN_PARTITION_ADFS
244/*
245 * Purpose: allocate ADFS partitions.
246 *
247 * Params : hd - pointer to gendisk structure to store partition info.
248 * dev - device number to access.
249 *
250 * Returns: -1 on error, 0 for no ADFS boot sector, 1 for ok.
251 *
252 * Alloc : hda = whole drive
253 * hda1 = ADFS partition on first drive.
254 * hda2 = non-ADFS partition.
255 */
256int adfspart_check_ADFS(struct parsed_partitions *state)
257{
258 unsigned long start_sect, nr_sects, sectscyl, heads;
259 Sector sect;
260 unsigned char *data;
261 struct adfs_discrecord *dr;
262 unsigned char id;
263 int slot = 1;
264
265 data = read_part_sector(state, 6, &sect);
266 if (!data)
267 return -1;
268
269 dr = adfs_partition(state, "ADFS", data, 0, slot++);
270 if (!dr) {
271 put_dev_sector(sect);
272 return 0;
273 }
274
275 heads = dr->heads + ((dr->lowsector >> 6) & 1);
276 sectscyl = dr->secspertrack * heads;
277 start_sect = ((data[0x1fe] << 8) + data[0x1fd]) * sectscyl;
278 id = data[0x1fc] & 15;
279 put_dev_sector(sect);
280
281 /*
282 * Work out start of non-adfs partition.
283 */
284 nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect;
285
286 if (start_sect) {
287 switch (id) {
288#ifdef CONFIG_ACORN_PARTITION_RISCIX
289 case PARTITION_RISCIX_SCSI:
290 case PARTITION_RISCIX_MFM:
291 slot = riscix_partition(state, start_sect, slot,
292 nr_sects);
293 break;
294#endif
295
296 case PARTITION_LINUX:
297 slot = linux_partition(state, start_sect, slot,
298 nr_sects);
299 break;
300 }
301 }
302 strlcat(state->pp_buf, "\n", PAGE_SIZE);
303 return 1;
304}
305#endif
306
307#ifdef CONFIG_ACORN_PARTITION_ICS
308
309struct ics_part {
310 __le32 start;
311 __le32 size;
312};
313
314static int adfspart_check_ICSLinux(struct parsed_partitions *state,
315 unsigned long block)
316{
317 Sector sect;
318 unsigned char *data = read_part_sector(state, block, &sect);
319 int result = 0;
320
321 if (data) {
322 if (memcmp(data, "LinuxPart", 9) == 0)
323 result = 1;
324 put_dev_sector(sect);
325 }
326
327 return result;
328}
329
330/*
331 * Check for a valid ICS partition using the checksum.
332 */
333static inline int valid_ics_sector(const unsigned char *data)
334{
335 unsigned long sum;
336 int i;
337
338 for (i = 0, sum = 0x50617274; i < 508; i++)
339 sum += data[i];
340
341 sum -= le32_to_cpu(*(__le32 *)(&data[508]));
342
343 return sum == 0;
344}
345
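As an illustrative aside (not part of this patch), the rule enforced by valid_ics_sector() above can be read the other way around: the little-endian word stored at offset 508 must equal 0x50617274 ("Part") plus the sum of the first 508 bytes. A minimal userspace sketch that writes such a checksum, with the seed and offsets taken from the code above:

#include <stdint.h>
#include <stddef.h>

/* Illustrative only: store an ICS checksum that valid_ics_sector() above
 * would accept (seed 0x50617274, sum over bytes 0..507, little-endian
 * result at offset 508).
 */
static void ics_write_checksum(uint8_t sector[512])
{
	uint32_t sum = 0x50617274;	/* "Part" seed, as in the kernel code */
	size_t i;

	for (i = 0; i < 508; i++)
		sum += sector[i];

	sector[508] = sum & 0xff;	/* little-endian store */
	sector[509] = (sum >> 8) & 0xff;
	sector[510] = (sum >> 16) & 0xff;
	sector[511] = (sum >> 24) & 0xff;
}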
346/*
347 * Purpose: allocate ICS partitions.
348 * Params : hd - pointer to gendisk structure to store partition info.
349 * dev - device number to access.
350 * Returns: -1 on error, 0 for no ICS table, 1 for partitions ok.
351 * Alloc : hda = whole drive
352 * hda1 = ADFS partition 0 on first drive.
353 * hda2 = ADFS partition 1 on first drive.
354 * ..etc..
355 */
356int adfspart_check_ICS(struct parsed_partitions *state)
357{
358 const unsigned char *data;
359 const struct ics_part *p;
360 int slot;
361 Sector sect;
362
363 /*
364 * Try ICS style partitions - sector 0 contains partition info.
365 */
366 data = read_part_sector(state, 0, &sect);
367 if (!data)
368 return -1;
369
370 if (!valid_ics_sector(data)) {
371 put_dev_sector(sect);
372 return 0;
373 }
374
375 strlcat(state->pp_buf, " [ICS]", PAGE_SIZE);
376
377 for (slot = 1, p = (const struct ics_part *)data; p->size; p++) {
378 u32 start = le32_to_cpu(p->start);
379 s32 size = le32_to_cpu(p->size); /* yes, it's signed. */
380
381 if (slot == state->limit)
382 break;
383
384 /*
385 * Negative sizes tell the RISC OS ICS driver to ignore
386 * this partition - in effect it says that this does not
387 * contain an ADFS filesystem.
388 */
389 if (size < 0) {
390 size = -size;
391
392 /*
393 * Our own extension - We use the first sector
394 * of the partition to identify what type this
395 * partition is. We must not make this visible
396 * to the filesystem.
397 */
398 if (size > 1 && adfspart_check_ICSLinux(state, start)) {
399 start += 1;
400 size -= 1;
401 }
402 }
403
404 if (size)
405 put_partition(state, slot++, start, size);
406 }
407
408 put_dev_sector(sect);
409 strlcat(state->pp_buf, "\n", PAGE_SIZE);
410 return 1;
411}
412#endif
413
414#ifdef CONFIG_ACORN_PARTITION_POWERTEC
415struct ptec_part {
416 __le32 unused1;
417 __le32 unused2;
418 __le32 start;
419 __le32 size;
420 __le32 unused5;
421 char type[8];
422};
423
424static inline int valid_ptec_sector(const unsigned char *data)
425{
426 unsigned char checksum = 0x2a;
427 int i;
428
429 /*
430 * If it looks like a PC/BIOS partition, then it
431 * probably isn't PowerTec.
432 */
433 if (data[510] == 0x55 && data[511] == 0xaa)
434 return 0;
435
436 for (i = 0; i < 511; i++)
437 checksum += data[i];
438
439 return checksum == data[511];
440}
441
442/*
443 * Purpose: allocate PowerTec partitions.
444 * Params : hd - pointer to gendisk structure to store partition info.
445 * dev - device number to access.
446 * Returns: -1 on error, 0 for no PowerTec table, 1 for partitions ok.
447 * Alloc : hda = whole drive
448 * hda1 = ADFS partition 0 on first drive.
449 * hda2 = ADFS partition 1 on first drive.
450 * ..etc..
451 */
452int adfspart_check_POWERTEC(struct parsed_partitions *state)
453{
454 Sector sect;
455 const unsigned char *data;
456 const struct ptec_part *p;
457 int slot = 1;
458 int i;
459
460 data = read_part_sector(state, 0, &sect);
461 if (!data)
462 return -1;
463
464 if (!valid_ptec_sector(data)) {
465 put_dev_sector(sect);
466 return 0;
467 }
468
469 strlcat(state->pp_buf, " [POWERTEC]", PAGE_SIZE);
470
471 for (i = 0, p = (const struct ptec_part *)data; i < 12; i++, p++) {
472 u32 start = le32_to_cpu(p->start);
473 u32 size = le32_to_cpu(p->size);
474
475 if (size)
476 put_partition(state, slot++, start, size);
477 }
478
479 put_dev_sector(sect);
480 strlcat(state->pp_buf, "\n", PAGE_SIZE);
481 return 1;
482}
483#endif
484
485#ifdef CONFIG_ACORN_PARTITION_EESOX
486struct eesox_part {
487 char magic[6];
488 char name[10];
489 __le32 start;
490 __le32 unused6;
491 __le32 unused7;
492 __le32 unused8;
493};
494
495/*
496 * Guess who created this format?
497 */
498static const char eesox_name[] = {
499 'N', 'e', 'i', 'l', ' ',
500 'C', 'r', 'i', 't', 'c', 'h', 'e', 'l', 'l', ' ', ' '
501};
502
503/*
504 * EESOX SCSI partition format.
505 *
506 * This is a goddamned awful partition format. We don't seem to store
507 * the size of the partition in this table, only the start addresses.
508 *
509 * There are two possibilities where the size comes from:
510 * 1. The individual ADFS boot block entries that are placed on the disk.
511 * 2. The start address of the next entry.
512 */
513int adfspart_check_EESOX(struct parsed_partitions *state)
514{
515 Sector sect;
516 const unsigned char *data;
517 unsigned char buffer[256];
518 struct eesox_part *p;
519 sector_t start = 0;
520 int i, slot = 1;
521
522 data = read_part_sector(state, 7, &sect);
523 if (!data)
524 return -1;
525
526 /*
527 * "Decrypt" the partition table. God knows why...
528 */
529 for (i = 0; i < 256; i++)
530 buffer[i] = data[i] ^ eesox_name[i & 15];
531
532 put_dev_sector(sect);
533
534 for (i = 0, p = (struct eesox_part *)buffer; i < 8; i++, p++) {
535 sector_t next;
536
537 if (memcmp(p->magic, "Eesox", 6))
538 break;
539
540 next = le32_to_cpu(p->start);
541 if (i)
542 put_partition(state, slot++, start, next - start);
543 start = next;
544 }
545
546 if (i != 0) {
547 sector_t size;
548
549 size = get_capacity(state->bdev->bd_disk);
550 put_partition(state, slot++, start, size - start);
551 strlcat(state->pp_buf, "\n", PAGE_SIZE);
552 }
553
554 return i ? 1 : 0;
555}
556#endif
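As an aside (illustrative, not part of this patch), the "decryption" performed above is a plain repeating-key XOR with the 16-byte eesox_name key. A self-contained userspace sketch of the same step:

#include <stdint.h>
#include <stddef.h>

/* Illustrative only: undo the XOR obfuscation of the EESOX partition
 * table (the first 256 bytes of sector 7), using the same 16-byte key
 * as the kernel code above.
 */
static const char eesox_key[16] = {
	'N', 'e', 'i', 'l', ' ',
	'C', 'r', 'i', 't', 'c', 'h', 'e', 'l', 'l', ' ', ' '
};

static void eesox_decode(const uint8_t raw[256], uint8_t out[256])
{
	size_t i;

	for (i = 0; i < 256; i++)
		out[i] = raw[i] ^ eesox_key[i & 15];
}

The decoded buffer is then simply an array of eesox_part records; since only start addresses are stored, each partition's size is the gap to the next record's start, with the last one running to the end of the disk, exactly as the loop above does.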
diff --git a/block/partitions/acorn.h b/block/partitions/acorn.h
deleted file mode 100644
index ede82852969..00000000000
--- a/block/partitions/acorn.h
+++ /dev/null
@@ -1,14 +0,0 @@
1/*
2 * linux/fs/partitions/acorn.h
3 *
4 * Copyright (C) 1996-2001 Russell King.
5 *
6 * I _hate_ this partitioning mess - why can't we have one defined
7 * format, and everyone stick to it?
8 */
9
10int adfspart_check_CUMANA(struct parsed_partitions *state);
11int adfspart_check_ADFS(struct parsed_partitions *state);
12int adfspart_check_ICS(struct parsed_partitions *state);
13int adfspart_check_POWERTEC(struct parsed_partitions *state);
14int adfspart_check_EESOX(struct parsed_partitions *state);
diff --git a/block/partitions/amiga.c b/block/partitions/amiga.c
deleted file mode 100644
index 70cbf44a156..00000000000
--- a/block/partitions/amiga.c
+++ /dev/null
@@ -1,139 +0,0 @@
1/*
2 * fs/partitions/amiga.c
3 *
4 * Code extracted from drivers/block/genhd.c
5 *
6 * Copyright (C) 1991-1998 Linus Torvalds
7 * Re-organised Feb 1998 Russell King
8 */
9
10#include <linux/types.h>
11#include <linux/affs_hardblocks.h>
12
13#include "check.h"
14#include "amiga.h"
15
16static __inline__ u32
17checksum_block(__be32 *m, int size)
18{
19 u32 sum = 0;
20
21 while (size--)
22 sum += be32_to_cpu(*m++);
23 return sum;
24}
25
26int amiga_partition(struct parsed_partitions *state)
27{
28 Sector sect;
29 unsigned char *data;
30 struct RigidDiskBlock *rdb;
31 struct PartitionBlock *pb;
32 int start_sect, nr_sects, blk, part, res = 0;
33 int blksize = 1; /* Multiplier for disk block size */
34 int slot = 1;
35 char b[BDEVNAME_SIZE];
36
37 for (blk = 0; ; blk++, put_dev_sector(sect)) {
38 if (blk == RDB_ALLOCATION_LIMIT)
39 goto rdb_done;
40 data = read_part_sector(state, blk, &sect);
41 if (!data) {
42 if (warn_no_part)
43 printk("Dev %s: unable to read RDB block %d\n",
44 bdevname(state->bdev, b), blk);
45 res = -1;
46 goto rdb_done;
47 }
48 if (*(__be32 *)data != cpu_to_be32(IDNAME_RIGIDDISK))
49 continue;
50
51 rdb = (struct RigidDiskBlock *)data;
52 if (checksum_block((__be32 *)data, be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F) == 0)
53 break;
54 /* Try again with 0xdc..0xdf zeroed, Windows might have
55 * trashed it.
56 */
57 *(__be32 *)(data+0xdc) = 0;
58 if (checksum_block((__be32 *)data,
59 be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F)==0) {
60 printk("Warning: Trashed word at 0xdc in block %d "
61 "ignored in checksum calculation\n",blk);
62 break;
63 }
64
65 printk("Dev %s: RDB in block %d has bad checksum\n",
66 bdevname(state->bdev, b), blk);
67 }
68
69 /* blksize is blocks per 512 byte standard block */
70 blksize = be32_to_cpu( rdb->rdb_BlockBytes ) / 512;
71
72 {
73 char tmp[7 + 10 + 1 + 1];
74
75 /* Be more informative */
76 snprintf(tmp, sizeof(tmp), " RDSK (%d)", blksize * 512);
77 strlcat(state->pp_buf, tmp, PAGE_SIZE);
78 }
79 blk = be32_to_cpu(rdb->rdb_PartitionList);
80 put_dev_sector(sect);
81 for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) {
82 blk *= blksize; /* Read in terms the partition table understands */
83 data = read_part_sector(state, blk, &sect);
84 if (!data) {
85 if (warn_no_part)
86 printk("Dev %s: unable to read partition block %d\n",
87 bdevname(state->bdev, b), blk);
88 res = -1;
89 goto rdb_done;
90 }
91 pb = (struct PartitionBlock *)data;
92 blk = be32_to_cpu(pb->pb_Next);
93 if (pb->pb_ID != cpu_to_be32(IDNAME_PARTITION))
94 continue;
95 if (checksum_block((__be32 *)pb, be32_to_cpu(pb->pb_SummedLongs) & 0x7F) != 0 )
96 continue;
97
98 /* Tell Kernel about it */
99
100 nr_sects = (be32_to_cpu(pb->pb_Environment[10]) + 1 -
101 be32_to_cpu(pb->pb_Environment[9])) *
102 be32_to_cpu(pb->pb_Environment[3]) *
103 be32_to_cpu(pb->pb_Environment[5]) *
104 blksize;
105 if (!nr_sects)
106 continue;
107 start_sect = be32_to_cpu(pb->pb_Environment[9]) *
108 be32_to_cpu(pb->pb_Environment[3]) *
109 be32_to_cpu(pb->pb_Environment[5]) *
110 blksize;
111 put_partition(state,slot++,start_sect,nr_sects);
112 {
113 /* Be even more informative to aid mounting */
114 char dostype[4];
115 char tmp[42];
116
117 __be32 *dt = (__be32 *)dostype;
118 *dt = pb->pb_Environment[16];
119 if (dostype[3] < ' ')
120 snprintf(tmp, sizeof(tmp), " (%c%c%c^%c)",
121 dostype[0], dostype[1],
122 dostype[2], dostype[3] + '@' );
123 else
124 snprintf(tmp, sizeof(tmp), " (%c%c%c%c)",
125 dostype[0], dostype[1],
126 dostype[2], dostype[3]);
127 strlcat(state->pp_buf, tmp, PAGE_SIZE);
128 snprintf(tmp, sizeof(tmp), "(res %d spb %d)",
129 be32_to_cpu(pb->pb_Environment[6]),
130 be32_to_cpu(pb->pb_Environment[4]));
131 strlcat(state->pp_buf, tmp, PAGE_SIZE);
132 }
133 res = 1;
134 }
135 strlcat(state->pp_buf, "\n", PAGE_SIZE);
136
137rdb_done:
138 return res;
139}
diff --git a/block/partitions/amiga.h b/block/partitions/amiga.h
deleted file mode 100644
index d094585cada..00000000000
--- a/block/partitions/amiga.h
+++ /dev/null
@@ -1,6 +0,0 @@
1/*
2 * fs/partitions/amiga.h
3 */
4
5int amiga_partition(struct parsed_partitions *state);
6
diff --git a/block/partitions/atari.c b/block/partitions/atari.c
deleted file mode 100644
index 9875b05e80a..00000000000
--- a/block/partitions/atari.c
+++ /dev/null
@@ -1,149 +0,0 @@
1/*
2 * fs/partitions/atari.c
3 *
4 * Code extracted from drivers/block/genhd.c
5 *
6 * Copyright (C) 1991-1998 Linus Torvalds
7 * Re-organised Feb 1998 Russell King
8 */
9
10#include <linux/ctype.h>
11#include "check.h"
12#include "atari.h"
13
14/* ++guenther: this should be settable by the user ("make config")?
15 */
16#define ICD_PARTS
17
18/* check if a partition entry looks valid -- Atari format is assumed if at
19 least one of the primary entries is ok this way */
20#define VALID_PARTITION(pi,hdsiz) \
21 (((pi)->flg & 1) && \
22 isalnum((pi)->id[0]) && isalnum((pi)->id[1]) && isalnum((pi)->id[2]) && \
23 be32_to_cpu((pi)->st) <= (hdsiz) && \
24 be32_to_cpu((pi)->st) + be32_to_cpu((pi)->siz) <= (hdsiz))
25
26static inline int OK_id(char *s)
27{
28 return memcmp (s, "GEM", 3) == 0 || memcmp (s, "BGM", 3) == 0 ||
29 memcmp (s, "LNX", 3) == 0 || memcmp (s, "SWP", 3) == 0 ||
30 memcmp (s, "RAW", 3) == 0 ;
31}
32
33int atari_partition(struct parsed_partitions *state)
34{
35 Sector sect;
36 struct rootsector *rs;
37 struct partition_info *pi;
38 u32 extensect;
39 u32 hd_size;
40 int slot;
41#ifdef ICD_PARTS
42 int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */
43#endif
44
45 rs = read_part_sector(state, 0, &sect);
46 if (!rs)
47 return -1;
48
49 /* Verify this is an Atari rootsector: */
50 hd_size = state->bdev->bd_inode->i_size >> 9;
51 if (!VALID_PARTITION(&rs->part[0], hd_size) &&
52 !VALID_PARTITION(&rs->part[1], hd_size) &&
53 !VALID_PARTITION(&rs->part[2], hd_size) &&
54 !VALID_PARTITION(&rs->part[3], hd_size)) {
55 /*
56 * if there's no valid primary partition, assume that there is no
57 * Atari-format partition table (there's no reliable magic or the like
58 * :-()
59 */
60 put_dev_sector(sect);
61 return 0;
62 }
63
64 pi = &rs->part[0];
65 strlcat(state->pp_buf, " AHDI", PAGE_SIZE);
66 for (slot = 1; pi < &rs->part[4] && slot < state->limit; slot++, pi++) {
67 struct rootsector *xrs;
68 Sector sect2;
69 ulong partsect;
70
71 if ( !(pi->flg & 1) )
72 continue;
73 /* active partition */
74 if (memcmp (pi->id, "XGM", 3) != 0) {
75 /* we don't care about other id's */
76 put_partition (state, slot, be32_to_cpu(pi->st),
77 be32_to_cpu(pi->siz));
78 continue;
79 }
80 /* extension partition */
81#ifdef ICD_PARTS
82 part_fmt = 1;
83#endif
84 strlcat(state->pp_buf, " XGM<", PAGE_SIZE);
85 partsect = extensect = be32_to_cpu(pi->st);
86 while (1) {
87 xrs = read_part_sector(state, partsect, &sect2);
88 if (!xrs) {
89 printk (" block %ld read failed\n", partsect);
90 put_dev_sector(sect);
91 return -1;
92 }
93
94 /* ++roman: sanity check: bit 0 of flg field must be set */
95 if (!(xrs->part[0].flg & 1)) {
96 printk( "\nFirst sub-partition in extended partition is not valid!\n" );
97 put_dev_sector(sect2);
98 break;
99 }
100
101 put_partition(state, slot,
102 partsect + be32_to_cpu(xrs->part[0].st),
103 be32_to_cpu(xrs->part[0].siz));
104
105 if (!(xrs->part[1].flg & 1)) {
106 /* end of linked partition list */
107 put_dev_sector(sect2);
108 break;
109 }
110 if (memcmp( xrs->part[1].id, "XGM", 3 ) != 0) {
111 printk("\nID of extended partition is not XGM!\n");
112 put_dev_sector(sect2);
113 break;
114 }
115
116 partsect = be32_to_cpu(xrs->part[1].st) + extensect;
117 put_dev_sector(sect2);
118 if (++slot == state->limit) {
119 printk( "\nMaximum number of partitions reached!\n" );
120 break;
121 }
122 }
123 strlcat(state->pp_buf, " >", PAGE_SIZE);
124 }
125#ifdef ICD_PARTS
126 if ( part_fmt!=1 ) { /* no extended partitions -> test ICD-format */
127 pi = &rs->icdpart[0];
128 /* sanity check: no ICD format if first partition invalid */
129 if (OK_id(pi->id)) {
130 strlcat(state->pp_buf, " ICD<", PAGE_SIZE);
131 for (; pi < &rs->icdpart[8] && slot < state->limit; slot++, pi++) {
132 /* accept only GEM,BGM,RAW,LNX,SWP partitions */
133 if (!((pi->flg & 1) && OK_id(pi->id)))
134 continue;
135 part_fmt = 2;
136 put_partition (state, slot,
137 be32_to_cpu(pi->st),
138 be32_to_cpu(pi->siz));
139 }
140 strlcat(state->pp_buf, " >", PAGE_SIZE);
141 }
142 }
143#endif
144 put_dev_sector(sect);
145
146 strlcat(state->pp_buf, "\n", PAGE_SIZE);
147
148 return 1;
149}
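A worked example of the XGM chain addressing implemented above (illustrative, not part of this patch): suppose the primary XGM entry starts at absolute sector 1000, so partsect = extensect = 1000. If the sub-rootsector read there has part[0].st = 2 and part[0].siz = 500, the logical partition registered covers sectors 1002..1501 (partsect + st). If part[1] carries the "XGM" id with st = 600, the next sub-rootsector is read from sector 1600, i.e. extensect + st: the link is relative to the start of the first extension partition, not to the current sub-rootsector.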
diff --git a/block/partitions/atari.h b/block/partitions/atari.h
deleted file mode 100644
index fe2d32a89f3..00000000000
--- a/block/partitions/atari.h
+++ /dev/null
@@ -1,34 +0,0 @@
1/*
2 * fs/partitions/atari.h
3 * Moved by Russell King from:
4 *
5 * linux/include/linux/atari_rootsec.h
6 * definitions for Atari Rootsector layout
7 * by Andreas Schwab (schwab@ls5.informatik.uni-dortmund.de)
8 *
9 * modified for ICD/Supra partitioning scheme restricted to at most 12
10 * partitions
11 * by Guenther Kelleter (guenther@pool.informatik.rwth-aachen.de)
12 */
13
14struct partition_info
15{
16 u8 flg; /* bit 0: active; bit 7: bootable */
17 char id[3]; /* "GEM", "BGM", "XGM", or other */
18 __be32 st; /* start of partition */
19 __be32 siz; /* length of partition */
20};
21
22struct rootsector
23{
24 char unused[0x156]; /* room for boot code */
25 struct partition_info icdpart[8]; /* info for ICD-partitions 5..12 */
26 char unused2[0xc];
27 u32 hd_siz; /* size of disk in blocks */
28 struct partition_info part[4];
29 u32 bsl_st; /* start of bad sector list */
30 u32 bsl_cnt; /* length of bad sector list */
31 u16 checksum; /* checksum for bootable disks */
32} __attribute__((__packed__));
33
34int atari_partition(struct parsed_partitions *state);
diff --git a/block/partitions/check.c b/block/partitions/check.c
deleted file mode 100644
index bc908672c97..00000000000
--- a/block/partitions/check.c
+++ /dev/null
@@ -1,166 +0,0 @@
1/*
2 * fs/partitions/check.c
3 *
4 * Code extracted from drivers/block/genhd.c
5 * Copyright (C) 1991-1998 Linus Torvalds
6 * Re-organised Feb 1998 Russell King
7 *
8 * We now have independent partition support from the
9 * block drivers, which allows all the partition code to
10 * be grouped in one location, and to be mostly self-
11 * contained.
12 *
13 * Added needed MAJORS for new pairs, {hdi,hdj}, {hdk,hdl}
14 */
15
16#include <linux/slab.h>
17#include <linux/ctype.h>
18#include <linux/genhd.h>
19
20#include "check.h"
21
22#include "acorn.h"
23#include "amiga.h"
24#include "atari.h"
25#include "ldm.h"
26#include "mac.h"
27#include "msdos.h"
28#include "osf.h"
29#include "sgi.h"
30#include "sun.h"
31#include "ibm.h"
32#include "ultrix.h"
33#include "efi.h"
34#include "karma.h"
35#include "sysv68.h"
36
37int warn_no_part = 1; /* This is ugly: should make genhd removable-media aware */
38
39static int (*check_part[])(struct parsed_partitions *) = {
40 /*
41 * Probe partition formats with tables at disk address 0
42 * that also have an ADFS boot block at 0xdc0.
43 */
44#ifdef CONFIG_ACORN_PARTITION_ICS
45 adfspart_check_ICS,
46#endif
47#ifdef CONFIG_ACORN_PARTITION_POWERTEC
48 adfspart_check_POWERTEC,
49#endif
50#ifdef CONFIG_ACORN_PARTITION_EESOX
51 adfspart_check_EESOX,
52#endif
53
54 /*
55 * Now move on to formats that only have partition info at
56 * disk address 0xdc0. Since these may also have stale
57 * PC/BIOS partition tables, they need to come before
58 * the msdos entry.
59 */
60#ifdef CONFIG_ACORN_PARTITION_CUMANA
61 adfspart_check_CUMANA,
62#endif
63#ifdef CONFIG_ACORN_PARTITION_ADFS
64 adfspart_check_ADFS,
65#endif
66
67#ifdef CONFIG_EFI_PARTITION
68 efi_partition, /* this must come before msdos */
69#endif
70#ifdef CONFIG_SGI_PARTITION
71 sgi_partition,
72#endif
73#ifdef CONFIG_LDM_PARTITION
74 ldm_partition, /* this must come before msdos */
75#endif
76#ifdef CONFIG_MSDOS_PARTITION
77 msdos_partition,
78#endif
79#ifdef CONFIG_OSF_PARTITION
80 osf_partition,
81#endif
82#ifdef CONFIG_SUN_PARTITION
83 sun_partition,
84#endif
85#ifdef CONFIG_AMIGA_PARTITION
86 amiga_partition,
87#endif
88#ifdef CONFIG_ATARI_PARTITION
89 atari_partition,
90#endif
91#ifdef CONFIG_MAC_PARTITION
92 mac_partition,
93#endif
94#ifdef CONFIG_ULTRIX_PARTITION
95 ultrix_partition,
96#endif
97#ifdef CONFIG_IBM_PARTITION
98 ibm_partition,
99#endif
100#ifdef CONFIG_KARMA_PARTITION
101 karma_partition,
102#endif
103#ifdef CONFIG_SYSV68_PARTITION
104 sysv68_partition,
105#endif
106 NULL
107};
108
109struct parsed_partitions *
110check_partition(struct gendisk *hd, struct block_device *bdev)
111{
112 struct parsed_partitions *state;
113 int i, res, err;
114
115 state = kzalloc(sizeof(struct parsed_partitions), GFP_KERNEL);
116 if (!state)
117 return NULL;
118 state->pp_buf = (char *)__get_free_page(GFP_KERNEL);
119 if (!state->pp_buf) {
120 kfree(state);
121 return NULL;
122 }
123 state->pp_buf[0] = '\0';
124
125 state->bdev = bdev;
126 disk_name(hd, 0, state->name);
127 snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name);
128 if (isdigit(state->name[strlen(state->name)-1]))
129 sprintf(state->name, "p");
130
131 state->limit = disk_max_parts(hd);
132 i = res = err = 0;
133 while (!res && check_part[i]) {
134 memset(&state->parts, 0, sizeof(state->parts));
135 res = check_part[i++](state);
136 if (res < 0) {
137 /* We have hit an I/O error which we don't report now.
138 * But record it, and let the others do their job.
139 */
140 err = res;
141 res = 0;
142 }
143
144 }
145 if (res > 0) {
146 printk(KERN_INFO "%s", state->pp_buf);
147
148 free_page((unsigned long)state->pp_buf);
149 return state;
150 }
151 if (state->access_beyond_eod)
152 err = -ENOSPC;
153 if (err)
154 /* The partition is unrecognized. So report I/O errors if there were any */
155 res = err;
156 if (!res)
157 strlcat(state->pp_buf, " unknown partition table\n", PAGE_SIZE);
158 else if (warn_no_part)
159 strlcat(state->pp_buf, " unable to read partition table\n", PAGE_SIZE);
160
161 printk(KERN_INFO "%s", state->pp_buf);
162
163 free_page((unsigned long)state->pp_buf);
164 kfree(state);
165 return ERR_PTR(res);
166}
diff --git a/block/partitions/check.h b/block/partitions/check.h
deleted file mode 100644
index 52b100311ec..00000000000
--- a/block/partitions/check.h
+++ /dev/null
@@ -1,52 +0,0 @@
1#include <linux/pagemap.h>
2#include <linux/blkdev.h>
3#include <linux/genhd.h>
4
5/*
6 * add_gd_partition adds a partition's details to the device's partition
7 * description.
8 */
9struct parsed_partitions {
10 struct block_device *bdev;
11 char name[BDEVNAME_SIZE];
12 struct {
13 sector_t from;
14 sector_t size;
15 int flags;
16 bool has_info;
17 struct partition_meta_info info;
18 } parts[DISK_MAX_PARTS];
19 int next;
20 int limit;
21 bool access_beyond_eod;
22 char *pp_buf;
23};
24
25struct parsed_partitions *
26check_partition(struct gendisk *, struct block_device *);
27
28static inline void *read_part_sector(struct parsed_partitions *state,
29 sector_t n, Sector *p)
30{
31 if (n >= get_capacity(state->bdev->bd_disk)) {
32 state->access_beyond_eod = true;
33 return NULL;
34 }
35 return read_dev_sector(state->bdev, n, p);
36}
37
38static inline void
39put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size)
40{
41 if (n < p->limit) {
42 char tmp[1 + BDEVNAME_SIZE + 10 + 1];
43
44 p->parts[n].from = from;
45 p->parts[n].size = size;
46 snprintf(tmp, sizeof(tmp), " %s%d", p->name, n);
47 strlcat(p->pp_buf, tmp, PAGE_SIZE);
48 }
49}
50
51extern int warn_no_part;
52
diff --git a/block/partitions/efi.c b/block/partitions/efi.c
deleted file mode 100644
index b62fb88b871..00000000000
--- a/block/partitions/efi.c
+++ /dev/null
@@ -1,670 +0,0 @@
1/************************************************************
2 * EFI GUID Partition Table handling
3 *
4 * http://www.uefi.org/specs/
5 * http://www.intel.com/technology/efi/
6 *
7 * efi.[ch] by Matt Domsch <Matt_Domsch@dell.com>
8 * Copyright 2000,2001,2002,2004 Dell Inc.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 *
24 *
25 * TODO:
26 *
27 * Changelog:
28 * Mon Nov 09 2004 Matt Domsch <Matt_Domsch@dell.com>
29 * - test for valid PMBR and valid PGPT before ever reading
30 * AGPT, allow override with 'gpt' kernel command line option.
31 * - check for first/last_usable_lba outside of size of disk
32 *
33 * Tue Mar 26 2002 Matt Domsch <Matt_Domsch@dell.com>
34 * - Ported to 2.5.7-pre1 and 2.5.7-dj2
35 * - Applied patch to avoid fault in alternate header handling
36 * - cleaned up find_valid_gpt
37 * - On-disk structure and copy in memory is *always* LE now -
38 * swab fields as needed
39 * - remove print_gpt_header()
40 * - only use first max_p partition entries, to keep the kernel minor number
41 * and partition numbers tied.
42 *
43 * Mon Feb 04 2002 Matt Domsch <Matt_Domsch@dell.com>
44 * - Removed __PRIPTR_PREFIX - not being used
45 *
46 * Mon Jan 14 2002 Matt Domsch <Matt_Domsch@dell.com>
47 * - Ported to 2.5.2-pre11 + library crc32 patch Linus applied
48 *
49 * Thu Dec 6 2001 Matt Domsch <Matt_Domsch@dell.com>
50 * - Added compare_gpts().
51 * - moved le_efi_guid_to_cpus() back into this file. GPT is the only
52 * thing that keeps EFI GUIDs on disk.
53 * - Changed gpt structure names and members to be simpler and more Linux-like.
54 *
55 * Wed Oct 17 2001 Matt Domsch <Matt_Domsch@dell.com>
56 * - Removed CONFIG_DEVFS_VOLUMES_UUID code entirely per Martin Wilck
57 *
58 * Wed Oct 10 2001 Matt Domsch <Matt_Domsch@dell.com>
59 * - Changed function comments to DocBook style per Andreas Dilger suggestion.
60 *
61 * Mon Oct 08 2001 Matt Domsch <Matt_Domsch@dell.com>
62 * - Change read_lba() to use the page cache per Al Viro's work.
63 * - print u64s properly on all architectures
64 * - fixed debug_printk(), now Dprintk()
65 *
66 * Mon Oct 01 2001 Matt Domsch <Matt_Domsch@dell.com>
67 * - Style cleanups
68 * - made most functions static
69 * - Endianness addition
70 * - remove test for second alternate header, as it's not per spec,
71 * and is unnecessary. There's now a method to read/write the last
72 * sector of an odd-sized disk from user space. No tools have ever
73 * been released which used this code, so it's effectively dead.
74 * - Per Asit Mallick of Intel, added a test for a valid PMBR.
75 * - Added kernel command line option 'gpt' to override valid PMBR test.
76 *
77 * Wed Jun 6 2001 Martin Wilck <Martin.Wilck@Fujitsu-Siemens.com>
78 * - added devfs volume UUID support (/dev/volumes/uuids) for
79 * mounting file systems by the partition GUID.
80 *
81 * Tue Dec 5 2000 Matt Domsch <Matt_Domsch@dell.com>
82 * - Moved crc32() to linux/lib, added efi_crc32().
83 *
84 * Thu Nov 30 2000 Matt Domsch <Matt_Domsch@dell.com>
85 * - Replaced Intel's CRC32 function with an equivalent
86 * non-license-restricted version.
87 *
88 * Wed Oct 25 2000 Matt Domsch <Matt_Domsch@dell.com>
89 * - Fixed the last_lba() call to return the proper last block
90 *
91 * Thu Oct 12 2000 Matt Domsch <Matt_Domsch@dell.com>
92 * - Thanks to Andries Brouwer for his debugging assistance.
93 * - Code works, detects all the partitions.
94 *
95 ************************************************************/
96#include <linux/crc32.h>
97#include <linux/ctype.h>
98#include <linux/math64.h>
99#include <linux/slab.h>
100#include "check.h"
101#include "efi.h"
102
103/* This allows a kernel command line option 'gpt' to override
104 * the test for invalid PMBR. Not __initdata because reloading
105 * the partition tables happens after init too.
106 */
107static int force_gpt;
108static int __init
109force_gpt_fn(char *str)
110{
111 force_gpt = 1;
112 return 1;
113}
114__setup("gpt", force_gpt_fn);
115
116
117/**
118 * efi_crc32() - EFI version of crc32 function
119 * @buf: buffer to calculate crc32 of
120 * @len: length of buf
121 *
122 * Description: Returns EFI-style CRC32 value for @buf
123 *
124 * This function uses the little endian Ethernet polynomial
125 * but seeds the function with ~0, and xor's with ~0 at the end.
126 * Note, the EFI Specification, v1.02, has a reference to
127 * Dr. Dobbs Journal, May 1994 (actually it's in May 1992).
128 */
129static inline u32
130efi_crc32(const void *buf, unsigned long len)
131{
132 return (crc32(~0L, buf, len) ^ ~0L);
133}
134
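As an aside (not part of this patch), seeding crc32() with ~0 and XOR-ing the result with ~0 makes efi_crc32() equal to the familiar reflected CRC-32 (polynomial 0xEDB88320, init 0xFFFFFFFF, final XOR 0xFFFFFFFF). A minimal bitwise userspace sketch producing the same value; the kernel of course uses the much faster table-driven lib/crc32 instead:

#include <stdint.h>
#include <stddef.h>

/* Illustrative only: bitwise CRC-32 equal to efi_crc32() above
 * (reflected polynomial 0xEDB88320, init ~0, final XOR ~0).
 */
static uint32_t crc32_ieee(const void *buf, size_t len)
{
	const uint8_t *p = buf;
	uint32_t crc = ~0u;
	int i;

	while (len--) {
		crc ^= *p++;
		for (i = 0; i < 8; i++)
			crc = (crc >> 1) ^ ((crc & 1u) ? 0xEDB88320u : 0u);
	}
	return ~crc;
}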
135/**
136 * last_lba(): return number of last logical block of device
137 * @bdev: block device
138 *
139 * Description: Returns last LBA value on success, 0 on error.
140 * This is stored (by sd and ide-geometry) in
141 * the part[0] entry for this disk, and is the number of
142 * physical sectors available on the disk.
143 */
144static u64 last_lba(struct block_device *bdev)
145{
146 if (!bdev || !bdev->bd_inode)
147 return 0;
148 return div_u64(bdev->bd_inode->i_size,
149 bdev_logical_block_size(bdev)) - 1ULL;
150}
151
152static inline int
153pmbr_part_valid(struct partition *part)
154{
155 if (part->sys_ind == EFI_PMBR_OSTYPE_EFI_GPT &&
156 le32_to_cpu(part->start_sect) == 1UL)
157 return 1;
158 return 0;
159}
160
161/**
162 * is_pmbr_valid(): test Protective MBR for validity
163 * @mbr: pointer to a legacy mbr structure
164 *
165 * Description: Returns 1 if PMBR is valid, 0 otherwise.
166 * Validity depends on two things:
167 * 1) MSDOS signature is in the last two bytes of the MBR
168 * 2) One partition of type 0xEE is found
169 */
170static int
171is_pmbr_valid(legacy_mbr *mbr)
172{
173 int i;
174 if (!mbr || le16_to_cpu(mbr->signature) != MSDOS_MBR_SIGNATURE)
175 return 0;
176 for (i = 0; i < 4; i++)
177 if (pmbr_part_valid(&mbr->partition_record[i]))
178 return 1;
179 return 0;
180}
181
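For illustration only (not part of this patch), the same two conditions can be checked on a raw 512-byte sector 0. The offsets follow the legacy_mbr layout in efi.h: 440 bytes of boot code, a 4-byte disk signature and 2 unknown bytes put the partition records at offset 446, and the 2-byte signature at offset 510.

#include <stdint.h>

/* Illustrative only: same checks as is_pmbr_valid() above, applied to a
 * raw 512-byte sector 0.
 */
static int pmbr_looks_valid(const uint8_t *sec)
{
	int i;

	if (sec[510] != 0x55 || sec[511] != 0xAA)	/* MSDOS_MBR_SIGNATURE */
		return 0;

	for (i = 0; i < 4; i++) {
		const uint8_t *p = sec + 446 + 16 * i;	/* partition_record[i] */
		uint32_t start = p[8] | p[9] << 8 | p[10] << 16 |
				 (uint32_t)p[11] << 24;

		if (p[4] == 0xEE && start == 1)	/* type 0xEE starting at LBA 1 */
			return 1;
	}
	return 0;
}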
182/**
183 * read_lba(): Read bytes from disk, starting at given LBA
184 * @state: disk parsed partitions (provides the block device)
185 * @lba: logical block address to start reading from
186 * @buffer: destination buffer
187 * @count: number of bytes to read
188 *
189 * Description: Reads @count bytes from @state->bdev into @buffer.
190 * Returns number of bytes read on success, 0 on error.
191 */
192static size_t read_lba(struct parsed_partitions *state,
193 u64 lba, u8 *buffer, size_t count)
194{
195 size_t totalreadcount = 0;
196 struct block_device *bdev = state->bdev;
197 sector_t n = lba * (bdev_logical_block_size(bdev) / 512);
198
199 if (!buffer || lba > last_lba(bdev))
200 return 0;
201
202 while (count) {
203 int copied = 512;
204 Sector sect;
205 unsigned char *data = read_part_sector(state, n++, &sect);
206 if (!data)
207 break;
208 if (copied > count)
209 copied = count;
210 memcpy(buffer, data, copied);
211 put_dev_sector(sect);
212 buffer += copied;
213 totalreadcount +=copied;
214 count -= copied;
215 }
216 return totalreadcount;
217}
218
219/**
220 * alloc_read_gpt_entries(): reads partition entries from disk
221 * @state
222 * @gpt - GPT header
223 *
224 * Description: Returns ptes on success, NULL on error.
225 * Allocates space for PTEs based on information found in @gpt.
226 * Notes: remember to free pte when you're done!
227 */
228static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state,
229 gpt_header *gpt)
230{
231 size_t count;
232 gpt_entry *pte;
233
234 if (!gpt)
235 return NULL;
236
237 count = le32_to_cpu(gpt->num_partition_entries) *
238 le32_to_cpu(gpt->sizeof_partition_entry);
239 if (!count)
240 return NULL;
241 pte = kzalloc(count, GFP_KERNEL);
242 if (!pte)
243 return NULL;
244
245 if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba),
246 (u8 *) pte,
247 count) < count) {
248 kfree(pte);
249 pte=NULL;
250 return NULL;
251 }
252 return pte;
253}
254
255/**
256 * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk
257 * @state
258 * @lba is the Logical Block Address of the partition table
259 *
260 * Description: returns GPT header on success, NULL on error. Allocates
261 * and fills a GPT header starting at @lba from @state->bdev.
262 * Note: remember to free gpt when finished with it.
263 */
264static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state,
265 u64 lba)
266{
267 gpt_header *gpt;
268 unsigned ssz = bdev_logical_block_size(state->bdev);
269
270 gpt = kzalloc(ssz, GFP_KERNEL);
271 if (!gpt)
272 return NULL;
273
274 if (read_lba(state, lba, (u8 *) gpt, ssz) < ssz) {
275 kfree(gpt);
276 gpt=NULL;
277 return NULL;
278 }
279
280 return gpt;
281}
282
283/**
284 * is_gpt_valid() - tests one GPT header and PTEs for validity
285 * @state
286 * @lba is the logical block address of the GPT header to test
287 * @gpt is a GPT header ptr, filled on return.
288 * @ptes is a PTEs ptr, filled on return.
289 *
290 * Description: returns 1 if valid, 0 on error.
291 * If valid, returns pointers to newly allocated GPT header and PTEs.
292 */
293static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
294 gpt_header **gpt, gpt_entry **ptes)
295{
296 u32 crc, origcrc;
297 u64 lastlba;
298
299 if (!ptes)
300 return 0;
301 if (!(*gpt = alloc_read_gpt_header(state, lba)))
302 return 0;
303
304 /* Check the GUID Partition Table signature */
305 if (le64_to_cpu((*gpt)->signature) != GPT_HEADER_SIGNATURE) {
306 pr_debug("GUID Partition Table Header signature is wrong:"
307 "%lld != %lld\n",
308 (unsigned long long)le64_to_cpu((*gpt)->signature),
309 (unsigned long long)GPT_HEADER_SIGNATURE);
310 goto fail;
311 }
312
313 /* Check the GUID Partition Table header size */
314 if (le32_to_cpu((*gpt)->header_size) >
315 bdev_logical_block_size(state->bdev)) {
316 pr_debug("GUID Partition Table Header size is wrong: %u > %u\n",
317 le32_to_cpu((*gpt)->header_size),
318 bdev_logical_block_size(state->bdev));
319 goto fail;
320 }
321
322 /* Check the GUID Partition Table CRC */
323 origcrc = le32_to_cpu((*gpt)->header_crc32);
324 (*gpt)->header_crc32 = 0;
325 crc = efi_crc32((const unsigned char *) (*gpt), le32_to_cpu((*gpt)->header_size));
326
327 if (crc != origcrc) {
328 pr_debug("GUID Partition Table Header CRC is wrong: %x != %x\n",
329 crc, origcrc);
330 goto fail;
331 }
332 (*gpt)->header_crc32 = cpu_to_le32(origcrc);
333
334 /* Check that the my_lba entry points to the LBA that contains
335 * the GUID Partition Table */
336 if (le64_to_cpu((*gpt)->my_lba) != lba) {
337 pr_debug("GPT my_lba incorrect: %lld != %lld\n",
338 (unsigned long long)le64_to_cpu((*gpt)->my_lba),
339 (unsigned long long)lba);
340 goto fail;
341 }
342
343 /* Check the first_usable_lba and last_usable_lba are
344 * within the disk.
345 */
346 lastlba = last_lba(state->bdev);
347 if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) {
348 pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n",
349 (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba),
350 (unsigned long long)lastlba);
351 goto fail;
352 }
353 if (le64_to_cpu((*gpt)->last_usable_lba) > lastlba) {
354 pr_debug("GPT: last_usable_lba incorrect: %lld > %lld\n",
355 (unsigned long long)le64_to_cpu((*gpt)->last_usable_lba),
356 (unsigned long long)lastlba);
357 goto fail;
358 }
359
360 /* Check that sizeof_partition_entry has the correct value */
361 if (le32_to_cpu((*gpt)->sizeof_partition_entry) != sizeof(gpt_entry)) {
362 pr_debug("GUID Partition Entry Size check failed.\n");
363 goto fail;
364 }
365
366 if (!(*ptes = alloc_read_gpt_entries(state, *gpt)))
367 goto fail;
368
369 /* Check the GUID Partition Entry Array CRC */
370 crc = efi_crc32((const unsigned char *) (*ptes),
371 le32_to_cpu((*gpt)->num_partition_entries) *
372 le32_to_cpu((*gpt)->sizeof_partition_entry));
373
374 if (crc != le32_to_cpu((*gpt)->partition_entry_array_crc32)) {
375 pr_debug("GUID Partition Entry Array CRC check failed.\n");
376 goto fail_ptes;
377 }
378
379 /* We're done, all's well */
380 return 1;
381
382 fail_ptes:
383 kfree(*ptes);
384 *ptes = NULL;
385 fail:
386 kfree(*gpt);
387 *gpt = NULL;
388 return 0;
389}
390
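One detail of is_gpt_valid() worth spelling out (illustrative, not part of this patch): the header CRC covers header_size bytes with the header_crc32 field itself zeroed for the calculation. In the gpt_header layout from efi.h that field sits at byte offset 16 (8-byte signature + 4-byte revision + 4-byte header_size). A small userspace sketch, assuming some standard CRC-32 routine such as the crc32_ieee() sketch shown after efi_crc32():

#include <stdint.h>
#include <string.h>

uint32_t crc32_ieee(const void *buf, size_t len);	/* e.g. the sketch above */

/* Illustrative only: verify a GPT header CRC the way is_gpt_valid() does,
 * zeroing the stored header_crc32 (byte offset 16) for the calculation
 * and restoring it afterwards.
 */
static int gpt_header_crc_ok(uint8_t *hdr, uint32_t header_size)
{
	uint8_t saved[4];
	uint32_t stored, crc;

	memcpy(saved, hdr + 16, 4);
	stored = saved[0] | saved[1] << 8 | saved[2] << 16 |
		 (uint32_t)saved[3] << 24;		/* field is little-endian */

	memset(hdr + 16, 0, 4);
	crc = crc32_ieee(hdr, header_size);
	memcpy(hdr + 16, saved, 4);

	return crc == stored;
}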
391/**
392 * is_pte_valid() - tests one PTE for validity
393 * @pte is the pte to check
394 * @lastlba is last lba of the disk
395 *
396 * Description: returns 1 if valid, 0 on error.
397 */
398static inline int
399is_pte_valid(const gpt_entry *pte, const u64 lastlba)
400{
401 if ((!efi_guidcmp(pte->partition_type_guid, NULL_GUID)) ||
402 le64_to_cpu(pte->starting_lba) > lastlba ||
403 le64_to_cpu(pte->ending_lba) > lastlba)
404 return 0;
405 return 1;
406}
407
408/**
409 * compare_gpts() - Compare the primary and alternate GPT headers
410 * @pgpt is the primary GPT header
411 * @agpt is the alternate GPT header
412 * @lastlba is the last LBA number
413 * Description: Returns nothing. Sanity checks pgpt and agpt fields
414 * and prints warnings on discrepancies.
415 *
416 */
417static void
418compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
419{
420 int error_found = 0;
421 if (!pgpt || !agpt)
422 return;
423 if (le64_to_cpu(pgpt->my_lba) != le64_to_cpu(agpt->alternate_lba)) {
424 printk(KERN_WARNING
425 "GPT:Primary header LBA != Alt. header alternate_lba\n");
426 printk(KERN_WARNING "GPT:%lld != %lld\n",
427 (unsigned long long)le64_to_cpu(pgpt->my_lba),
428 (unsigned long long)le64_to_cpu(agpt->alternate_lba));
429 error_found++;
430 }
431 if (le64_to_cpu(pgpt->alternate_lba) != le64_to_cpu(agpt->my_lba)) {
432 printk(KERN_WARNING
433 "GPT:Primary header alternate_lba != Alt. header my_lba\n");
434 printk(KERN_WARNING "GPT:%lld != %lld\n",
435 (unsigned long long)le64_to_cpu(pgpt->alternate_lba),
436 (unsigned long long)le64_to_cpu(agpt->my_lba));
437 error_found++;
438 }
439 if (le64_to_cpu(pgpt->first_usable_lba) !=
440 le64_to_cpu(agpt->first_usable_lba)) {
441 printk(KERN_WARNING "GPT:first_usable_lbas don't match.\n");
442 printk(KERN_WARNING "GPT:%lld != %lld\n",
443 (unsigned long long)le64_to_cpu(pgpt->first_usable_lba),
444 (unsigned long long)le64_to_cpu(agpt->first_usable_lba));
445 error_found++;
446 }
447 if (le64_to_cpu(pgpt->last_usable_lba) !=
448 le64_to_cpu(agpt->last_usable_lba)) {
449 printk(KERN_WARNING "GPT:last_usable_lbas don't match.\n");
450 printk(KERN_WARNING "GPT:%lld != %lld\n",
451 (unsigned long long)le64_to_cpu(pgpt->last_usable_lba),
452 (unsigned long long)le64_to_cpu(agpt->last_usable_lba));
453 error_found++;
454 }
455 if (efi_guidcmp(pgpt->disk_guid, agpt->disk_guid)) {
456 printk(KERN_WARNING "GPT:disk_guids don't match.\n");
457 error_found++;
458 }
459 if (le32_to_cpu(pgpt->num_partition_entries) !=
460 le32_to_cpu(agpt->num_partition_entries)) {
461 printk(KERN_WARNING "GPT:num_partition_entries don't match: "
462 "0x%x != 0x%x\n",
463 le32_to_cpu(pgpt->num_partition_entries),
464 le32_to_cpu(agpt->num_partition_entries));
465 error_found++;
466 }
467 if (le32_to_cpu(pgpt->sizeof_partition_entry) !=
468 le32_to_cpu(agpt->sizeof_partition_entry)) {
469 printk(KERN_WARNING
470 "GPT:sizeof_partition_entry values don't match: "
471 "0x%x != 0x%x\n",
472 le32_to_cpu(pgpt->sizeof_partition_entry),
473 le32_to_cpu(agpt->sizeof_partition_entry));
474 error_found++;
475 }
476 if (le32_to_cpu(pgpt->partition_entry_array_crc32) !=
477 le32_to_cpu(agpt->partition_entry_array_crc32)) {
478 printk(KERN_WARNING
479 "GPT:partition_entry_array_crc32 values don't match: "
480 "0x%x != 0x%x\n",
481 le32_to_cpu(pgpt->partition_entry_array_crc32),
482 le32_to_cpu(agpt->partition_entry_array_crc32));
483 error_found++;
484 }
485 if (le64_to_cpu(pgpt->alternate_lba) != lastlba) {
486 printk(KERN_WARNING
487 "GPT:Primary header thinks Alt. header is not at the end of the disk.\n");
488 printk(KERN_WARNING "GPT:%lld != %lld\n",
489 (unsigned long long)le64_to_cpu(pgpt->alternate_lba),
490 (unsigned long long)lastlba);
491 error_found++;
492 }
493
494 if (le64_to_cpu(agpt->my_lba) != lastlba) {
495 printk(KERN_WARNING
496 "GPT:Alternate GPT header not at the end of the disk.\n");
497 printk(KERN_WARNING "GPT:%lld != %lld\n",
498 (unsigned long long)le64_to_cpu(agpt->my_lba),
499 (unsigned long long)lastlba);
500 error_found++;
501 }
502
503 if (error_found)
504 printk(KERN_WARNING
505 "GPT: Use GNU Parted to correct GPT errors.\n");
506 return;
507}
508
509/**
510 * find_valid_gpt() - Search disk for valid GPT headers and PTEs
511 * @state
512 * @gpt is a GPT header ptr, filled on return.
513 * @ptes is a PTEs ptr, filled on return.
514 * Description: Returns 1 if valid, 0 on error.
515 * If valid, returns pointers to newly allocated GPT header and PTEs.
516 * Validity depends on PMBR being valid (or being overridden by the
517 * 'gpt' kernel command line option) and finding either the Primary
518 * GPT header and PTEs valid, or the Alternate GPT header and PTEs
519 * valid. If the Primary GPT header is not valid, the Alternate GPT header
520 * is not checked unless the 'gpt' kernel command line option is passed.
521 * This protects against devices which misreport their size, and forces
522 * the user to decide to use the Alternate GPT.
523 */
524static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt,
525 gpt_entry **ptes)
526{
527 int good_pgpt = 0, good_agpt = 0, good_pmbr = 0;
528 gpt_header *pgpt = NULL, *agpt = NULL;
529 gpt_entry *pptes = NULL, *aptes = NULL;
530 legacy_mbr *legacymbr;
531 u64 lastlba;
532
533 if (!ptes)
534 return 0;
535
536 lastlba = last_lba(state->bdev);
537 if (!force_gpt) {
538 /* This will be added to the EFI Spec. per Intel after v1.02. */
539 legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL);
540 if (legacymbr) {
541 read_lba(state, 0, (u8 *) legacymbr,
542 sizeof (*legacymbr));
543 good_pmbr = is_pmbr_valid(legacymbr);
544 kfree(legacymbr);
545 }
546 if (!good_pmbr)
547 goto fail;
548 }
549
550 good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA,
551 &pgpt, &pptes);
552 if (good_pgpt)
553 good_agpt = is_gpt_valid(state,
554 le64_to_cpu(pgpt->alternate_lba),
555 &agpt, &aptes);
556 if (!good_agpt && force_gpt)
557 good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes);
558
559 /* The obviously unsuccessful case */
560 if (!good_pgpt && !good_agpt)
561 goto fail;
562
563 compare_gpts(pgpt, agpt, lastlba);
564
565 /* The good cases */
566 if (good_pgpt) {
567 *gpt = pgpt;
568 *ptes = pptes;
569 kfree(agpt);
570 kfree(aptes);
571 if (!good_agpt) {
572 printk(KERN_WARNING
573 "Alternate GPT is invalid, "
574 "using primary GPT.\n");
575 }
576 return 1;
577 }
578 else if (good_agpt) {
579 *gpt = agpt;
580 *ptes = aptes;
581 kfree(pgpt);
582 kfree(pptes);
583 printk(KERN_WARNING
584 "Primary GPT is invalid, using alternate GPT.\n");
585 return 1;
586 }
587
588 fail:
589 kfree(pgpt);
590 kfree(agpt);
591 kfree(pptes);
592 kfree(aptes);
593 *gpt = NULL;
594 *ptes = NULL;
595 return 0;
596}
597
598/**
599 * efi_partition(struct parsed_partitions *state)
600 * @state
601 *
602 * Description: called from check.c, if the disk contains GPT
603 * partitions, sets up partition entries in the kernel.
604 *
605 * If the first block on the disk is a legacy MBR,
606 * it will get handled by msdos_partition().
607 * If it's a Protective MBR, we'll handle it here.
608 *
609 * We do not create a Linux partition for GPT, but
610 * only for the actual data partitions.
611 * Returns:
612 * -1 if unable to read the partition table
613 * 0 if this isn't our partition table
614 * 1 if successful
615 *
616 */
617int efi_partition(struct parsed_partitions *state)
618{
619 gpt_header *gpt = NULL;
620 gpt_entry *ptes = NULL;
621 u32 i;
622 unsigned ssz = bdev_logical_block_size(state->bdev) / 512;
623
624 if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) {
625 kfree(gpt);
626 kfree(ptes);
627 return 0;
628 }
629
630 pr_debug("GUID Partition Table is valid! Yea!\n");
631
632 for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) {
633 struct partition_meta_info *info;
634 unsigned label_count = 0;
635 unsigned label_max;
636 u64 start = le64_to_cpu(ptes[i].starting_lba);
637 u64 size = le64_to_cpu(ptes[i].ending_lba) -
638 le64_to_cpu(ptes[i].starting_lba) + 1ULL;
639
640 if (!is_pte_valid(&ptes[i], last_lba(state->bdev)))
641 continue;
642
643 put_partition(state, i+1, start * ssz, size * ssz);
644
645 /* If this is a RAID volume, tell md */
646 if (!efi_guidcmp(ptes[i].partition_type_guid,
647 PARTITION_LINUX_RAID_GUID))
648 state->parts[i + 1].flags = ADDPART_FLAG_RAID;
649
650 info = &state->parts[i + 1].info;
651 efi_guid_unparse(&ptes[i].unique_partition_guid, info->uuid);
652
653 /* Naively convert UTF16-LE to 7 bits. */
654 label_max = min(sizeof(info->volname) - 1,
655 sizeof(ptes[i].partition_name));
656 info->volname[label_max] = 0;
657 while (label_count < label_max) {
658 u8 c = ptes[i].partition_name[label_count] & 0xff;
659 if (c && !isprint(c))
660 c = '!';
661 info->volname[label_count] = c;
662 label_count++;
663 }
664 state->parts[i + 1].has_info = true;
665 }
666 kfree(ptes);
667 kfree(gpt);
668 strlcat(state->pp_buf, "\n", PAGE_SIZE);
669 return 1;
670}
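A quick worked example of the unit conversion above (illustrative, not part of this patch): GPT LBAs are in logical-block units, while put_partition() takes 512-byte sectors, hence the ssz multiplier. On a disk with 4096-byte logical blocks, ssz = 4096 / 512 = 8; a PTE with starting_lba = 0x800 and ending_lba = 0x17ff is registered with start = 0x800 * 8 = 16384 sectors and size = (0x17ff - 0x800 + 1) * 8 = 4096 * 8 = 32768 sectors, i.e. 16 MiB.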
diff --git a/block/partitions/efi.h b/block/partitions/efi.h
deleted file mode 100644
index b69ab729558..00000000000
--- a/block/partitions/efi.h
+++ /dev/null
@@ -1,134 +0,0 @@
1/************************************************************
2 * EFI GUID Partition Table
3 * Per Intel EFI Specification v1.02
4 * http://developer.intel.com/technology/efi/efi.htm
5 *
6 * By Matt Domsch <Matt_Domsch@dell.com> Fri Sep 22 22:15:56 CDT 2000
7 * Copyright 2000,2001 Dell Inc.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 *
23 ************************************************************/
24
25#ifndef FS_PART_EFI_H_INCLUDED
26#define FS_PART_EFI_H_INCLUDED
27
28#include <linux/types.h>
29#include <linux/fs.h>
30#include <linux/genhd.h>
31#include <linux/kernel.h>
32#include <linux/major.h>
33#include <linux/string.h>
34#include <linux/efi.h>
35
36#define MSDOS_MBR_SIGNATURE 0xaa55
37#define EFI_PMBR_OSTYPE_EFI 0xEF
38#define EFI_PMBR_OSTYPE_EFI_GPT 0xEE
39
40#define GPT_HEADER_SIGNATURE 0x5452415020494645ULL
41#define GPT_HEADER_REVISION_V1 0x00010000
42#define GPT_PRIMARY_PARTITION_TABLE_LBA 1
43
44#define PARTITION_SYSTEM_GUID \
45 EFI_GUID( 0xC12A7328, 0xF81F, 0x11d2, \
46 0xBA, 0x4B, 0x00, 0xA0, 0xC9, 0x3E, 0xC9, 0x3B)
47#define LEGACY_MBR_PARTITION_GUID \
48 EFI_GUID( 0x024DEE41, 0x33E7, 0x11d3, \
49 0x9D, 0x69, 0x00, 0x08, 0xC7, 0x81, 0xF3, 0x9F)
50#define PARTITION_MSFT_RESERVED_GUID \
51 EFI_GUID( 0xE3C9E316, 0x0B5C, 0x4DB8, \
52 0x81, 0x7D, 0xF9, 0x2D, 0xF0, 0x02, 0x15, 0xAE)
53#define PARTITION_BASIC_DATA_GUID \
54 EFI_GUID( 0xEBD0A0A2, 0xB9E5, 0x4433, \
55 0x87, 0xC0, 0x68, 0xB6, 0xB7, 0x26, 0x99, 0xC7)
56#define PARTITION_LINUX_RAID_GUID \
57 EFI_GUID( 0xa19d880f, 0x05fc, 0x4d3b, \
58 0xa0, 0x06, 0x74, 0x3f, 0x0f, 0x84, 0x91, 0x1e)
59#define PARTITION_LINUX_SWAP_GUID \
60 EFI_GUID( 0x0657fd6d, 0xa4ab, 0x43c4, \
61 0x84, 0xe5, 0x09, 0x33, 0xc8, 0x4b, 0x4f, 0x4f)
62#define PARTITION_LINUX_LVM_GUID \
63 EFI_GUID( 0xe6d6d379, 0xf507, 0x44c2, \
64 0xa2, 0x3c, 0x23, 0x8f, 0x2a, 0x3d, 0xf9, 0x28)
65
66typedef struct _gpt_header {
67 __le64 signature;
68 __le32 revision;
69 __le32 header_size;
70 __le32 header_crc32;
71 __le32 reserved1;
72 __le64 my_lba;
73 __le64 alternate_lba;
74 __le64 first_usable_lba;
75 __le64 last_usable_lba;
76 efi_guid_t disk_guid;
77 __le64 partition_entry_lba;
78 __le32 num_partition_entries;
79 __le32 sizeof_partition_entry;
80 __le32 partition_entry_array_crc32;
81
82 /* The rest of the logical block is reserved by UEFI and must be zero.
83 * EFI standard handles this by:
84 *
85 * uint8_t reserved2[ BlockSize - 92 ];
86 */
87} __attribute__ ((packed)) gpt_header;
88
89typedef struct _gpt_entry_attributes {
90 u64 required_to_function:1;
91 u64 reserved:47;
92 u64 type_guid_specific:16;
93} __attribute__ ((packed)) gpt_entry_attributes;
94
95typedef struct _gpt_entry {
96 efi_guid_t partition_type_guid;
97 efi_guid_t unique_partition_guid;
98 __le64 starting_lba;
99 __le64 ending_lba;
100 gpt_entry_attributes attributes;
101 efi_char16_t partition_name[72 / sizeof (efi_char16_t)];
102} __attribute__ ((packed)) gpt_entry;
103
104typedef struct _legacy_mbr {
105 u8 boot_code[440];
106 __le32 unique_mbr_signature;
107 __le16 unknown;
108 struct partition partition_record[4];
109 __le16 signature;
110} __attribute__ ((packed)) legacy_mbr;
111
112/* Functions */
113extern int efi_partition(struct parsed_partitions *state);
114
115#endif
116
117/*
118 * Overrides for Emacs so that we follow Linus's tabbing style.
119 * Emacs will notice this stuff at the end of the file and automatically
120 * adjust the settings for this buffer only. This must remain at the end
121 * of the file.
122 * --------------------------------------------------------------------------
123 * Local variables:
124 * c-indent-level: 4
125 * c-brace-imaginary-offset: 0
126 * c-brace-offset: -4
127 * c-argdecl-indent: 4
128 * c-label-offset: -4
129 * c-continued-statement-offset: 4
130 * c-continued-brace-offset: 0
131 * indent-tabs-mode: nil
132 * tab-width: 8
133 * End:
134 */
diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c
deleted file mode 100644
index 47a61474e79..00000000000
--- a/block/partitions/ibm.c
+++ /dev/null
@@ -1,364 +0,0 @@
1/*
2 * Author(s)......: Holger Smolinski <Holger.Smolinski@de.ibm.com>
3 * Volker Sameske <sameske@de.ibm.com>
4 * Bugreports.to..: <Linux390@de.ibm.com>
5 * Copyright IBM Corp. 1999, 2012
6 */
7
8#include <linux/buffer_head.h>
9#include <linux/hdreg.h>
10#include <linux/slab.h>
11#include <asm/dasd.h>
12#include <asm/ebcdic.h>
13#include <asm/uaccess.h>
14#include <asm/vtoc.h>
15
16#include "check.h"
17#include "ibm.h"
18
19
20union label_t {
21 struct vtoc_volume_label_cdl vol;
22 struct vtoc_volume_label_ldl lnx;
23 struct vtoc_cms_label cms;
24};
25
26/*
27 * compute the block number from a
28 * cyl-cyl-head-head structure
29 */
30static sector_t cchh2blk(struct vtoc_cchh *ptr, struct hd_geometry *geo)
31{
32 sector_t cyl;
33 __u16 head;
34
35 /* decode cylinder and heads for large volumes */
36 cyl = ptr->hh & 0xFFF0;
37 cyl <<= 12;
38 cyl |= ptr->cc;
39 head = ptr->hh & 0x000F;
40 return cyl * geo->heads * geo->sectors +
41 head * geo->sectors;
42}
43
44/*
45 * compute the block number from a
46 * cyl-cyl-head-head-block structure
47 */
48static sector_t cchhb2blk(struct vtoc_cchhb *ptr, struct hd_geometry *geo)
49{
50 sector_t cyl;
51 __u16 head;
52
53 /* decode cylinder and heads for large volumes */
54 cyl = ptr->hh & 0xFFF0;
55 cyl <<= 12;
56 cyl |= ptr->cc;
57 head = ptr->hh & 0x000F;
58 return cyl * geo->heads * geo->sectors +
59 head * geo->sectors +
60 ptr->b;
61}
62
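A worked example of the decoding above (illustrative, not part of this patch), for a geometry of 15 heads and 12 sectors per track:

	/* ptr->cc = 0x0001, ptr->hh = 0x0003 */
	cyl  = ((0x0003 & 0xFFF0) << 12) | 0x0001 = 1
	head =   0x0003 & 0x000F                  = 3
	blk  = 1 * 15 * 12 + 3 * 12               = 216

For volumes with more than 65535 cylinders the upper 12 bits of the cylinder number are carried in hh: cc = 0x0000, hh = 0x0010 decodes to cyl = 0x10 << 12 = 65536, head = 0.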
63static int find_label(struct parsed_partitions *state,
64 dasd_information2_t *info,
65 struct hd_geometry *geo,
66 int blocksize,
67 sector_t *labelsect,
68 char name[],
69 char type[],
70 union label_t *label)
71{
72 Sector sect;
73 unsigned char *data;
74 sector_t testsect[3];
75 unsigned char temp[5];
76 int found = 0;
77 int i, testcount;
78
79 /* There are three places where we may find a valid label:
80 * - on an ECKD disk it's block 2
81 * - on an FBA disk it's block 1
82 * - on a CMS-formatted FBA disk it is sector 1, even if the block size
83 * is larger than 512 bytes (possible if the DIAG discipline is used)
84 * If we have a valid info structure, then we know exactly which case we
85 * have, otherwise we just search through all possibilities.
86 */
87 if (info) {
88 if ((info->cu_type == 0x6310 && info->dev_type == 0x9336) ||
89 (info->cu_type == 0x3880 && info->dev_type == 0x3370))
90 testsect[0] = info->label_block;
91 else
92 testsect[0] = info->label_block * (blocksize >> 9);
93 testcount = 1;
94 } else {
95 testsect[0] = 1;
96 testsect[1] = (blocksize >> 9);
97 testsect[2] = 2 * (blocksize >> 9);
98 testcount = 3;
99 }
100 for (i = 0; i < testcount; ++i) {
101 data = read_part_sector(state, testsect[i], &sect);
102 if (data == NULL)
103 continue;
104 memcpy(label, data, sizeof(*label));
105 memcpy(temp, data, 4);
106 temp[4] = 0;
107 EBCASC(temp, 4);
108 put_dev_sector(sect);
109 if (!strcmp(temp, "VOL1") ||
110 !strcmp(temp, "LNX1") ||
111 !strcmp(temp, "CMS1")) {
112 if (!strcmp(temp, "VOL1")) {
113 strncpy(type, label->vol.vollbl, 4);
114 strncpy(name, label->vol.volid, 6);
115 } else {
116 strncpy(type, label->lnx.vollbl, 4);
117 strncpy(name, label->lnx.volid, 6);
118 }
119 EBCASC(type, 4);
120 EBCASC(name, 6);
121 *labelsect = testsect[i];
122 found = 1;
123 break;
124 }
125 }
126 if (!found)
127 memset(label, 0, sizeof(*label));
128
129 return found;
130}
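When no DASD info structure is available, the fallback search above probes the three candidate label locations named in the comment. With an assumed 4096-byte block size the candidate sector numbers work out as below (illustrative sketch only, not part of the driver):

#include <stdio.h>

int main(void)
{
    int blocksize = 4096;                 /* assumed for illustration        */
    long testsect[3];

    testsect[0] = 1;                      /* CMS label in sector 1 (DIAG)    */
    testsect[1] = blocksize >> 9;         /* block 1 on an FBA disk  -> 8    */
    testsect[2] = 2 * (blocksize >> 9);   /* block 2 on an ECKD disk -> 16   */

    printf("%ld %ld %ld\n", testsect[0], testsect[1], testsect[2]);
    return 0;
}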
131
132static int find_vol1_partitions(struct parsed_partitions *state,
133 struct hd_geometry *geo,
134 int blocksize,
135 char name[],
136 union label_t *label)
137{
138 sector_t blk;
139 int counter;
140 char tmp[64];
141 Sector sect;
142 unsigned char *data;
143 loff_t offset, size;
144 struct vtoc_format1_label f1;
145 int secperblk;
146
147 snprintf(tmp, sizeof(tmp), "VOL1/%8s:", name);
148 strlcat(state->pp_buf, tmp, PAGE_SIZE);
149 /*
150 * get start of VTOC from the disk label and then search for format1
151 * and format8 labels
152 */
153 secperblk = blocksize >> 9;
154 blk = cchhb2blk(&label->vol.vtoc, geo) + 1;
155 counter = 0;
156 data = read_part_sector(state, blk * secperblk, &sect);
157 while (data != NULL) {
158 memcpy(&f1, data, sizeof(struct vtoc_format1_label));
159 put_dev_sector(sect);
160		/* skip FMT4 / FMT5 / FMT7 / FMT9 labels */
161 if (f1.DS1FMTID == _ascebc['4']
162 || f1.DS1FMTID == _ascebc['5']
163 || f1.DS1FMTID == _ascebc['7']
164 || f1.DS1FMTID == _ascebc['9']) {
165 blk++;
166 data = read_part_sector(state, blk * secperblk, &sect);
167 continue;
168 }
169 /* only FMT1 and 8 labels valid at this point */
170 if (f1.DS1FMTID != _ascebc['1'] &&
171 f1.DS1FMTID != _ascebc['8'])
172 break;
173 /* OK, we got valid partition data */
174 offset = cchh2blk(&f1.DS1EXT1.llimit, geo);
175 size = cchh2blk(&f1.DS1EXT1.ulimit, geo) -
176 offset + geo->sectors;
177 offset *= secperblk;
178 size *= secperblk;
179 if (counter >= state->limit)
180 break;
181 put_partition(state, counter + 1, offset, size);
182 counter++;
183 blk++;
184 data = read_part_sector(state, blk * secperblk, &sect);
185 }
186 strlcat(state->pp_buf, "\n", PAGE_SIZE);
187
188 if (!data)
189 return -1;
190
191 return 1;
192}
193
194static int find_lnx1_partitions(struct parsed_partitions *state,
195 struct hd_geometry *geo,
196 int blocksize,
197 char name[],
198 union label_t *label,
199 sector_t labelsect,
200 loff_t i_size,
201 dasd_information2_t *info)
202{
203 loff_t offset, geo_size, size;
204 char tmp[64];
205 int secperblk;
206
207 snprintf(tmp, sizeof(tmp), "LNX1/%8s:", name);
208 strlcat(state->pp_buf, tmp, PAGE_SIZE);
209 secperblk = blocksize >> 9;
210 if (label->lnx.ldl_version == 0xf2) {
211 size = label->lnx.formatted_blocks * secperblk;
212 } else {
213 /*
214		 * Formatted without large volume support. If the sanity check
215 * 'size based on geo == size based on i_size' is true, then
216 * we can safely assume that we know the formatted size of
217 * the disk, otherwise we need additional information
218 * that we can only get from a real DASD device.
219 */
220 geo_size = geo->cylinders * geo->heads
221 * geo->sectors * secperblk;
222 size = i_size >> 9;
223 if (size != geo_size) {
224 if (!info) {
225 strlcat(state->pp_buf, "\n", PAGE_SIZE);
226 return 1;
227 }
228 if (!strcmp(info->type, "ECKD"))
229 if (geo_size < size)
230 size = geo_size;
231 /* else keep size based on i_size */
232 }
233 }
234 /* first and only partition starts in the first block after the label */
235 offset = labelsect + secperblk;
236 put_partition(state, 1, offset, size - offset);
237 strlcat(state->pp_buf, "\n", PAGE_SIZE);
238 return 1;
239}
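The sanity check in the non-large-volume branch compares a size derived from the reported geometry with one derived from i_size, both expressed in 512-byte sectors. A standalone sketch with invented numbers (geometry, block size and i_size are all assumptions):

#include <stdio.h>

int main(void)
{
    long long cylinders = 1000, heads = 15, sectors = 12;  /* assumed geometry  */
    int secperblk = 4096 >> 9;                             /* 4 KiB blocks -> 8 */
    long long i_size = 737280000;                          /* bytes, assumed    */

    long long geo_size = cylinders * heads * sectors * secperblk;
    long long size = i_size >> 9;                          /* 512-byte sectors  */

    printf("geo_size=%lld size=%lld match=%d\n", geo_size, size, geo_size == size);
    return 0;
}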
240
241static int find_cms1_partitions(struct parsed_partitions *state,
242 struct hd_geometry *geo,
243 int blocksize,
244 char name[],
245 union label_t *label,
246 sector_t labelsect)
247{
248 loff_t offset, size;
249 char tmp[64];
250 int secperblk;
251
252 /*
253 * VM style CMS1 labeled disk
254 */
255 blocksize = label->cms.block_size;
256 secperblk = blocksize >> 9;
257 if (label->cms.disk_offset != 0) {
258 snprintf(tmp, sizeof(tmp), "CMS1/%8s(MDSK):", name);
259 strlcat(state->pp_buf, tmp, PAGE_SIZE);
260 /* disk is reserved minidisk */
261 offset = label->cms.disk_offset * secperblk;
262 size = (label->cms.block_count - 1) * secperblk;
263 } else {
264 snprintf(tmp, sizeof(tmp), "CMS1/%8s:", name);
265 strlcat(state->pp_buf, tmp, PAGE_SIZE);
266 /*
267 * Special case for FBA devices:
268 * If an FBA device is CMS formatted with blocksize > 512 byte
269 * and the DIAG discipline is used, then the CMS label is found
270 * in sector 1 instead of block 1. However, the partition is
271 * still supposed to start in block 2.
272 */
273 if (labelsect == 1)
274 offset = 2 * secperblk;
275 else
276 offset = labelsect + secperblk;
277 size = label->cms.block_count * secperblk;
278 }
279
280 put_partition(state, 1, offset, size-offset);
281 strlcat(state->pp_buf, "\n", PAGE_SIZE);
282 return 1;
283}
284
285
286/*
287 * This is the main function, called by check.c
288 */
289int ibm_partition(struct parsed_partitions *state)
290{
291 struct block_device *bdev = state->bdev;
292 int blocksize, res;
293 loff_t i_size, offset, size;
294 dasd_information2_t *info;
295 struct hd_geometry *geo;
296 char type[5] = {0,};
297 char name[7] = {0,};
298 sector_t labelsect;
299 union label_t *label;
300
301 res = 0;
302 blocksize = bdev_logical_block_size(bdev);
303 if (blocksize <= 0)
304 goto out_exit;
305 i_size = i_size_read(bdev->bd_inode);
306 if (i_size == 0)
307 goto out_exit;
308 info = kmalloc(sizeof(dasd_information2_t), GFP_KERNEL);
309 if (info == NULL)
310 goto out_exit;
311 geo = kmalloc(sizeof(struct hd_geometry), GFP_KERNEL);
312 if (geo == NULL)
313 goto out_nogeo;
314 label = kmalloc(sizeof(union label_t), GFP_KERNEL);
315 if (label == NULL)
316 goto out_nolab;
317 if (ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)geo) != 0)
318 goto out_freeall;
319 if (ioctl_by_bdev(bdev, BIODASDINFO2, (unsigned long)info) != 0) {
320 kfree(info);
321 info = NULL;
322 }
323
324 if (find_label(state, info, geo, blocksize, &labelsect, name, type,
325 label)) {
326 if (!strncmp(type, "VOL1", 4)) {
327 res = find_vol1_partitions(state, geo, blocksize, name,
328 label);
329 } else if (!strncmp(type, "LNX1", 4)) {
330 res = find_lnx1_partitions(state, geo, blocksize, name,
331 label, labelsect, i_size,
332 info);
333 } else if (!strncmp(type, "CMS1", 4)) {
334 res = find_cms1_partitions(state, geo, blocksize, name,
335 label, labelsect);
336 }
337 } else if (info) {
338 /*
339 * ugly but needed for backward compatibility:
340 * If the block device is a DASD (i.e. BIODASDINFO2 works),
341 * then we claim it in any case, even though it has no valid
342 * label. If it has the LDL format, then we simply define a
343 * partition as if it had an LNX1 label.
344 */
345 res = 1;
346 if (info->format == DASD_FORMAT_LDL) {
347 strlcat(state->pp_buf, "(nonl)", PAGE_SIZE);
348 size = i_size >> 9;
349 offset = (info->label_block + 1) * (blocksize >> 9);
350 put_partition(state, 1, offset, size-offset);
351 strlcat(state->pp_buf, "\n", PAGE_SIZE);
352 }
353 } else
354 res = 0;
355
356out_freeall:
357 kfree(label);
358out_nolab:
359 kfree(geo);
360out_nogeo:
361 kfree(info);
362out_exit:
363 return res;
364}
diff --git a/block/partitions/ibm.h b/block/partitions/ibm.h
deleted file mode 100644
index 08fb0804a81..00000000000
--- a/block/partitions/ibm.h
+++ /dev/null
@@ -1 +0,0 @@
1int ibm_partition(struct parsed_partitions *);
diff --git a/block/partitions/karma.c b/block/partitions/karma.c
deleted file mode 100644
index 0ea19312706..00000000000
--- a/block/partitions/karma.c
+++ /dev/null
@@ -1,57 +0,0 @@
1/*
2 * fs/partitions/karma.c
3 * Rio Karma partition info.
4 *
5 * Copyright (C) 2006 Bob Copeland (me@bobcopeland.com)
6 * based on osf.c
7 */
8
9#include "check.h"
10#include "karma.h"
11
12int karma_partition(struct parsed_partitions *state)
13{
14 int i;
15 int slot = 1;
16 Sector sect;
17 unsigned char *data;
18 struct disklabel {
19 u8 d_reserved[270];
20 struct d_partition {
21 __le32 p_res;
22 u8 p_fstype;
23 u8 p_res2[3];
24 __le32 p_offset;
25 __le32 p_size;
26 } d_partitions[2];
27 u8 d_blank[208];
28 __le16 d_magic;
29 } __attribute__((packed)) *label;
30 struct d_partition *p;
31
32 data = read_part_sector(state, 0, &sect);
33 if (!data)
34 return -1;
35
36 label = (struct disklabel *)data;
37 if (le16_to_cpu(label->d_magic) != KARMA_LABEL_MAGIC) {
38 put_dev_sector(sect);
39 return 0;
40 }
41
42 p = label->d_partitions;
43 for (i = 0 ; i < 2; i++, p++) {
44 if (slot == state->limit)
45 break;
46
47 if (p->p_fstype == 0x4d && le32_to_cpu(p->p_size)) {
48 put_partition(state, slot, le32_to_cpu(p->p_offset),
49 le32_to_cpu(p->p_size));
50 }
51 slot++;
52 }
53 strlcat(state->pp_buf, "\n", PAGE_SIZE);
54 put_dev_sector(sect);
55 return 1;
56}
57
diff --git a/block/partitions/karma.h b/block/partitions/karma.h
deleted file mode 100644
index c764b2e9df2..00000000000
--- a/block/partitions/karma.h
+++ /dev/null
@@ -1,8 +0,0 @@
1/*
2 * fs/partitions/karma.h
3 */
4
5#define KARMA_LABEL_MAGIC 0xAB56
6
7int karma_partition(struct parsed_partitions *state);
8
diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c
deleted file mode 100644
index e507cfbd044..00000000000
--- a/block/partitions/ldm.c
+++ /dev/null
@@ -1,1567 +0,0 @@
1/**
2 * ldm - Support for Windows Logical Disk Manager (Dynamic Disks)
3 *
4 * Copyright (C) 2001,2002 Richard Russon <ldm@flatcap.org>
5 * Copyright (c) 2001-2012 Anton Altaparmakov
6 * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com>
7 *
8 * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads
9 *
10 * This program is free software; you can redistribute it and/or modify it under
11 * the terms of the GNU General Public License as published by the Free Software
12 * Foundation; either version 2 of the License, or (at your option) any later
13 * version.
14 *
15 * This program is distributed in the hope that it will be useful, but WITHOUT
16 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
18 * details.
19 *
20 * You should have received a copy of the GNU General Public License along with
21 * this program (in the main directory of the source in the file COPYING); if
22 * not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
23 * Boston, MA 02111-1307 USA
24 */
25
26#include <linux/slab.h>
27#include <linux/pagemap.h>
28#include <linux/stringify.h>
29#include <linux/kernel.h>
30#include "ldm.h"
31#include "check.h"
32#include "msdos.h"
33
34/**
35 * ldm_debug/info/error/crit - Output an error message
36 * @f: A printf format string containing the message
37 * @...: Variables to substitute into @f
38 *
39 * ldm_debug() writes a DEBUG level message to the syslog but only if the
40 * driver was compiled with debug enabled. Otherwise, the call turns into a NOP.
41 */
42#ifndef CONFIG_LDM_DEBUG
43#define ldm_debug(...) do {} while (0)
44#else
45#define ldm_debug(f, a...) _ldm_printk (KERN_DEBUG, __func__, f, ##a)
46#endif
47
48#define ldm_crit(f, a...) _ldm_printk (KERN_CRIT, __func__, f, ##a)
49#define ldm_error(f, a...) _ldm_printk (KERN_ERR, __func__, f, ##a)
50#define ldm_info(f, a...) _ldm_printk (KERN_INFO, __func__, f, ##a)
51
52static __printf(3, 4)
53void _ldm_printk(const char *level, const char *function, const char *fmt, ...)
54{
55 struct va_format vaf;
56 va_list args;
57
58 va_start (args, fmt);
59
60 vaf.fmt = fmt;
61 vaf.va = &args;
62
63 printk("%s%s(): %pV\n", level, function, &vaf);
64
65 va_end(args);
66}
67
68/**
69 * ldm_parse_hexbyte - Convert an ASCII hex number to a byte
70 * @src: Pointer to at least 2 characters to convert.
71 *
72 * Convert a two character ASCII hex string to a number.
73 *
74 * Return: 0-255 Success, the byte was parsed correctly
75 * -1 Error, an invalid character was supplied
76 */
77static int ldm_parse_hexbyte (const u8 *src)
78{
79 unsigned int x; /* For correct wrapping */
80 int h;
81
82 /* high part */
83 x = h = hex_to_bin(src[0]);
84 if (h < 0)
85 return -1;
86
87 /* low part */
88 h = hex_to_bin(src[1]);
89 if (h < 0)
90 return -1;
91
92 return (x << 4) + h;
93}
94
95/**
96 * ldm_parse_guid - Convert GUID from ASCII to binary
97 * @src: 36 char string of the form fa50ff2b-f2e8-45de-83fa-65417f2f49ba
98 * @dest: Memory block to hold binary GUID (16 bytes)
99 *
100 * N.B. The GUID need not be NULL terminated.
101 *
102 * Return: 'true' @dest contains binary GUID
103 * 'false' @dest contents are undefined
104 */
105static bool ldm_parse_guid (const u8 *src, u8 *dest)
106{
107 static const int size[] = { 4, 2, 2, 2, 6 };
108 int i, j, v;
109
110 if (src[8] != '-' || src[13] != '-' ||
111 src[18] != '-' || src[23] != '-')
112 return false;
113
114 for (j = 0; j < 5; j++, src++)
115 for (i = 0; i < size[j]; i++, src+=2, *dest++ = v)
116 if ((v = ldm_parse_hexbyte (src)) < 0)
117 return false;
118
119 return true;
120}
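As an aside, the textual GUID parsed above is five dash-separated groups of 4, 2, 2, 2 and 6 bytes. A hypothetical userspace snippet, using plain sscanf() rather than the kernel's hex_to_bin(), that flattens the example GUID from the kerneldoc into its 16 binary bytes:

#include <stdio.h>

int main(void)
{
    const char *guid = "fa50ff2b-f2e8-45de-83fa-65417f2f49ba"; /* example from the comment */
    unsigned char out[16];
    unsigned int byte;
    int i, n = 0;

    for (i = 0; guid[i] && n < 16; ) {
        if (guid[i] == '-') {          /* group separators carry no data */
            i++;
            continue;
        }
        if (sscanf(guid + i, "%2x", &byte) != 1)
            return 1;
        out[n++] = (unsigned char)byte;
        i += 2;
    }
    for (i = 0; i < n; i++)
        printf("%02x", out[i]);        /* the 16 bytes ldm_parse_guid() would produce */
    printf("\n");
    return 0;
}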
121
122/**
123 * ldm_parse_privhead - Read the LDM Database PRIVHEAD structure
124 * @data: Raw database PRIVHEAD structure loaded from the device
125 * @ph: In-memory privhead structure in which to return parsed information
126 *
127 * This parses the LDM database PRIVHEAD structure supplied in @data and
128 * sets up the in-memory privhead structure @ph with the obtained information.
129 *
130 * Return: 'true' @ph contains the PRIVHEAD data
131 * 'false' @ph contents are undefined
132 */
133static bool ldm_parse_privhead(const u8 *data, struct privhead *ph)
134{
135 bool is_vista = false;
136
137 BUG_ON(!data || !ph);
138 if (MAGIC_PRIVHEAD != get_unaligned_be64(data)) {
139 ldm_error("Cannot find PRIVHEAD structure. LDM database is"
140 " corrupt. Aborting.");
141 return false;
142 }
143 ph->ver_major = get_unaligned_be16(data + 0x000C);
144 ph->ver_minor = get_unaligned_be16(data + 0x000E);
145 ph->logical_disk_start = get_unaligned_be64(data + 0x011B);
146 ph->logical_disk_size = get_unaligned_be64(data + 0x0123);
147 ph->config_start = get_unaligned_be64(data + 0x012B);
148 ph->config_size = get_unaligned_be64(data + 0x0133);
149 /* Version 2.11 is Win2k/XP and version 2.12 is Vista. */
150 if (ph->ver_major == 2 && ph->ver_minor == 12)
151 is_vista = true;
152 if (!is_vista && (ph->ver_major != 2 || ph->ver_minor != 11)) {
153 ldm_error("Expected PRIVHEAD version 2.11 or 2.12, got %d.%d."
154 " Aborting.", ph->ver_major, ph->ver_minor);
155 return false;
156 }
157 ldm_debug("PRIVHEAD version %d.%d (Windows %s).", ph->ver_major,
158 ph->ver_minor, is_vista ? "Vista" : "2000/XP");
159 if (ph->config_size != LDM_DB_SIZE) { /* 1 MiB in sectors. */
160 /* Warn the user and continue, carefully. */
161 ldm_info("Database is normally %u bytes, it claims to "
162 "be %llu bytes.", LDM_DB_SIZE,
163 (unsigned long long)ph->config_size);
164 }
165 if ((ph->logical_disk_size == 0) || (ph->logical_disk_start +
166 ph->logical_disk_size > ph->config_start)) {
167 ldm_error("PRIVHEAD disk size doesn't match real disk size");
168 return false;
169 }
170 if (!ldm_parse_guid(data + 0x0030, ph->disk_id)) {
171 ldm_error("PRIVHEAD contains an invalid GUID.");
172 return false;
173 }
174 ldm_debug("Parsed PRIVHEAD successfully.");
175 return true;
176}
177
178/**
179 * ldm_parse_tocblock - Read the LDM Database TOCBLOCK structure
180 * @data: Raw database TOCBLOCK structure loaded from the device
181 * @toc: In-memory toc structure in which to return parsed information
182 *
183 * This parses the LDM Database TOCBLOCK (table of contents) structure supplied
184 * in @data and sets up the in-memory tocblock structure @toc with the obtained
185 * information.
186 *
187 * N.B. The *_start and *_size values returned in @toc are not range-checked.
188 *
189 * Return: 'true' @toc contains the TOCBLOCK data
190 * 'false' @toc contents are undefined
191 */
192static bool ldm_parse_tocblock (const u8 *data, struct tocblock *toc)
193{
194 BUG_ON (!data || !toc);
195
196 if (MAGIC_TOCBLOCK != get_unaligned_be64(data)) {
197 ldm_crit ("Cannot find TOCBLOCK, database may be corrupt.");
198 return false;
199 }
200 strncpy (toc->bitmap1_name, data + 0x24, sizeof (toc->bitmap1_name));
201 toc->bitmap1_name[sizeof (toc->bitmap1_name) - 1] = 0;
202 toc->bitmap1_start = get_unaligned_be64(data + 0x2E);
203 toc->bitmap1_size = get_unaligned_be64(data + 0x36);
204
205 if (strncmp (toc->bitmap1_name, TOC_BITMAP1,
206 sizeof (toc->bitmap1_name)) != 0) {
207 ldm_crit ("TOCBLOCK's first bitmap is '%s', should be '%s'.",
208 TOC_BITMAP1, toc->bitmap1_name);
209 return false;
210 }
211 strncpy (toc->bitmap2_name, data + 0x46, sizeof (toc->bitmap2_name));
212 toc->bitmap2_name[sizeof (toc->bitmap2_name) - 1] = 0;
213 toc->bitmap2_start = get_unaligned_be64(data + 0x50);
214 toc->bitmap2_size = get_unaligned_be64(data + 0x58);
215 if (strncmp (toc->bitmap2_name, TOC_BITMAP2,
216 sizeof (toc->bitmap2_name)) != 0) {
217 ldm_crit ("TOCBLOCK's second bitmap is '%s', should be '%s'.",
218 TOC_BITMAP2, toc->bitmap2_name);
219 return false;
220 }
221 ldm_debug ("Parsed TOCBLOCK successfully.");
222 return true;
223}
224
225/**
226 * ldm_parse_vmdb - Read the LDM Database VMDB structure
227 * @data: Raw database VMDB structure loaded from the device
228 * @vm: In-memory vmdb structure in which to return parsed information
229 *
230 * This parses the LDM Database VMDB structure supplied in @data and sets up
231 * the in-memory vmdb structure @vm with the obtained information.
232 *
233 * N.B. The *_start, *_size and *_seq values will be range-checked later.
234 *
235 * Return: 'true' @vm contains VMDB info
236 * 'false' @vm contents are undefined
237 */
238static bool ldm_parse_vmdb (const u8 *data, struct vmdb *vm)
239{
240 BUG_ON (!data || !vm);
241
242 if (MAGIC_VMDB != get_unaligned_be32(data)) {
243 ldm_crit ("Cannot find the VMDB, database may be corrupt.");
244 return false;
245 }
246
247 vm->ver_major = get_unaligned_be16(data + 0x12);
248 vm->ver_minor = get_unaligned_be16(data + 0x14);
249 if ((vm->ver_major != 4) || (vm->ver_minor != 10)) {
250 ldm_error ("Expected VMDB version %d.%d, got %d.%d. "
251 "Aborting.", 4, 10, vm->ver_major, vm->ver_minor);
252 return false;
253 }
254
255 vm->vblk_size = get_unaligned_be32(data + 0x08);
256 if (vm->vblk_size == 0) {
257 ldm_error ("Illegal VBLK size");
258 return false;
259 }
260
261 vm->vblk_offset = get_unaligned_be32(data + 0x0C);
262 vm->last_vblk_seq = get_unaligned_be32(data + 0x04);
263
264 ldm_debug ("Parsed VMDB successfully.");
265 return true;
266}
267
268/**
269 * ldm_compare_privheads - Compare two privhead objects
270 * @ph1: First privhead
271 * @ph2: Second privhead
272 *
273 * This compares the two privhead structures @ph1 and @ph2.
274 *
275 * Return: 'true' Identical
276 * 'false' Different
277 */
278static bool ldm_compare_privheads (const struct privhead *ph1,
279 const struct privhead *ph2)
280{
281 BUG_ON (!ph1 || !ph2);
282
283 return ((ph1->ver_major == ph2->ver_major) &&
284 (ph1->ver_minor == ph2->ver_minor) &&
285 (ph1->logical_disk_start == ph2->logical_disk_start) &&
286 (ph1->logical_disk_size == ph2->logical_disk_size) &&
287 (ph1->config_start == ph2->config_start) &&
288 (ph1->config_size == ph2->config_size) &&
289 !memcmp (ph1->disk_id, ph2->disk_id, GUID_SIZE));
290}
291
292/**
293 * ldm_compare_tocblocks - Compare two tocblock objects
294 * @toc1: First toc
295 * @toc2: Second toc
296 *
297 * This compares the two tocblock structures @toc1 and @toc2.
298 *
299 * Return: 'true' Identical
300 * 'false' Different
301 */
302static bool ldm_compare_tocblocks (const struct tocblock *toc1,
303 const struct tocblock *toc2)
304{
305 BUG_ON (!toc1 || !toc2);
306
307 return ((toc1->bitmap1_start == toc2->bitmap1_start) &&
308 (toc1->bitmap1_size == toc2->bitmap1_size) &&
309 (toc1->bitmap2_start == toc2->bitmap2_start) &&
310 (toc1->bitmap2_size == toc2->bitmap2_size) &&
311 !strncmp (toc1->bitmap1_name, toc2->bitmap1_name,
312 sizeof (toc1->bitmap1_name)) &&
313 !strncmp (toc1->bitmap2_name, toc2->bitmap2_name,
314 sizeof (toc1->bitmap2_name)));
315}
316
317/**
318 * ldm_validate_privheads - Compare the primary privhead with its backups
319 * @state: Partition check state including device holding the LDM Database
320 * @ph1: Memory struct to fill with ph contents
321 *
322 * Read and compare all three privheads from disk.
323 *
324 * The privheads on disk show the size and location of the main disk area and
325 * the configuration area (the database). The values are range-checked against
326 * the real size of the disk reported by @state->bdev.
327 *
328 * Return: 'true' Success
329 * 'false' Error
330 */
331static bool ldm_validate_privheads(struct parsed_partitions *state,
332 struct privhead *ph1)
333{
334 static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 };
335 struct privhead *ph[3] = { ph1 };
336 Sector sect;
337 u8 *data;
338 bool result = false;
339 long num_sects;
340 int i;
341
342 BUG_ON (!state || !ph1);
343
344 ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL);
345 ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL);
346 if (!ph[1] || !ph[2]) {
347 ldm_crit ("Out of memory.");
348 goto out;
349 }
350
351 /* off[1 & 2] are relative to ph[0]->config_start */
352 ph[0]->config_start = 0;
353
354 /* Read and parse privheads */
355 for (i = 0; i < 3; i++) {
356 data = read_part_sector(state, ph[0]->config_start + off[i],
357 &sect);
358 if (!data) {
359 ldm_crit ("Disk read failed.");
360 goto out;
361 }
362 result = ldm_parse_privhead (data, ph[i]);
363 put_dev_sector (sect);
364 if (!result) {
365 ldm_error ("Cannot find PRIVHEAD %d.", i+1); /* Log again */
366 if (i < 2)
367 goto out; /* Already logged */
368 else
369 break; /* FIXME ignore for now, 3rd PH can fail on odd-sized disks */
370 }
371 }
372
373 num_sects = state->bdev->bd_inode->i_size >> 9;
374
375 if ((ph[0]->config_start > num_sects) ||
376 ((ph[0]->config_start + ph[0]->config_size) > num_sects)) {
377 ldm_crit ("Database extends beyond the end of the disk.");
378 goto out;
379 }
380
381 if ((ph[0]->logical_disk_start > ph[0]->config_start) ||
382 ((ph[0]->logical_disk_start + ph[0]->logical_disk_size)
383 > ph[0]->config_start)) {
384 ldm_crit ("Disk and database overlap.");
385 goto out;
386 }
387
388 if (!ldm_compare_privheads (ph[0], ph[1])) {
389 ldm_crit ("Primary and backup PRIVHEADs don't match.");
390 goto out;
391 }
392 /* FIXME ignore this for now
393 if (!ldm_compare_privheads (ph[0], ph[2])) {
394 ldm_crit ("Primary and backup PRIVHEADs don't match.");
395 goto out;
396 }*/
397 ldm_debug ("Validated PRIVHEADs successfully.");
398 result = true;
399out:
400 kfree (ph[1]);
401 kfree (ph[2]);
402 return result;
403}
404
405/**
406 * ldm_validate_tocblocks - Validate the table of contents and its backups
407 * @state: Partition check state including device holding the LDM Database
408 * @base: Offset, into @state->bdev, of the database
409 * @ldb: Cache of the database structures
410 *
411 * Find and compare the four tables of contents of the LDM Database stored on
412 * @state->bdev and return the parsed information into @toc1.
413 *
414 * The offsets and sizes of the configs are range-checked against a privhead.
415 *
416 * Return: 'true' @toc1 contains validated TOCBLOCK info
417 * 'false' @toc1 contents are undefined
418 */
419static bool ldm_validate_tocblocks(struct parsed_partitions *state,
420 unsigned long base, struct ldmdb *ldb)
421{
422 static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4};
423 struct tocblock *tb[4];
424 struct privhead *ph;
425 Sector sect;
426 u8 *data;
427 int i, nr_tbs;
428 bool result = false;
429
430 BUG_ON(!state || !ldb);
431 ph = &ldb->ph;
432 tb[0] = &ldb->toc;
433 tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL);
434 if (!tb[1]) {
435 ldm_crit("Out of memory.");
436 goto err;
437 }
438 tb[2] = (struct tocblock*)((u8*)tb[1] + sizeof(*tb[1]));
439 tb[3] = (struct tocblock*)((u8*)tb[2] + sizeof(*tb[2]));
440 /*
441 * Try to read and parse all four TOCBLOCKs.
442 *
443 * Windows Vista LDM v2.12 does not always have all four TOCBLOCKs so
444 * skip any that fail as long as we get at least one valid TOCBLOCK.
445 */
446 for (nr_tbs = i = 0; i < 4; i++) {
447 data = read_part_sector(state, base + off[i], &sect);
448 if (!data) {
449 ldm_error("Disk read failed for TOCBLOCK %d.", i);
450 continue;
451 }
452 if (ldm_parse_tocblock(data, tb[nr_tbs]))
453 nr_tbs++;
454 put_dev_sector(sect);
455 }
456 if (!nr_tbs) {
457 ldm_crit("Failed to find a valid TOCBLOCK.");
458 goto err;
459 }
460 /* Range check the TOCBLOCK against a privhead. */
461 if (((tb[0]->bitmap1_start + tb[0]->bitmap1_size) > ph->config_size) ||
462 ((tb[0]->bitmap2_start + tb[0]->bitmap2_size) >
463 ph->config_size)) {
464 ldm_crit("The bitmaps are out of range. Giving up.");
465 goto err;
466 }
467 /* Compare all loaded TOCBLOCKs. */
468 for (i = 1; i < nr_tbs; i++) {
469 if (!ldm_compare_tocblocks(tb[0], tb[i])) {
470 ldm_crit("TOCBLOCKs 0 and %d do not match.", i);
471 goto err;
472 }
473 }
474 ldm_debug("Validated %d TOCBLOCKs successfully.", nr_tbs);
475 result = true;
476err:
477 kfree(tb[1]);
478 return result;
479}
480
481/**
482 * ldm_validate_vmdb - Read the VMDB and validate it
483 * @state: Partition check state including device holding the LDM Database
484 * @base: Offset, into @bdev, of the database
485 * @ldb: Cache of the database structures
486 *
487 * Find the vmdb of the LDM Database stored on @bdev and return the parsed
488 * information in @ldb.
489 *
490 * Return: 'true' @ldb contains validated VMDB info
491 * 'false' @ldb contents are undefined
492 */
493static bool ldm_validate_vmdb(struct parsed_partitions *state,
494 unsigned long base, struct ldmdb *ldb)
495{
496 Sector sect;
497 u8 *data;
498 bool result = false;
499 struct vmdb *vm;
500 struct tocblock *toc;
501
502 BUG_ON (!state || !ldb);
503
504 vm = &ldb->vm;
505 toc = &ldb->toc;
506
507 data = read_part_sector(state, base + OFF_VMDB, &sect);
508 if (!data) {
509 ldm_crit ("Disk read failed.");
510 return false;
511 }
512
513 if (!ldm_parse_vmdb (data, vm))
514 goto out; /* Already logged */
515
516 /* Are there uncommitted transactions? */
517 if (get_unaligned_be16(data + 0x10) != 0x01) {
518 ldm_crit ("Database is not in a consistent state. Aborting.");
519 goto out;
520 }
521
522 if (vm->vblk_offset != 512)
523 ldm_info ("VBLKs start at offset 0x%04x.", vm->vblk_offset);
524
525 /*
526	 * The last_vblk_seq can be before the end of the vmdb, just make sure
527 * it is not out of bounds.
528 */
529 if ((vm->vblk_size * vm->last_vblk_seq) > (toc->bitmap1_size << 9)) {
530 ldm_crit ("VMDB exceeds allowed size specified by TOCBLOCK. "
531 "Database is corrupt. Aborting.");
532 goto out;
533 }
534
535 result = true;
536out:
537 put_dev_sector (sect);
538 return result;
539}
540
541
542/**
543 * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk
544 * @state: Partition check state including device holding the LDM Database
545 *
546 * This function provides a weak test to decide whether the device is a dynamic
547 * disk or not. It looks for an MS-DOS-style partition table containing at
548 * least one partition of type 0x42 (formerly SFS, now used by Windows for
549 * dynamic disks).
550 *
551 * N.B. The only possible error can come from read_part_sector, and that is
552 * only likely to happen if the underlying device is strange. If that IS
553 * the case we should return zero to let someone else try.
554 *
555 * Return: 'true' @state->bdev is a dynamic disk
556 * 'false' @state->bdev is not a dynamic disk, or an error occurred
557 */
558static bool ldm_validate_partition_table(struct parsed_partitions *state)
559{
560 Sector sect;
561 u8 *data;
562 struct partition *p;
563 int i;
564 bool result = false;
565
566 BUG_ON(!state);
567
568 data = read_part_sector(state, 0, &sect);
569 if (!data) {
570 ldm_info ("Disk read failed.");
571 return false;
572 }
573
574 if (*(__le16*) (data + 0x01FE) != cpu_to_le16 (MSDOS_LABEL_MAGIC))
575 goto out;
576
577 p = (struct partition*)(data + 0x01BE);
578 for (i = 0; i < 4; i++, p++)
579 if (SYS_IND (p) == LDM_PARTITION) {
580 result = true;
581 break;
582 }
583
584 if (result)
585 ldm_debug ("Found W2K dynamic disk partition type.");
586
587out:
588 put_dev_sector (sect);
589 return result;
590}
591
592/**
593 * ldm_get_disk_objid - Search a linked list of vblk's for a given Disk Id
594 * @ldb: Cache of the database structures
595 *
596 * The LDM Database contains a list of all partitions on all dynamic disks.
597 * The primary PRIVHEAD, at the beginning of the physical disk, tells us
598 * the GUID of this disk. This function searches for the GUID in a linked
599 * list of vblk's.
600 *
601 * Return: Pointer, A matching vblk was found
602 * NULL, No match, or an error
603 */
604static struct vblk * ldm_get_disk_objid (const struct ldmdb *ldb)
605{
606 struct list_head *item;
607
608 BUG_ON (!ldb);
609
610 list_for_each (item, &ldb->v_disk) {
611 struct vblk *v = list_entry (item, struct vblk, list);
612 if (!memcmp (v->vblk.disk.disk_id, ldb->ph.disk_id, GUID_SIZE))
613 return v;
614 }
615
616 return NULL;
617}
618
619/**
620 * ldm_create_data_partitions - Create data partitions for this device
621 * @pp: List of the partitions parsed so far
622 * @ldb: Cache of the database structures
623 *
624 * The database contains ALL the partitions for ALL disk groups, so we need to
625 * filter out this specific disk. Using the disk's object id, we can find all
626 * the partitions in the database that belong to this disk.
627 *
628 * Add each partition in our database, to the parsed_partitions structure.
629 *
630 * N.B. This function creates the partitions in the order it finds partition
631 * objects in the linked list.
632 *
633 * Return: 'true' Partition created
634 * 'false' Error, probably a range checking problem
635 */
636static bool ldm_create_data_partitions (struct parsed_partitions *pp,
637 const struct ldmdb *ldb)
638{
639 struct list_head *item;
640 struct vblk *vb;
641 struct vblk *disk;
642 struct vblk_part *part;
643 int part_num = 1;
644
645 BUG_ON (!pp || !ldb);
646
647 disk = ldm_get_disk_objid (ldb);
648 if (!disk) {
649 ldm_crit ("Can't find the ID of this disk in the database.");
650 return false;
651 }
652
653 strlcat(pp->pp_buf, " [LDM]", PAGE_SIZE);
654
655 /* Create the data partitions */
656 list_for_each (item, &ldb->v_part) {
657 vb = list_entry (item, struct vblk, list);
658 part = &vb->vblk.part;
659
660 if (part->disk_id != disk->obj_id)
661 continue;
662
663 put_partition (pp, part_num, ldb->ph.logical_disk_start +
664 part->start, part->size);
665 part_num++;
666 }
667
668 strlcat(pp->pp_buf, "\n", PAGE_SIZE);
669 return true;
670}
671
672
673/**
674 * ldm_relative - Calculate the next relative offset
675 * @buffer: Block of data being worked on
676 * @buflen: Size of the block of data
677 * @base: Size of the previous fixed width fields
678 * @offset: Cumulative size of the previous variable-width fields
679 *
680 * Because many of the VBLK fields are variable-width, it's necessary
681 * to calculate each offset based on the previous one and the length
682 * of the field it pointed to.
683 *
684 * Return: -1 Error, the calculated offset exceeded the size of the buffer
685 * n OK, a range-checked offset into buffer
686 */
687static int ldm_relative(const u8 *buffer, int buflen, int base, int offset)
688{
689
690 base += offset;
691 if (!buffer || offset < 0 || base > buflen) {
692 if (!buffer)
693 ldm_error("!buffer");
694 if (offset < 0)
695 ldm_error("offset (%d) < 0", offset);
696 if (base > buflen)
697 ldm_error("base (%d) > buflen (%d)", base, buflen);
698 return -1;
699 }
700 if (base + buffer[base] >= buflen) {
701 ldm_error("base (%d) + buffer[base] (%d) >= buflen (%d)", base,
702 buffer[base], buflen);
703 return -1;
704 }
705 return buffer[base] + offset + 1;
706}
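To make the offset chaining concrete: every variable-width VBLK field is a one-byte length followed by that many payload bytes, and each call returns the position just past the field so it can feed the next call. A toy userspace rendition over an invented buffer (the real callers pass fixed header sizes such as 0x18 as base):

#include <stdio.h>

static int relative(const unsigned char *buf, int buflen, int base, int offset)
{
    base += offset;
    if (!buf || offset < 0 || base > buflen || base + buf[base] >= buflen)
        return -1;                       /* out of range */
    return buf[base] + offset + 1;       /* offset just past this field */
}

int main(void)
{
    /* two fields at base 0: a 3-byte payload, then a 2-byte payload */
    unsigned char buf[] = { 3, 'a', 'b', 'c', 2, 'x', 'y', 0 };
    int r1 = relative(buf, sizeof(buf), 0, 0);    /* -> 4, the second length byte */
    int r2 = relative(buf, sizeof(buf), 0, r1);   /* -> 7, just past second field */

    printf("%d %d\n", r1, r2);
    return 0;
}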
707
708/**
709 * ldm_get_vnum - Convert a variable-width, big endian number, into cpu order
710 * @block: Pointer to the variable-width number to convert
711 *
712 * Large numbers in the LDM Database are often stored in a packed format. Each
713 * number is prefixed by a one byte width marker. All numbers in the database
714 * are stored in big-endian byte order. This function reads one of these
715 * numbers and returns the result.
716 *
717 * N.B. This function DOES NOT perform any range checking, though the most
718 * it will read is eight bytes.
719 *
720 * Return: n A number
721 * 0 Zero, or an error occurred
722 */
723static u64 ldm_get_vnum (const u8 *block)
724{
725 u64 tmp = 0;
726 u8 length;
727
728 BUG_ON (!block);
729
730 length = *block++;
731
732 if (length && length <= 8)
733 while (length--)
734 tmp = (tmp << 8) | *block++;
735 else
736 ldm_error ("Illegal length %d.", length);
737
738 return tmp;
739}
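A toy decoder for the same width-prefixed, big-endian packing, fed an invented three-byte input (width marker 2, payload 0x01 0x00):

#include <stdio.h>

static unsigned long long get_vnum(const unsigned char *block)
{
    unsigned long long tmp = 0;
    unsigned char length = *block++;     /* one-byte width marker */

    if (length == 0 || length > 8)
        return 0;                        /* the driver logs an error here */
    while (length--)
        tmp = (tmp << 8) | *block++;     /* accumulate big-endian bytes */
    return tmp;
}

int main(void)
{
    unsigned char num[] = { 0x02, 0x01, 0x00 };
    printf("%llu\n", get_vnum(num));     /* prints 256 */
    return 0;
}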
740
741/**
742 * ldm_get_vstr - Read a length-prefixed string into a buffer
743 * @block: Pointer to the length marker
744 * @buffer: Location to copy string to
745 * @buflen: Size of the output buffer
746 *
747 * Many of the strings in the LDM Database are not NULL terminated. Instead
748 * they are prefixed by a one byte length marker. This function copies one of
749 * these strings into a buffer.
750 *
751 * N.B. This function DOES NOT perform any range checking on the input.
752 * If the buffer is too small, the output will be truncated.
753 *
754 * Return: 0, Error and @buffer contents are undefined
755 * n, String length in characters (excluding NULL)
756 * buflen-1, String was truncated.
757 */
758static int ldm_get_vstr (const u8 *block, u8 *buffer, int buflen)
759{
760 int length;
761
762 BUG_ON (!block || !buffer);
763
764 length = block[0];
765 if (length >= buflen) {
766 ldm_error ("Truncating string %d -> %d.", length, buflen);
767 length = buflen - 1;
768 }
769 memcpy (buffer, block + 1, length);
770 buffer[length] = 0;
771 return length;
772}
773
774
775/**
776 * ldm_parse_cmp3 - Read a raw VBLK Component object into a vblk structure
777 * @buffer: Block of data being worked on
778 * @buflen: Size of the block of data
779 * @vb: In-memory vblk in which to return information
780 *
781 * Read a raw VBLK Component object (version 3) into a vblk structure.
782 *
783 * Return: 'true' @vb contains a Component VBLK
784 * 'false' @vb contents are not defined
785 */
786static bool ldm_parse_cmp3 (const u8 *buffer, int buflen, struct vblk *vb)
787{
788 int r_objid, r_name, r_vstate, r_child, r_parent, r_stripe, r_cols, len;
789 struct vblk_comp *comp;
790
791 BUG_ON (!buffer || !vb);
792
793 r_objid = ldm_relative (buffer, buflen, 0x18, 0);
794 r_name = ldm_relative (buffer, buflen, 0x18, r_objid);
795 r_vstate = ldm_relative (buffer, buflen, 0x18, r_name);
796 r_child = ldm_relative (buffer, buflen, 0x1D, r_vstate);
797 r_parent = ldm_relative (buffer, buflen, 0x2D, r_child);
798
799 if (buffer[0x12] & VBLK_FLAG_COMP_STRIPE) {
800 r_stripe = ldm_relative (buffer, buflen, 0x2E, r_parent);
801 r_cols = ldm_relative (buffer, buflen, 0x2E, r_stripe);
802 len = r_cols;
803 } else {
804 r_stripe = 0;
805 r_cols = 0;
806 len = r_parent;
807 }
808 if (len < 0)
809 return false;
810
811 len += VBLK_SIZE_CMP3;
812 if (len != get_unaligned_be32(buffer + 0x14))
813 return false;
814
815 comp = &vb->vblk.comp;
816 ldm_get_vstr (buffer + 0x18 + r_name, comp->state,
817 sizeof (comp->state));
818 comp->type = buffer[0x18 + r_vstate];
819 comp->children = ldm_get_vnum (buffer + 0x1D + r_vstate);
820 comp->parent_id = ldm_get_vnum (buffer + 0x2D + r_child);
821 comp->chunksize = r_stripe ? ldm_get_vnum (buffer+r_parent+0x2E) : 0;
822
823 return true;
824}
825
826/**
827 * ldm_parse_dgr3 - Read a raw VBLK Disk Group object into a vblk structure
828 * @buffer: Block of data being worked on
829 * @buflen: Size of the block of data
830 * @vb: In-memory vblk in which to return information
831 *
832 * Read a raw VBLK Disk Group object (version 3) into a vblk structure.
833 *
834 * Return: 'true' @vb contains a Disk Group VBLK
835 * 'false' @vb contents are not defined
836 */
837static int ldm_parse_dgr3 (const u8 *buffer, int buflen, struct vblk *vb)
838{
839 int r_objid, r_name, r_diskid, r_id1, r_id2, len;
840 struct vblk_dgrp *dgrp;
841
842 BUG_ON (!buffer || !vb);
843
844 r_objid = ldm_relative (buffer, buflen, 0x18, 0);
845 r_name = ldm_relative (buffer, buflen, 0x18, r_objid);
846 r_diskid = ldm_relative (buffer, buflen, 0x18, r_name);
847
848 if (buffer[0x12] & VBLK_FLAG_DGR3_IDS) {
849 r_id1 = ldm_relative (buffer, buflen, 0x24, r_diskid);
850 r_id2 = ldm_relative (buffer, buflen, 0x24, r_id1);
851 len = r_id2;
852 } else {
853 r_id1 = 0;
854 r_id2 = 0;
855 len = r_diskid;
856 }
857 if (len < 0)
858 return false;
859
860 len += VBLK_SIZE_DGR3;
861 if (len != get_unaligned_be32(buffer + 0x14))
862 return false;
863
864 dgrp = &vb->vblk.dgrp;
865 ldm_get_vstr (buffer + 0x18 + r_name, dgrp->disk_id,
866 sizeof (dgrp->disk_id));
867 return true;
868}
869
870/**
871 * ldm_parse_dgr4 - Read a raw VBLK Disk Group object into a vblk structure
872 * @buffer: Block of data being worked on
873 * @buflen: Size of the block of data
874 * @vb: In-memory vblk in which to return information
875 *
876 * Read a raw VBLK Disk Group object (version 4) into a vblk structure.
877 *
878 * Return: 'true' @vb contains a Disk Group VBLK
879 * 'false' @vb contents are not defined
880 */
881static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb)
882{
883 char buf[64];
884 int r_objid, r_name, r_id1, r_id2, len;
885 struct vblk_dgrp *dgrp;
886
887 BUG_ON (!buffer || !vb);
888
889 r_objid = ldm_relative (buffer, buflen, 0x18, 0);
890 r_name = ldm_relative (buffer, buflen, 0x18, r_objid);
891
892 if (buffer[0x12] & VBLK_FLAG_DGR4_IDS) {
893 r_id1 = ldm_relative (buffer, buflen, 0x44, r_name);
894 r_id2 = ldm_relative (buffer, buflen, 0x44, r_id1);
895 len = r_id2;
896 } else {
897 r_id1 = 0;
898 r_id2 = 0;
899 len = r_name;
900 }
901 if (len < 0)
902 return false;
903
904 len += VBLK_SIZE_DGR4;
905 if (len != get_unaligned_be32(buffer + 0x14))
906 return false;
907
908 dgrp = &vb->vblk.dgrp;
909
910 ldm_get_vstr (buffer + 0x18 + r_objid, buf, sizeof (buf));
911 return true;
912}
913
914/**
915 * ldm_parse_dsk3 - Read a raw VBLK Disk object into a vblk structure
916 * @buffer: Block of data being worked on
917 * @buflen: Size of the block of data
918 * @vb: In-memory vblk in which to return information
919 *
920 * Read a raw VBLK Disk object (version 3) into a vblk structure.
921 *
922 * Return: 'true' @vb contains a Disk VBLK
923 * 'false' @vb contents are not defined
924 */
925static bool ldm_parse_dsk3 (const u8 *buffer, int buflen, struct vblk *vb)
926{
927 int r_objid, r_name, r_diskid, r_altname, len;
928 struct vblk_disk *disk;
929
930 BUG_ON (!buffer || !vb);
931
932 r_objid = ldm_relative (buffer, buflen, 0x18, 0);
933 r_name = ldm_relative (buffer, buflen, 0x18, r_objid);
934 r_diskid = ldm_relative (buffer, buflen, 0x18, r_name);
935 r_altname = ldm_relative (buffer, buflen, 0x18, r_diskid);
936 len = r_altname;
937 if (len < 0)
938 return false;
939
940 len += VBLK_SIZE_DSK3;
941 if (len != get_unaligned_be32(buffer + 0x14))
942 return false;
943
944 disk = &vb->vblk.disk;
945 ldm_get_vstr (buffer + 0x18 + r_diskid, disk->alt_name,
946 sizeof (disk->alt_name));
947 if (!ldm_parse_guid (buffer + 0x19 + r_name, disk->disk_id))
948 return false;
949
950 return true;
951}
952
953/**
954 * ldm_parse_dsk4 - Read a raw VBLK Disk object into a vblk structure
955 * @buffer: Block of data being worked on
956 * @buflen: Size of the block of data
957 * @vb: In-memory vblk in which to return information
958 *
959 * Read a raw VBLK Disk object (version 4) into a vblk structure.
960 *
961 * Return: 'true' @vb contains a Disk VBLK
962 * 'false' @vb contents are not defined
963 */
964static bool ldm_parse_dsk4 (const u8 *buffer, int buflen, struct vblk *vb)
965{
966 int r_objid, r_name, len;
967 struct vblk_disk *disk;
968
969 BUG_ON (!buffer || !vb);
970
971 r_objid = ldm_relative (buffer, buflen, 0x18, 0);
972 r_name = ldm_relative (buffer, buflen, 0x18, r_objid);
973 len = r_name;
974 if (len < 0)
975 return false;
976
977 len += VBLK_SIZE_DSK4;
978 if (len != get_unaligned_be32(buffer + 0x14))
979 return false;
980
981 disk = &vb->vblk.disk;
982 memcpy (disk->disk_id, buffer + 0x18 + r_name, GUID_SIZE);
983 return true;
984}
985
986/**
987 * ldm_parse_prt3 - Read a raw VBLK Partition object into a vblk structure
988 * @buffer: Block of data being worked on
989 * @buflen: Size of the block of data
990 * @vb: In-memory vblk in which to return information
991 *
992 * Read a raw VBLK Partition object (version 3) into a vblk structure.
993 *
994 * Return: 'true' @vb contains a Partition VBLK
995 * 'false' @vb contents are not defined
996 */
997static bool ldm_parse_prt3(const u8 *buffer, int buflen, struct vblk *vb)
998{
999 int r_objid, r_name, r_size, r_parent, r_diskid, r_index, len;
1000 struct vblk_part *part;
1001
1002 BUG_ON(!buffer || !vb);
1003 r_objid = ldm_relative(buffer, buflen, 0x18, 0);
1004 if (r_objid < 0) {
1005 ldm_error("r_objid %d < 0", r_objid);
1006 return false;
1007 }
1008 r_name = ldm_relative(buffer, buflen, 0x18, r_objid);
1009 if (r_name < 0) {
1010 ldm_error("r_name %d < 0", r_name);
1011 return false;
1012 }
1013 r_size = ldm_relative(buffer, buflen, 0x34, r_name);
1014 if (r_size < 0) {
1015 ldm_error("r_size %d < 0", r_size);
1016 return false;
1017 }
1018 r_parent = ldm_relative(buffer, buflen, 0x34, r_size);
1019 if (r_parent < 0) {
1020 ldm_error("r_parent %d < 0", r_parent);
1021 return false;
1022 }
1023 r_diskid = ldm_relative(buffer, buflen, 0x34, r_parent);
1024 if (r_diskid < 0) {
1025 ldm_error("r_diskid %d < 0", r_diskid);
1026 return false;
1027 }
1028 if (buffer[0x12] & VBLK_FLAG_PART_INDEX) {
1029 r_index = ldm_relative(buffer, buflen, 0x34, r_diskid);
1030 if (r_index < 0) {
1031 ldm_error("r_index %d < 0", r_index);
1032 return false;
1033 }
1034 len = r_index;
1035 } else {
1036 r_index = 0;
1037 len = r_diskid;
1038 }
1039 if (len < 0) {
1040 ldm_error("len %d < 0", len);
1041 return false;
1042 }
1043 len += VBLK_SIZE_PRT3;
1044 if (len > get_unaligned_be32(buffer + 0x14)) {
1045 ldm_error("len %d > BE32(buffer + 0x14) %d", len,
1046 get_unaligned_be32(buffer + 0x14));
1047 return false;
1048 }
1049 part = &vb->vblk.part;
1050 part->start = get_unaligned_be64(buffer + 0x24 + r_name);
1051 part->volume_offset = get_unaligned_be64(buffer + 0x2C + r_name);
1052 part->size = ldm_get_vnum(buffer + 0x34 + r_name);
1053 part->parent_id = ldm_get_vnum(buffer + 0x34 + r_size);
1054 part->disk_id = ldm_get_vnum(buffer + 0x34 + r_parent);
1055 if (vb->flags & VBLK_FLAG_PART_INDEX)
1056 part->partnum = buffer[0x35 + r_diskid];
1057 else
1058 part->partnum = 0;
1059 return true;
1060}
1061
1062/**
1063 * ldm_parse_vol5 - Read a raw VBLK Volume object into a vblk structure
1064 * @buffer: Block of data being worked on
1065 * @buflen: Size of the block of data
1066 * @vb: In-memory vblk in which to return information
1067 *
1068 * Read a raw VBLK Volume object (version 5) into a vblk structure.
1069 *
1070 * Return: 'true' @vb contains a Volume VBLK
1071 * 'false' @vb contents are not defined
1072 */
1073static bool ldm_parse_vol5(const u8 *buffer, int buflen, struct vblk *vb)
1074{
1075 int r_objid, r_name, r_vtype, r_disable_drive_letter, r_child, r_size;
1076 int r_id1, r_id2, r_size2, r_drive, len;
1077 struct vblk_volu *volu;
1078
1079 BUG_ON(!buffer || !vb);
1080 r_objid = ldm_relative(buffer, buflen, 0x18, 0);
1081 if (r_objid < 0) {
1082 ldm_error("r_objid %d < 0", r_objid);
1083 return false;
1084 }
1085 r_name = ldm_relative(buffer, buflen, 0x18, r_objid);
1086 if (r_name < 0) {
1087 ldm_error("r_name %d < 0", r_name);
1088 return false;
1089 }
1090 r_vtype = ldm_relative(buffer, buflen, 0x18, r_name);
1091 if (r_vtype < 0) {
1092 ldm_error("r_vtype %d < 0", r_vtype);
1093 return false;
1094 }
1095 r_disable_drive_letter = ldm_relative(buffer, buflen, 0x18, r_vtype);
1096 if (r_disable_drive_letter < 0) {
1097 ldm_error("r_disable_drive_letter %d < 0",
1098 r_disable_drive_letter);
1099 return false;
1100 }
1101 r_child = ldm_relative(buffer, buflen, 0x2D, r_disable_drive_letter);
1102 if (r_child < 0) {
1103 ldm_error("r_child %d < 0", r_child);
1104 return false;
1105 }
1106 r_size = ldm_relative(buffer, buflen, 0x3D, r_child);
1107 if (r_size < 0) {
1108 ldm_error("r_size %d < 0", r_size);
1109 return false;
1110 }
1111 if (buffer[0x12] & VBLK_FLAG_VOLU_ID1) {
1112 r_id1 = ldm_relative(buffer, buflen, 0x52, r_size);
1113 if (r_id1 < 0) {
1114 ldm_error("r_id1 %d < 0", r_id1);
1115 return false;
1116 }
1117 } else
1118 r_id1 = r_size;
1119 if (buffer[0x12] & VBLK_FLAG_VOLU_ID2) {
1120 r_id2 = ldm_relative(buffer, buflen, 0x52, r_id1);
1121 if (r_id2 < 0) {
1122 ldm_error("r_id2 %d < 0", r_id2);
1123 return false;
1124 }
1125 } else
1126 r_id2 = r_id1;
1127 if (buffer[0x12] & VBLK_FLAG_VOLU_SIZE) {
1128 r_size2 = ldm_relative(buffer, buflen, 0x52, r_id2);
1129 if (r_size2 < 0) {
1130 ldm_error("r_size2 %d < 0", r_size2);
1131 return false;
1132 }
1133 } else
1134 r_size2 = r_id2;
1135 if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) {
1136 r_drive = ldm_relative(buffer, buflen, 0x52, r_size2);
1137 if (r_drive < 0) {
1138 ldm_error("r_drive %d < 0", r_drive);
1139 return false;
1140 }
1141 } else
1142 r_drive = r_size2;
1143 len = r_drive;
1144 if (len < 0) {
1145 ldm_error("len %d < 0", len);
1146 return false;
1147 }
1148 len += VBLK_SIZE_VOL5;
1149 if (len > get_unaligned_be32(buffer + 0x14)) {
1150 ldm_error("len %d > BE32(buffer + 0x14) %d", len,
1151 get_unaligned_be32(buffer + 0x14));
1152 return false;
1153 }
1154 volu = &vb->vblk.volu;
1155 ldm_get_vstr(buffer + 0x18 + r_name, volu->volume_type,
1156 sizeof(volu->volume_type));
1157 memcpy(volu->volume_state, buffer + 0x18 + r_disable_drive_letter,
1158 sizeof(volu->volume_state));
1159 volu->size = ldm_get_vnum(buffer + 0x3D + r_child);
1160 volu->partition_type = buffer[0x41 + r_size];
1161 memcpy(volu->guid, buffer + 0x42 + r_size, sizeof(volu->guid));
1162 if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) {
1163 ldm_get_vstr(buffer + 0x52 + r_size, volu->drive_hint,
1164 sizeof(volu->drive_hint));
1165 }
1166 return true;
1167}
1168
1169/**
1170 * ldm_parse_vblk - Read a raw VBLK object into a vblk structure
1171 * @buf: Block of data being worked on
1172 * @len: Size of the block of data
1173 * @vb: In-memory vblk in which to return information
1174 *
1175 * Read a raw VBLK object into a vblk structure. This function just reads the
1176 * information common to all VBLK types, then delegates the rest of the work to
1177 * helper functions: ldm_parse_*.
1178 *
1179 * Return: 'true' @vb contains a VBLK
1180 * 'false' @vb contents are not defined
1181 */
1182static bool ldm_parse_vblk (const u8 *buf, int len, struct vblk *vb)
1183{
1184 bool result = false;
1185 int r_objid;
1186
1187 BUG_ON (!buf || !vb);
1188
1189 r_objid = ldm_relative (buf, len, 0x18, 0);
1190 if (r_objid < 0) {
1191 ldm_error ("VBLK header is corrupt.");
1192 return false;
1193 }
1194
1195 vb->flags = buf[0x12];
1196 vb->type = buf[0x13];
1197 vb->obj_id = ldm_get_vnum (buf + 0x18);
1198 ldm_get_vstr (buf+0x18+r_objid, vb->name, sizeof (vb->name));
1199
1200 switch (vb->type) {
1201 case VBLK_CMP3: result = ldm_parse_cmp3 (buf, len, vb); break;
1202 case VBLK_DSK3: result = ldm_parse_dsk3 (buf, len, vb); break;
1203 case VBLK_DSK4: result = ldm_parse_dsk4 (buf, len, vb); break;
1204 case VBLK_DGR3: result = ldm_parse_dgr3 (buf, len, vb); break;
1205 case VBLK_DGR4: result = ldm_parse_dgr4 (buf, len, vb); break;
1206 case VBLK_PRT3: result = ldm_parse_prt3 (buf, len, vb); break;
1207 case VBLK_VOL5: result = ldm_parse_vol5 (buf, len, vb); break;
1208 }
1209
1210 if (result)
1211 ldm_debug ("Parsed VBLK 0x%llx (type: 0x%02x) ok.",
1212 (unsigned long long) vb->obj_id, vb->type);
1213 else
1214 ldm_error ("Failed to parse VBLK 0x%llx (type: 0x%02x).",
1215 (unsigned long long) vb->obj_id, vb->type);
1216
1217 return result;
1218}
1219
1220
1221/**
1222 * ldm_ldmdb_add - Adds a raw VBLK entry to the ldmdb database
1223 * @data: Raw VBLK to add to the database
1224 * @len: Size of the raw VBLK
1225 * @ldb: Cache of the database structures
1226 *
1227 * The VBLKs are sorted into categories. Partitions are also sorted by offset.
1228 *
1229 * N.B. This function does not check the validity of the VBLKs.
1230 *
1231 * Return: 'true' The VBLK was added
1232 * 'false' An error occurred
1233 */
1234static bool ldm_ldmdb_add (u8 *data, int len, struct ldmdb *ldb)
1235{
1236 struct vblk *vb;
1237 struct list_head *item;
1238
1239 BUG_ON (!data || !ldb);
1240
1241 vb = kmalloc (sizeof (*vb), GFP_KERNEL);
1242 if (!vb) {
1243 ldm_crit ("Out of memory.");
1244 return false;
1245 }
1246
1247 if (!ldm_parse_vblk (data, len, vb)) {
1248 kfree(vb);
1249 return false; /* Already logged */
1250 }
1251
1252 /* Put vblk into the correct list. */
1253 switch (vb->type) {
1254 case VBLK_DGR3:
1255 case VBLK_DGR4:
1256 list_add (&vb->list, &ldb->v_dgrp);
1257 break;
1258 case VBLK_DSK3:
1259 case VBLK_DSK4:
1260 list_add (&vb->list, &ldb->v_disk);
1261 break;
1262 case VBLK_VOL5:
1263 list_add (&vb->list, &ldb->v_volu);
1264 break;
1265 case VBLK_CMP3:
1266 list_add (&vb->list, &ldb->v_comp);
1267 break;
1268 case VBLK_PRT3:
1269 /* Sort by the partition's start sector. */
1270 list_for_each (item, &ldb->v_part) {
1271 struct vblk *v = list_entry (item, struct vblk, list);
1272 if ((v->vblk.part.disk_id == vb->vblk.part.disk_id) &&
1273 (v->vblk.part.start > vb->vblk.part.start)) {
1274 list_add_tail (&vb->list, &v->list);
1275 return true;
1276 }
1277 }
1278 list_add_tail (&vb->list, &ldb->v_part);
1279 break;
1280 }
1281 return true;
1282}
1283
1284/**
1285 * ldm_frag_add - Add a VBLK fragment to a list
1286 * @data: Raw fragment to be added to the list
1287 * @size: Size of the raw fragment
1288 * @frags: Linked list of VBLK fragments
1289 *
1290 * Fragmented VBLKs may not be consecutive in the database, so they are placed
1291 * in a list so they can be pieced together later.
1292 *
1293 * Return: 'true' Success, the VBLK was added to the list
1294 * 'false' Error, a problem occurred
1295 */
1296static bool ldm_frag_add (const u8 *data, int size, struct list_head *frags)
1297{
1298 struct frag *f;
1299 struct list_head *item;
1300 int rec, num, group;
1301
1302 BUG_ON (!data || !frags);
1303
1304 if (size < 2 * VBLK_SIZE_HEAD) {
1305		ldm_error("Value of size is too small.");
1306 return false;
1307 }
1308
1309 group = get_unaligned_be32(data + 0x08);
1310 rec = get_unaligned_be16(data + 0x0C);
1311 num = get_unaligned_be16(data + 0x0E);
1312 if ((num < 1) || (num > 4)) {
1313 ldm_error ("A VBLK claims to have %d parts.", num);
1314 return false;
1315 }
1316 if (rec >= num) {
1317 ldm_error("REC value (%d) exceeds NUM value (%d)", rec, num);
1318 return false;
1319 }
1320
1321 list_for_each (item, frags) {
1322 f = list_entry (item, struct frag, list);
1323 if (f->group == group)
1324 goto found;
1325 }
1326
1327 f = kmalloc (sizeof (*f) + size*num, GFP_KERNEL);
1328 if (!f) {
1329 ldm_crit ("Out of memory.");
1330 return false;
1331 }
1332
1333 f->group = group;
1334 f->num = num;
1335 f->rec = rec;
1336 f->map = 0xFF << num;
1337
1338 list_add_tail (&f->list, frags);
1339found:
1340 if (rec >= f->num) {
1341 ldm_error("REC value (%d) exceeds NUM value (%d)", rec, f->num);
1342 return false;
1343 }
1344 if (f->map & (1 << rec)) {
1345 ldm_error ("Duplicate VBLK, part %d.", rec);
1346 f->map &= 0x7F; /* Mark the group as broken */
1347 return false;
1348 }
1349 f->map |= (1 << rec);
1350 if (!rec)
1351 memcpy(f->data, data, VBLK_SIZE_HEAD);
1352 data += VBLK_SIZE_HEAD;
1353 size -= VBLK_SIZE_HEAD;
1354 memcpy(f->data + VBLK_SIZE_HEAD + rec * size, data, size);
1355 return true;
1356}
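The f->map bookkeeping above starts with the low num bits clear and sets one bit per fragment received, so a group is complete exactly when map reaches 0xFF (and clearing bit 7 on a duplicate keeps a broken group from ever completing). A small illustration for an assumed four-fragment group:

#include <stdio.h>

int main(void)
{
    int num = 4;
    unsigned char map = 0xFF << num;   /* 0xF0: one clear bit per expected fragment */
    int rec;

    for (rec = 0; rec < num; rec++)
        map |= 1 << rec;               /* mark fragment 'rec' as received */

    printf("map=0x%02X complete=%s\n", map, map == 0xFF ? "yes" : "no");
    return 0;
}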
1357
1358/**
1359 * ldm_frag_free - Free a linked list of VBLK fragments
1360 * @list: Linked list of fragments
1361 *
1362 * Free a linked list of VBLK fragments
1363 *
1364 * Return: none
1365 */
1366static void ldm_frag_free (struct list_head *list)
1367{
1368 struct list_head *item, *tmp;
1369
1370 BUG_ON (!list);
1371
1372 list_for_each_safe (item, tmp, list)
1373 kfree (list_entry (item, struct frag, list));
1374}
1375
1376/**
1377 * ldm_frag_commit - Validate fragmented VBLKs and add them to the database
1378 * @frags: Linked list of VBLK fragments
1379 * @ldb: Cache of the database structures
1380 *
1381 * Now that all the fragmented VBLKs have been collected, they must be added to
1382 * the database for later use.
1383 *
1384 * Return:  'true'   All the fragments were added successfully
1385 *          'false'  One or more of the fragments were invalid
1386 */
1387static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb)
1388{
1389 struct frag *f;
1390 struct list_head *item;
1391
1392 BUG_ON (!frags || !ldb);
1393
1394 list_for_each (item, frags) {
1395 f = list_entry (item, struct frag, list);
1396
1397 if (f->map != 0xFF) {
1398 ldm_error ("VBLK group %d is incomplete (0x%02x).",
1399 f->group, f->map);
1400 return false;
1401 }
1402
1403 if (!ldm_ldmdb_add (f->data, f->num*ldb->vm.vblk_size, ldb))
1404 return false; /* Already logged */
1405 }
1406 return true;
1407}
1408
1409/**
1410 * ldm_get_vblks - Read the on-disk database of VBLKs into memory
1411 * @state: Partition check state including device holding the LDM Database
1412 * @base: Offset, into @state->bdev, of the database
1413 * @ldb: Cache of the database structures
1414 *
1415 * To use the information from the VBLKs, they need to be read from the disk,
1416 * unpacked and validated. We cache them in @ldb according to their type.
1417 *
1418 * Return: 'true' All the VBLKs were read successfully
1419 * 'false' An error occurred
1420 */
1421static bool ldm_get_vblks(struct parsed_partitions *state, unsigned long base,
1422 struct ldmdb *ldb)
1423{
1424 int size, perbuf, skip, finish, s, v, recs;
1425 u8 *data = NULL;
1426 Sector sect;
1427 bool result = false;
1428 LIST_HEAD (frags);
1429
1430 BUG_ON(!state || !ldb);
1431
1432 size = ldb->vm.vblk_size;
1433 perbuf = 512 / size;
1434 skip = ldb->vm.vblk_offset >> 9; /* Bytes to sectors */
1435 finish = (size * ldb->vm.last_vblk_seq) >> 9;
1436
1437 for (s = skip; s < finish; s++) { /* For each sector */
1438 data = read_part_sector(state, base + OFF_VMDB + s, &sect);
1439 if (!data) {
1440 ldm_crit ("Disk read failed.");
1441 goto out;
1442 }
1443
1444 for (v = 0; v < perbuf; v++, data+=size) { /* For each vblk */
1445 if (MAGIC_VBLK != get_unaligned_be32(data)) {
1446 ldm_error ("Expected to find a VBLK.");
1447 goto out;
1448 }
1449
1450 recs = get_unaligned_be16(data + 0x0E); /* Number of records */
1451 if (recs == 1) {
1452 if (!ldm_ldmdb_add (data, size, ldb))
1453 goto out; /* Already logged */
1454 } else if (recs > 1) {
1455 if (!ldm_frag_add (data, size, &frags))
1456 goto out; /* Already logged */
1457 }
1458 /* else Record is not in use, ignore it. */
1459 }
1460 put_dev_sector (sect);
1461 data = NULL;
1462 }
1463
1464 result = ldm_frag_commit (&frags, ldb); /* Failures, already logged */
1465out:
1466 if (data)
1467 put_dev_sector (sect);
1468 ldm_frag_free (&frags);
1469
1470 return result;
1471}
1472
1473/**
1474 * ldm_free_vblks - Free a linked list of vblk's
1475 * @lh: Head of a linked list of struct vblk
1476 *
1477 * Free a list of vblk's and free the memory used to maintain the list.
1478 *
1479 * Return: none
1480 */
1481static void ldm_free_vblks (struct list_head *lh)
1482{
1483 struct list_head *item, *tmp;
1484
1485 BUG_ON (!lh);
1486
1487 list_for_each_safe (item, tmp, lh)
1488 kfree (list_entry (item, struct vblk, list));
1489}
1490
1491
1492/**
1493 * ldm_partition - Find out whether a device is a dynamic disk and handle it
1494 * @state: Partition check state including device holding the LDM Database
1495 *
1496 * This determines whether the device @bdev is a dynamic disk and if so creates
1497 * the partitions necessary in the gendisk structure pointed to by @hd.
1498 *
1499 * We create a dummy device 1, which contains the LDM database, and then create
1500 * each partition described by the LDM database in sequence as devices 2+. For
1501 * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3,
1502 * and so on: the actual data containing partitions.
1503 *
1504 * Return: 1 Success, @state->bdev is a dynamic disk and we handled it
1505 * 0 Success, @state->bdev is not a dynamic disk
1506 * -1 An error occurred before enough information had been read
1507 * Or @state->bdev is a dynamic disk, but it may be corrupted
1508 */
1509int ldm_partition(struct parsed_partitions *state)
1510{
1511 struct ldmdb *ldb;
1512 unsigned long base;
1513 int result = -1;
1514
1515 BUG_ON(!state);
1516
1517 /* Look for signs of a Dynamic Disk */
1518 if (!ldm_validate_partition_table(state))
1519 return 0;
1520
1521 ldb = kmalloc (sizeof (*ldb), GFP_KERNEL);
1522 if (!ldb) {
1523 ldm_crit ("Out of memory.");
1524 goto out;
1525 }
1526
1527 /* Parse and check privheads. */
1528 if (!ldm_validate_privheads(state, &ldb->ph))
1529 goto out; /* Already logged */
1530
1531 /* All further references are relative to base (database start). */
1532 base = ldb->ph.config_start;
1533
1534 /* Parse and check tocs and vmdb. */
1535 if (!ldm_validate_tocblocks(state, base, ldb) ||
1536 !ldm_validate_vmdb(state, base, ldb))
1537 goto out; /* Already logged */
1538
1539 /* Initialize vblk lists in ldmdb struct */
1540 INIT_LIST_HEAD (&ldb->v_dgrp);
1541 INIT_LIST_HEAD (&ldb->v_disk);
1542 INIT_LIST_HEAD (&ldb->v_volu);
1543 INIT_LIST_HEAD (&ldb->v_comp);
1544 INIT_LIST_HEAD (&ldb->v_part);
1545
1546 if (!ldm_get_vblks(state, base, ldb)) {
1547 ldm_crit ("Failed to read the VBLKs from the database.");
1548 goto cleanup;
1549 }
1550
1551 /* Finally, create the data partition devices. */
1552 if (ldm_create_data_partitions(state, ldb)) {
1553 ldm_debug ("Parsed LDM database successfully.");
1554 result = 1;
1555 }
1556 /* else Already logged */
1557
1558cleanup:
1559 ldm_free_vblks (&ldb->v_dgrp);
1560 ldm_free_vblks (&ldb->v_disk);
1561 ldm_free_vblks (&ldb->v_volu);
1562 ldm_free_vblks (&ldb->v_comp);
1563 ldm_free_vblks (&ldb->v_part);
1564out:
1565 kfree (ldb);
1566 return result;
1567}
diff --git a/block/partitions/ldm.h b/block/partitions/ldm.h
deleted file mode 100644
index 374242c0971..00000000000
--- a/block/partitions/ldm.h
+++ /dev/null
@@ -1,215 +0,0 @@
1/**
2 * ldm - Part of the Linux-NTFS project.
3 *
4 * Copyright (C) 2001,2002 Richard Russon <ldm@flatcap.org>
5 * Copyright (c) 2001-2007 Anton Altaparmakov
6 * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com>
7 *
8 * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads
9 *
10 * This program is free software; you can redistribute it and/or modify it
11 * under the terms of the GNU General Public License as published by the Free
12 * Software Foundation; either version 2 of the License, or (at your option)
13 * any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program (in the main directory of the Linux-NTFS source
22 * in the file COPYING); if not, write to the Free Software Foundation,
23 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 */
25
26#ifndef _FS_PT_LDM_H_
27#define _FS_PT_LDM_H_
28
29#include <linux/types.h>
30#include <linux/list.h>
31#include <linux/genhd.h>
32#include <linux/fs.h>
33#include <asm/unaligned.h>
34#include <asm/byteorder.h>
35
36struct parsed_partitions;
37
38/* Magic numbers in CPU format. */
39#define MAGIC_VMDB 0x564D4442 /* VMDB */
40#define MAGIC_VBLK 0x56424C4B /* VBLK */
41#define MAGIC_PRIVHEAD 0x5052495648454144ULL /* PRIVHEAD */
42#define MAGIC_TOCBLOCK 0x544F43424C4F434BULL /* TOCBLOCK */
43
44/* The defined vblk types. */
45#define VBLK_VOL5 0x51 /* Volume, version 5 */
46#define VBLK_CMP3 0x32 /* Component, version 3 */
47#define VBLK_PRT3 0x33 /* Partition, version 3 */
48#define VBLK_DSK3 0x34 /* Disk, version 3 */
49#define VBLK_DSK4 0x44 /* Disk, version 4 */
50#define VBLK_DGR3 0x35 /* Disk Group, version 3 */
51#define VBLK_DGR4 0x45 /* Disk Group, version 4 */
52
53/* vblk flags indicating extra information will be present */
54#define VBLK_FLAG_COMP_STRIPE 0x10
55#define VBLK_FLAG_PART_INDEX 0x08
56#define VBLK_FLAG_DGR3_IDS 0x08
57#define VBLK_FLAG_DGR4_IDS 0x08
58#define VBLK_FLAG_VOLU_ID1 0x08
59#define VBLK_FLAG_VOLU_ID2 0x20
60#define VBLK_FLAG_VOLU_SIZE 0x80
61#define VBLK_FLAG_VOLU_DRIVE 0x02
62
63/* size of a vblk's static parts */
64#define VBLK_SIZE_HEAD 16
65#define VBLK_SIZE_CMP3 22 /* Name and version */
66#define VBLK_SIZE_DGR3 12
67#define VBLK_SIZE_DGR4 44
68#define VBLK_SIZE_DSK3 12
69#define VBLK_SIZE_DSK4 45
70#define VBLK_SIZE_PRT3 28
71#define VBLK_SIZE_VOL5 58
72
73/* component types */
74#define COMP_STRIPE 0x01 /* Stripe-set */
75#define COMP_BASIC 0x02 /* Basic disk */
76#define COMP_RAID 0x03 /* Raid-set */
77
78/* Other constants. */
79#define LDM_DB_SIZE 2048 /* Size in sectors (= 1MiB). */
80
81#define OFF_PRIV1 6 /* Offset of the first privhead
82 relative to the start of the
83 device in sectors */
84
85/* Offsets to structures within the LDM Database in sectors. */
86#define OFF_PRIV2 1856 /* Backup private headers. */
87#define OFF_PRIV3 2047
88
89#define OFF_TOCB1 1 /* Tables of contents. */
90#define OFF_TOCB2 2
91#define OFF_TOCB3 2045
92#define OFF_TOCB4 2046
93
94#define OFF_VMDB 17 /* List of partitions. */
95
96#define LDM_PARTITION 0x42 /* Formerly SFS (Landis). */
97
98#define TOC_BITMAP1 "config" /* Names of the two defined */
99#define TOC_BITMAP2 "log" /* bitmaps in the TOCBLOCK. */
100
101/* Borrowed from msdos.c */
102#define SYS_IND(p) (get_unaligned(&(p)->sys_ind))
103
104struct frag { /* VBLK Fragment handling */
105 struct list_head list;
106 u32 group;
107 u8 num; /* Total number of records */
108 u8 rec; /* This is record number n */
109 u8 map; /* Which portions are in use */
110 u8 data[0];
111};
112
113/* In memory LDM database structures. */
114
115#define GUID_SIZE 16
116
117struct privhead { /* Offsets and sizes are in sectors. */
118 u16 ver_major;
119 u16 ver_minor;
120 u64 logical_disk_start;
121 u64 logical_disk_size;
122 u64 config_start;
123 u64 config_size;
124 u8 disk_id[GUID_SIZE];
125};
126
127struct tocblock { /* We have exactly two bitmaps. */
128 u8 bitmap1_name[16];
129 u64 bitmap1_start;
130 u64 bitmap1_size;
131 u8 bitmap2_name[16];
132 u64 bitmap2_start;
133 u64 bitmap2_size;
134};
135
136struct vmdb { /* VMDB: The database header */
137 u16 ver_major;
138 u16 ver_minor;
139 u32 vblk_size;
140 u32 vblk_offset;
141 u32 last_vblk_seq;
142};
143
144struct vblk_comp { /* VBLK Component */
145 u8 state[16];
146 u64 parent_id;
147 u8 type;
148 u8 children;
149 u16 chunksize;
150};
151
152struct vblk_dgrp { /* VBLK Disk Group */
153 u8 disk_id[64];
154};
155
156struct vblk_disk { /* VBLK Disk */
157 u8 disk_id[GUID_SIZE];
158 u8 alt_name[128];
159};
160
161struct vblk_part { /* VBLK Partition */
162 u64 start;
163 u64 size; /* start, size and vol_off in sectors */
164 u64 volume_offset;
165 u64 parent_id;
166 u64 disk_id;
167 u8 partnum;
168};
169
170struct vblk_volu { /* VBLK Volume */
171 u8 volume_type[16];
172 u8 volume_state[16];
173 u8 guid[16];
174 u8 drive_hint[4];
175 u64 size;
176 u8 partition_type;
177};
178
179struct vblk_head { /* VBLK standard header */
180 u32 group;
181 u16 rec;
182 u16 nrec;
183};
184
185struct vblk { /* Generalised VBLK */
186 u8 name[64];
187 u64 obj_id;
188 u32 sequence;
189 u8 flags;
190 u8 type;
191 union {
192 struct vblk_comp comp;
193 struct vblk_dgrp dgrp;
194 struct vblk_disk disk;
195 struct vblk_part part;
196 struct vblk_volu volu;
197 } vblk;
198 struct list_head list;
199};
200
201struct ldmdb { /* Cache of the database */
202 struct privhead ph;
203 struct tocblock toc;
204 struct vmdb vm;
205 struct list_head v_dgrp;
206 struct list_head v_disk;
207 struct list_head v_volu;
208 struct list_head v_comp;
209 struct list_head v_part;
210};
211
212int ldm_partition(struct parsed_partitions *state);
213
214#endif /* _FS_PT_LDM_H_ */
215
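Editor's note: the magic numbers defined above are simply the structure names stored as ASCII and compared in big-endian byte order, which is why ldm_get_vblks() checks get_unaligned_be32() of the raw sector data against MAGIC_VBLK. A small stand-alone check, illustrative only:

#include <stdint.h>
#include <stdio.h>

#define MAGIC_VMDB 0x564D4442	/* "VMDB" */
#define MAGIC_VBLK 0x56424C4B	/* "VBLK" */

/* Read a 32-bit big-endian value, as the on-disk LDM structures store it. */
static uint32_t be32(const uint8_t *p)
{
	return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
	       ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
}

int main(void)
{
	const uint8_t on_disk[4] = { 'V', 'B', 'L', 'K' };	/* start of a VBLK */

	printf("MAGIC_VBLK matches: %s\n",
	       be32(on_disk) == MAGIC_VBLK ? "yes" : "no");
	printf("MAGIC_VMDB as chars: %c%c%c%c\n",
	       (MAGIC_VMDB >> 24) & 0xFF, (MAGIC_VMDB >> 16) & 0xFF,
	       (MAGIC_VMDB >> 8) & 0xFF, MAGIC_VMDB & 0xFF);
	return 0;
}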
diff --git a/block/partitions/mac.c b/block/partitions/mac.c
deleted file mode 100644
index 11f688bd76c..00000000000
--- a/block/partitions/mac.c
+++ /dev/null
@@ -1,134 +0,0 @@
1/*
2 * fs/partitions/mac.c
3 *
4 * Code extracted from drivers/block/genhd.c
5 * Copyright (C) 1991-1998 Linus Torvalds
6 * Re-organised Feb 1998 Russell King
7 */
8
9#include <linux/ctype.h>
10#include "check.h"
11#include "mac.h"
12
13#ifdef CONFIG_PPC_PMAC
14#include <asm/machdep.h>
15extern void note_bootable_part(dev_t dev, int part, int goodness);
16#endif
17
18/*
19 * Code to understand MacOS partition tables.
20 */
21
22static inline void mac_fix_string(char *stg, int len)
23{
24 int i;
25
26 for (i = len - 1; i >= 0 && stg[i] == ' '; i--)
27 stg[i] = 0;
28}
29
30int mac_partition(struct parsed_partitions *state)
31{
32 Sector sect;
33 unsigned char *data;
34 int slot, blocks_in_map;
35 unsigned secsize;
36#ifdef CONFIG_PPC_PMAC
37 int found_root = 0;
38 int found_root_goodness = 0;
39#endif
40 struct mac_partition *part;
41 struct mac_driver_desc *md;
42
43 /* Get 0th block and look at the first partition map entry. */
44 md = read_part_sector(state, 0, &sect);
45 if (!md)
46 return -1;
47 if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) {
48 put_dev_sector(sect);
49 return 0;
50 }
51 secsize = be16_to_cpu(md->block_size);
52 put_dev_sector(sect);
53 data = read_part_sector(state, secsize/512, &sect);
54 if (!data)
55 return -1;
56 part = (struct mac_partition *) (data + secsize%512);
57 if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) {
58 put_dev_sector(sect);
59 return 0; /* not a MacOS disk */
60 }
61 blocks_in_map = be32_to_cpu(part->map_count);
62 if (blocks_in_map < 0 || blocks_in_map >= DISK_MAX_PARTS) {
63 put_dev_sector(sect);
64 return 0;
65 }
66 strlcat(state->pp_buf, " [mac]", PAGE_SIZE);
67 for (slot = 1; slot <= blocks_in_map; ++slot) {
68 int pos = slot * secsize;
69 put_dev_sector(sect);
70 data = read_part_sector(state, pos/512, &sect);
71 if (!data)
72 return -1;
73 part = (struct mac_partition *) (data + pos%512);
74 if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC)
75 break;
76 put_partition(state, slot,
77 be32_to_cpu(part->start_block) * (secsize/512),
78 be32_to_cpu(part->block_count) * (secsize/512));
79
80 if (!strnicmp(part->type, "Linux_RAID", 10))
81 state->parts[slot].flags = ADDPART_FLAG_RAID;
82#ifdef CONFIG_PPC_PMAC
83 /*
84 * If this is the first bootable partition, tell the
85 * setup code, in case it wants to make this the root.
86 */
87 if (machine_is(powermac)) {
88 int goodness = 0;
89
90 mac_fix_string(part->processor, 16);
91 mac_fix_string(part->name, 32);
92 mac_fix_string(part->type, 32);
93
94 if ((be32_to_cpu(part->status) & MAC_STATUS_BOOTABLE)
95 && strcasecmp(part->processor, "powerpc") == 0)
96 goodness++;
97
98 if (strcasecmp(part->type, "Apple_UNIX_SVR2") == 0
99 || (strnicmp(part->type, "Linux", 5) == 0
100 && strcasecmp(part->type, "Linux_swap") != 0)) {
101 int i, l;
102
103 goodness++;
104 l = strlen(part->name);
105 if (strcmp(part->name, "/") == 0)
106 goodness++;
107 for (i = 0; i <= l - 4; ++i) {
108 if (strnicmp(part->name + i, "root",
109 4) == 0) {
110 goodness += 2;
111 break;
112 }
113 }
114 if (strnicmp(part->name, "swap", 4) == 0)
115 goodness--;
116 }
117
118 if (goodness > found_root_goodness) {
119 found_root = slot;
120 found_root_goodness = goodness;
121 }
122 }
123#endif /* CONFIG_PPC_PMAC */
124 }
125#ifdef CONFIG_PPC_PMAC
126 if (found_root_goodness)
127 note_bootable_part(state->bdev->bd_dev, found_root,
128 found_root_goodness);
129#endif
130
131 put_dev_sector(sect);
132 strlcat(state->pp_buf, "\n", PAGE_SIZE);
133 return 1;
134}
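Editor's note: a stand-alone sketch of the offset arithmetic mac_partition() above uses to locate partition-map entry number 'slot' -- byte offset slot * secsize, split into a 512-byte sector number and an offset within that sector. The block sizes below are made-up examples.

#include <stdio.h>

/*
 * Illustrative only: mapping a Mac partition-map slot and the
 * driver-descriptor block size to a 512-byte sector plus byte offset,
 * as done by mac_partition() above.
 */
static void map_entry_location(unsigned int secsize, unsigned int slot)
{
	unsigned int pos = slot * secsize;	/* byte offset of the map entry */

	printf("slot %u: 512-byte sector %u, offset %u\n",
	       slot, pos / 512, pos % 512);
}

int main(void)
{
	/* Hypothetical 2048-byte blocks, e.g. a CD-style device. */
	map_entry_location(2048, 1);
	map_entry_location(2048, 2);
	/* Classic 512-byte blocks: the sector number equals the slot. */
	map_entry_location(512, 3);
	return 0;
}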
diff --git a/block/partitions/mac.h b/block/partitions/mac.h
deleted file mode 100644
index 3c7d9843638..00000000000
--- a/block/partitions/mac.h
+++ /dev/null
@@ -1,44 +0,0 @@
1/*
2 * fs/partitions/mac.h
3 */
4
5#define MAC_PARTITION_MAGIC 0x504d
6
7/* type field value for A/UX or other Unix partitions */
8#define APPLE_AUX_TYPE "Apple_UNIX_SVR2"
9
10struct mac_partition {
11 __be16 signature; /* expected to be MAC_PARTITION_MAGIC */
12 __be16 res1;
13 __be32 map_count; /* # blocks in partition map */
14 __be32 start_block; /* absolute starting block # of partition */
15 __be32 block_count; /* number of blocks in partition */
16 char name[32]; /* partition name */
17 char type[32]; /* string type description */
18 __be32 data_start; /* rel block # of first data block */
19 __be32 data_count; /* number of data blocks */
20 __be32 status; /* partition status bits */
21 __be32 boot_start;
22 __be32 boot_size;
23 __be32 boot_load;
24 __be32 boot_load2;
25 __be32 boot_entry;
26 __be32 boot_entry2;
27 __be32 boot_cksum;
28 char processor[16]; /* identifies ISA of boot */
29 /* there is more stuff after this that we don't need */
30};
31
32#define MAC_STATUS_BOOTABLE 8 /* partition is bootable */
33
34#define MAC_DRIVER_MAGIC 0x4552
35
36/* Driver descriptor structure, in block 0 */
37struct mac_driver_desc {
38 __be16 signature; /* expected to be MAC_DRIVER_MAGIC */
39 __be16 block_size;
40 __be32 block_count;
41 /* ... more stuff */
42};
43
44int mac_partition(struct parsed_partitions *state);
diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c
deleted file mode 100644
index 8752a5d2656..00000000000
--- a/block/partitions/msdos.c
+++ /dev/null
@@ -1,569 +0,0 @@
1/*
2 * fs/partitions/msdos.c
3 *
4 * Code extracted from drivers/block/genhd.c
5 * Copyright (C) 1991-1998 Linus Torvalds
6 *
7 * Thanks to Branko Lankester, lankeste@fwi.uva.nl, who found a bug
8 * in the early extended-partition checks and added DM partitions
9 *
10 * Support for DiskManager v6.0x added by Mark Lord,
11 * with information provided by OnTrack. This now works for linux fdisk
12 * and LILO, as well as loadlin and bootln. Note that disks other than
13 * /dev/hda *must* have a "DOS" type 0x51 partition in the first slot (hda1).
14 *
15 * More flexible handling of extended partitions - aeb, 950831
16 *
17 * Check partition table on IDE disks for common CHS translations
18 *
19 * Re-organised Feb 1998 Russell King
20 */
21#include <linux/msdos_fs.h>
22
23#include "check.h"
24#include "msdos.h"
25#include "efi.h"
26
27/*
28 * Many architectures don't like unaligned accesses, while
29 * the nr_sects and start_sect partition table entries are
30 * at a 2 (mod 4) address.
31 */
32#include <asm/unaligned.h>
33
34#define SYS_IND(p) get_unaligned(&p->sys_ind)
35
36static inline sector_t nr_sects(struct partition *p)
37{
38 return (sector_t)get_unaligned_le32(&p->nr_sects);
39}
40
41static inline sector_t start_sect(struct partition *p)
42{
43 return (sector_t)get_unaligned_le32(&p->start_sect);
44}
45
46static inline int is_extended_partition(struct partition *p)
47{
48 return (SYS_IND(p) == DOS_EXTENDED_PARTITION ||
49 SYS_IND(p) == WIN98_EXTENDED_PARTITION ||
50 SYS_IND(p) == LINUX_EXTENDED_PARTITION);
51}
52
53#define MSDOS_LABEL_MAGIC1 0x55
54#define MSDOS_LABEL_MAGIC2 0xAA
55
56static inline int
57msdos_magic_present(unsigned char *p)
58{
59 return (p[0] == MSDOS_LABEL_MAGIC1 && p[1] == MSDOS_LABEL_MAGIC2);
60}
61
62/* Value is EBCDIC 'IBMA' */
63#define AIX_LABEL_MAGIC1 0xC9
64#define AIX_LABEL_MAGIC2 0xC2
65#define AIX_LABEL_MAGIC3 0xD4
66#define AIX_LABEL_MAGIC4 0xC1
67static int aix_magic_present(struct parsed_partitions *state, unsigned char *p)
68{
69 struct partition *pt = (struct partition *) (p + 0x1be);
70 Sector sect;
71 unsigned char *d;
72 int slot, ret = 0;
73
74 if (!(p[0] == AIX_LABEL_MAGIC1 &&
75 p[1] == AIX_LABEL_MAGIC2 &&
76 p[2] == AIX_LABEL_MAGIC3 &&
77 p[3] == AIX_LABEL_MAGIC4))
78 return 0;
79	/* Assume the partition table is valid if Linux partitions exist */

80 for (slot = 1; slot <= 4; slot++, pt++) {
81 if (pt->sys_ind == LINUX_SWAP_PARTITION ||
82 pt->sys_ind == LINUX_RAID_PARTITION ||
83 pt->sys_ind == LINUX_DATA_PARTITION ||
84 pt->sys_ind == LINUX_LVM_PARTITION ||
85 is_extended_partition(pt))
86 return 0;
87 }
88 d = read_part_sector(state, 7, &sect);
89 if (d) {
90 if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M')
91 ret = 1;
92 put_dev_sector(sect);
93 };
94 return ret;
95}
96
97static void set_info(struct parsed_partitions *state, int slot,
98 u32 disksig)
99{
100 struct partition_meta_info *info = &state->parts[slot].info;
101
102 snprintf(info->uuid, sizeof(info->uuid), "%08x-%02x", disksig,
103 slot);
104 info->volname[0] = 0;
105 state->parts[slot].has_info = true;
106}
107
108/*
109 * Create devices for each logical partition in an extended partition.
110 * The logical partitions form a linked list, with each entry being
111 * a partition table with two entries. The first entry
112 * is the real data partition (with a start relative to the partition
113 * table start). The second is a pointer to the next logical partition
114 * (with a start relative to the entire extended partition).
115 * We do not create a Linux partition for the partition tables, but
116 * only for the actual data partitions.
117 */
118
119static void parse_extended(struct parsed_partitions *state,
120 sector_t first_sector, sector_t first_size,
121 u32 disksig)
122{
123 struct partition *p;
124 Sector sect;
125 unsigned char *data;
126 sector_t this_sector, this_size;
127 sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
128 int loopct = 0; /* number of links followed
129 without finding a data partition */
130 int i;
131
132 this_sector = first_sector;
133 this_size = first_size;
134
135 while (1) {
136 if (++loopct > 100)
137 return;
138 if (state->next == state->limit)
139 return;
140 data = read_part_sector(state, this_sector, &sect);
141 if (!data)
142 return;
143
144 if (!msdos_magic_present(data + 510))
145 goto done;
146
147 p = (struct partition *) (data + 0x1be);
148
149 /*
150 * Usually, the first entry is the real data partition,
151 * the 2nd entry is the next extended partition, or empty,
152 * and the 3rd and 4th entries are unused.
153 * However, DRDOS sometimes has the extended partition as
154 * the first entry (when the data partition is empty),
155 * and OS/2 seems to use all four entries.
156 */
157
158 /*
159 * First process the data partition(s)
160 */
161 for (i=0; i<4; i++, p++) {
162 sector_t offs, size, next;
163 if (!nr_sects(p) || is_extended_partition(p))
164 continue;
165
166 /* Check the 3rd and 4th entries -
167 these sometimes contain random garbage */
168 offs = start_sect(p)*sector_size;
169 size = nr_sects(p)*sector_size;
170 next = this_sector + offs;
171 if (i >= 2) {
172 if (offs + size > this_size)
173 continue;
174 if (next < first_sector)
175 continue;
176 if (next + size > first_sector + first_size)
177 continue;
178 }
179
180 put_partition(state, state->next, next, size);
181 set_info(state, state->next, disksig);
182 if (SYS_IND(p) == LINUX_RAID_PARTITION)
183 state->parts[state->next].flags = ADDPART_FLAG_RAID;
184 loopct = 0;
185 if (++state->next == state->limit)
186 goto done;
187 }
188 /*
189 * Next, process the (first) extended partition, if present.
190 * (So far, there seems to be no reason to make
191 * parse_extended() recursive and allow a tree
192 * of extended partitions.)
193 * It should be a link to the next logical partition.
194 */
195 p -= 4;
196 for (i=0; i<4; i++, p++)
197 if (nr_sects(p) && is_extended_partition(p))
198 break;
199 if (i == 4)
200 goto done; /* nothing left to do */
201
202 this_sector = first_sector + start_sect(p) * sector_size;
203 this_size = nr_sects(p) * sector_size;
204 put_dev_sector(sect);
205 }
206done:
207 put_dev_sector(sect);
208}
209
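Editor's note: the comment and code above describe how logical partitions chain together -- each EBR's first entry is the data partition (start relative to that EBR) and its second entry links to the next EBR (start relative to the whole extended partition). The sketch below walks a made-up chain to show the address arithmetic; it assumes 512-byte sectors and omits the logical-block-size scaling that parse_extended() applies.

#include <stdio.h>

/*
 * Illustrative only: the chain walk performed by parse_extended() above.
 * The sample chain, sizes and sector numbers are invented.
 */
struct ebr_entry {
	unsigned long start;	/* relative start, in sectors  */
	unsigned long size;	/* size in sectors, 0 = unused */
};

struct ebr {
	struct ebr_entry data;	/* first entry: the logical partition */
	struct ebr_entry next;	/* second entry: link to the next EBR */
};

int main(void)
{
	/* Hypothetical extended partition starting at sector 2048. */
	const unsigned long first_sector = 2048;
	const struct ebr chain[] = {
		{ { 63, 10000 }, { 10063, 20063 } },
		{ { 63, 20000 }, { 30126, 40063 } },
		{ { 63, 40000 }, {     0,     0 } },	/* end of chain */
	};
	unsigned long this_sector = first_sector;
	unsigned int i;

	for (i = 0; i < 3; i++) {
		/* Data entry: start is relative to the current EBR. */
		printf("logical %u: absolute start %lu, size %lu\n", i + 5,
		       this_sector + chain[i].data.start, chain[i].data.size);
		if (!chain[i].next.size)
			break;
		/* Link entry: start is relative to the extended partition. */
		this_sector = first_sector + chain[i].next.start;
	}
	return 0;
}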
210/* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also
211 indicates linux swap. Be careful before believing this is Solaris. */
212
213static void parse_solaris_x86(struct parsed_partitions *state,
214 sector_t offset, sector_t size, int origin)
215{
216#ifdef CONFIG_SOLARIS_X86_PARTITION
217 Sector sect;
218 struct solaris_x86_vtoc *v;
219 int i;
220 short max_nparts;
221
222 v = read_part_sector(state, offset + 1, &sect);
223 if (!v)
224 return;
225 if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) {
226 put_dev_sector(sect);
227 return;
228 }
229 {
230 char tmp[1 + BDEVNAME_SIZE + 10 + 11 + 1];
231
232 snprintf(tmp, sizeof(tmp), " %s%d: <solaris:", state->name, origin);
233 strlcat(state->pp_buf, tmp, PAGE_SIZE);
234 }
235 if (le32_to_cpu(v->v_version) != 1) {
236 char tmp[64];
237
238 snprintf(tmp, sizeof(tmp), " cannot handle version %d vtoc>\n",
239 le32_to_cpu(v->v_version));
240 strlcat(state->pp_buf, tmp, PAGE_SIZE);
241 put_dev_sector(sect);
242 return;
243 }
244 /* Ensure we can handle previous case of VTOC with 8 entries gracefully */
245 max_nparts = le16_to_cpu (v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8;
246 for (i=0; i<max_nparts && state->next<state->limit; i++) {
247 struct solaris_x86_slice *s = &v->v_slice[i];
248 char tmp[3 + 10 + 1 + 1];
249
250 if (s->s_size == 0)
251 continue;
252 snprintf(tmp, sizeof(tmp), " [s%d]", i);
253 strlcat(state->pp_buf, tmp, PAGE_SIZE);
254 /* solaris partitions are relative to current MS-DOS
255 * one; must add the offset of the current partition */
256 put_partition(state, state->next++,
257 le32_to_cpu(s->s_start)+offset,
258 le32_to_cpu(s->s_size));
259 }
260 put_dev_sector(sect);
261 strlcat(state->pp_buf, " >\n", PAGE_SIZE);
262#endif
263}
264
265#if defined(CONFIG_BSD_DISKLABEL)
266/*
267 * Create devices for BSD partitions listed in a disklabel, under a
268 * dos-like partition. See parse_extended() for more information.
269 */
270static void parse_bsd(struct parsed_partitions *state,
271 sector_t offset, sector_t size, int origin, char *flavour,
272 int max_partitions)
273{
274 Sector sect;
275 struct bsd_disklabel *l;
276 struct bsd_partition *p;
277 char tmp[64];
278
279 l = read_part_sector(state, offset + 1, &sect);
280 if (!l)
281 return;
282 if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) {
283 put_dev_sector(sect);
284 return;
285 }
286
287 snprintf(tmp, sizeof(tmp), " %s%d: <%s:", state->name, origin, flavour);
288 strlcat(state->pp_buf, tmp, PAGE_SIZE);
289
290 if (le16_to_cpu(l->d_npartitions) < max_partitions)
291 max_partitions = le16_to_cpu(l->d_npartitions);
292 for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) {
293 sector_t bsd_start, bsd_size;
294
295 if (state->next == state->limit)
296 break;
297 if (p->p_fstype == BSD_FS_UNUSED)
298 continue;
299 bsd_start = le32_to_cpu(p->p_offset);
300 bsd_size = le32_to_cpu(p->p_size);
301 if (offset == bsd_start && size == bsd_size)
302 /* full parent partition, we have it already */
303 continue;
304 if (offset > bsd_start || offset+size < bsd_start+bsd_size) {
305 strlcat(state->pp_buf, "bad subpartition - ignored\n", PAGE_SIZE);
306 continue;
307 }
308 put_partition(state, state->next++, bsd_start, bsd_size);
309 }
310 put_dev_sector(sect);
311 if (le16_to_cpu(l->d_npartitions) > max_partitions) {
312 snprintf(tmp, sizeof(tmp), " (ignored %d more)",
313 le16_to_cpu(l->d_npartitions) - max_partitions);
314 strlcat(state->pp_buf, tmp, PAGE_SIZE);
315 }
316 strlcat(state->pp_buf, " >\n", PAGE_SIZE);
317}
318#endif
319
320static void parse_freebsd(struct parsed_partitions *state,
321 sector_t offset, sector_t size, int origin)
322{
323#ifdef CONFIG_BSD_DISKLABEL
324 parse_bsd(state, offset, size, origin, "bsd", BSD_MAXPARTITIONS);
325#endif
326}
327
328static void parse_netbsd(struct parsed_partitions *state,
329 sector_t offset, sector_t size, int origin)
330{
331#ifdef CONFIG_BSD_DISKLABEL
332 parse_bsd(state, offset, size, origin, "netbsd", BSD_MAXPARTITIONS);
333#endif
334}
335
336static void parse_openbsd(struct parsed_partitions *state,
337 sector_t offset, sector_t size, int origin)
338{
339#ifdef CONFIG_BSD_DISKLABEL
340 parse_bsd(state, offset, size, origin, "openbsd",
341 OPENBSD_MAXPARTITIONS);
342#endif
343}
344
345/*
346 * Create devices for Unixware partitions listed in a disklabel, under a
347 * dos-like partition. See parse_extended() for more information.
348 */
349static void parse_unixware(struct parsed_partitions *state,
350 sector_t offset, sector_t size, int origin)
351{
352#ifdef CONFIG_UNIXWARE_DISKLABEL
353 Sector sect;
354 struct unixware_disklabel *l;
355 struct unixware_slice *p;
356
357 l = read_part_sector(state, offset + 29, &sect);
358 if (!l)
359 return;
360 if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC ||
361 le32_to_cpu(l->vtoc.v_magic) != UNIXWARE_DISKMAGIC2) {
362 put_dev_sector(sect);
363 return;
364 }
365 {
366 char tmp[1 + BDEVNAME_SIZE + 10 + 12 + 1];
367
368 snprintf(tmp, sizeof(tmp), " %s%d: <unixware:", state->name, origin);
369 strlcat(state->pp_buf, tmp, PAGE_SIZE);
370 }
371 p = &l->vtoc.v_slice[1];
372	/* I omit the 0th slice as it is the same as the whole disk. */
373 while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) {
374 if (state->next == state->limit)
375 break;
376
377 if (p->s_label != UNIXWARE_FS_UNUSED)
378 put_partition(state, state->next++,
379 le32_to_cpu(p->start_sect),
380 le32_to_cpu(p->nr_sects));
381 p++;
382 }
383 put_dev_sector(sect);
384 strlcat(state->pp_buf, " >\n", PAGE_SIZE);
385#endif
386}
387
388/*
389 * Minix 2.0.0/2.0.2 subpartition support.
390 * Anand Krishnamurthy <anandk@wiproge.med.ge.com>
391 * Rajeev V. Pillai <rajeevvp@yahoo.com>
392 */
393static void parse_minix(struct parsed_partitions *state,
394 sector_t offset, sector_t size, int origin)
395{
396#ifdef CONFIG_MINIX_SUBPARTITION
397 Sector sect;
398 unsigned char *data;
399 struct partition *p;
400 int i;
401
402 data = read_part_sector(state, offset, &sect);
403 if (!data)
404 return;
405
406 p = (struct partition *)(data + 0x1be);
407
408 /* The first sector of a Minix partition can have either
409 * a secondary MBR describing its subpartitions, or
410 * the normal boot sector. */
411 if (msdos_magic_present (data + 510) &&
412 SYS_IND(p) == MINIX_PARTITION) { /* subpartition table present */
413 char tmp[1 + BDEVNAME_SIZE + 10 + 9 + 1];
414
415 snprintf(tmp, sizeof(tmp), " %s%d: <minix:", state->name, origin);
416 strlcat(state->pp_buf, tmp, PAGE_SIZE);
417 for (i = 0; i < MINIX_NR_SUBPARTITIONS; i++, p++) {
418 if (state->next == state->limit)
419 break;
420 /* add each partition in use */
421 if (SYS_IND(p) == MINIX_PARTITION)
422 put_partition(state, state->next++,
423 start_sect(p), nr_sects(p));
424 }
425 strlcat(state->pp_buf, " >\n", PAGE_SIZE);
426 }
427 put_dev_sector(sect);
428#endif /* CONFIG_MINIX_SUBPARTITION */
429}
430
431static struct {
432 unsigned char id;
433 void (*parse)(struct parsed_partitions *, sector_t, sector_t, int);
434} subtypes[] = {
435 {FREEBSD_PARTITION, parse_freebsd},
436 {NETBSD_PARTITION, parse_netbsd},
437 {OPENBSD_PARTITION, parse_openbsd},
438 {MINIX_PARTITION, parse_minix},
439 {UNIXWARE_PARTITION, parse_unixware},
440 {SOLARIS_X86_PARTITION, parse_solaris_x86},
441 {NEW_SOLARIS_X86_PARTITION, parse_solaris_x86},
442 {0, NULL},
443};
444
445int msdos_partition(struct parsed_partitions *state)
446{
447 sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
448 Sector sect;
449 unsigned char *data;
450 struct partition *p;
451 struct fat_boot_sector *fb;
452 int slot;
453 u32 disksig;
454
455 data = read_part_sector(state, 0, &sect);
456 if (!data)
457 return -1;
458 if (!msdos_magic_present(data + 510)) {
459 put_dev_sector(sect);
460 return 0;
461 }
462
463 if (aix_magic_present(state, data)) {
464 put_dev_sector(sect);
465 strlcat(state->pp_buf, " [AIX]", PAGE_SIZE);
466 return 0;
467 }
468
469 /*
470 * Now that the 55aa signature is present, this is probably
471 * either the boot sector of a FAT filesystem or a DOS-type
472 * partition table. Reject this in case the boot indicator
473 * is not 0 or 0x80.
474 */
475 p = (struct partition *) (data + 0x1be);
476 for (slot = 1; slot <= 4; slot++, p++) {
477 if (p->boot_ind != 0 && p->boot_ind != 0x80) {
478 /*
479			 * Even without a valid boot indicator value
480			 * it's still possible this is a valid FAT filesystem
481 * without a partition table.
482 */
483 fb = (struct fat_boot_sector *) data;
484 if (slot == 1 && fb->reserved && fb->fats
485 && fat_valid_media(fb->media)) {
486 strlcat(state->pp_buf, "\n", PAGE_SIZE);
487 put_dev_sector(sect);
488 return 1;
489 } else {
490 put_dev_sector(sect);
491 return 0;
492 }
493 }
494 }
495
496#ifdef CONFIG_EFI_PARTITION
497 p = (struct partition *) (data + 0x1be);
498 for (slot = 1 ; slot <= 4 ; slot++, p++) {
499 /* If this is an EFI GPT disk, msdos should ignore it. */
500 if (SYS_IND(p) == EFI_PMBR_OSTYPE_EFI_GPT) {
501 put_dev_sector(sect);
502 return 0;
503 }
504 }
505#endif
506 p = (struct partition *) (data + 0x1be);
507
508 disksig = le32_to_cpup((__le32 *)(data + 0x1b8));
509
510 /*
511 * Look for partitions in two passes:
512 * First find the primary and DOS-type extended partitions.
513 * On the second pass look inside *BSD, Unixware and Solaris partitions.
514 */
515
516 state->next = 5;
517 for (slot = 1 ; slot <= 4 ; slot++, p++) {
518 sector_t start = start_sect(p)*sector_size;
519 sector_t size = nr_sects(p)*sector_size;
520 if (!size)
521 continue;
522 if (is_extended_partition(p)) {
523 /*
524 * prevent someone doing mkfs or mkswap on an
525 * extended partition, but leave room for LILO
526 * FIXME: this uses one logical sector for > 512b
527 * sector, although it may not be enough/proper.
528 */
529 sector_t n = 2;
530 n = min(size, max(sector_size, n));
531 put_partition(state, slot, start, n);
532
533 strlcat(state->pp_buf, " <", PAGE_SIZE);
534 parse_extended(state, start, size, disksig);
535 strlcat(state->pp_buf, " >", PAGE_SIZE);
536 continue;
537 }
538 put_partition(state, slot, start, size);
539 set_info(state, slot, disksig);
540 if (SYS_IND(p) == LINUX_RAID_PARTITION)
541 state->parts[slot].flags = ADDPART_FLAG_RAID;
542 if (SYS_IND(p) == DM6_PARTITION)
543 strlcat(state->pp_buf, "[DM]", PAGE_SIZE);
544 if (SYS_IND(p) == EZD_PARTITION)
545 strlcat(state->pp_buf, "[EZD]", PAGE_SIZE);
546 }
547
548 strlcat(state->pp_buf, "\n", PAGE_SIZE);
549
550 /* second pass - output for each on a separate line */
551 p = (struct partition *) (0x1be + data);
552 for (slot = 1 ; slot <= 4 ; slot++, p++) {
553 unsigned char id = SYS_IND(p);
554 int n;
555
556 if (!nr_sects(p))
557 continue;
558
559 for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++)
560 ;
561
562 if (!subtypes[n].parse)
563 continue;
564 subtypes[n].parse(state, start_sect(p) * sector_size,
565 nr_sects(p) * sector_size, slot);
566 }
567 put_dev_sector(sect);
568 return 1;
569}
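Editor's note: msdos_partition() above reads the 32-bit NT disk signature little-endian from byte offset 0x1b8 of the MBR, and set_info() turns it into a "%08x-%02x" identifier per slot. A stand-alone sketch of that derivation; the sample signature bytes are invented.

#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative only: deriving the pseudo-uuid that set_info() above
 * builds from the MBR disk signature and the partition slot number.
 */
static uint32_t le32(const uint8_t *p)
{
	return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
	       ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

int main(void)
{
	uint8_t mbr[512] = { 0 };
	char uuid[16];
	int slot = 1;

	/* Hypothetical disk signature 0x12345678, stored little-endian. */
	mbr[0x1b8] = 0x78;
	mbr[0x1b9] = 0x56;
	mbr[0x1ba] = 0x34;
	mbr[0x1bb] = 0x12;

	snprintf(uuid, sizeof(uuid), "%08x-%02x",
		 (unsigned int)le32(&mbr[0x1b8]), (unsigned int)slot);
	printf("partition %d pseudo-uuid: %s\n", slot, uuid);
	return 0;
}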
diff --git a/block/partitions/msdos.h b/block/partitions/msdos.h
deleted file mode 100644
index 38c781c490b..00000000000
--- a/block/partitions/msdos.h
+++ /dev/null
@@ -1,8 +0,0 @@
1/*
2 * fs/partitions/msdos.h
3 */
4
5#define MSDOS_LABEL_MAGIC 0xAA55
6
7int msdos_partition(struct parsed_partitions *state);
8
diff --git a/block/partitions/osf.c b/block/partitions/osf.c
deleted file mode 100644
index 764b86a0196..00000000000
--- a/block/partitions/osf.c
+++ /dev/null
@@ -1,86 +0,0 @@
1/*
2 * fs/partitions/osf.c
3 *
4 * Code extracted from drivers/block/genhd.c
5 *
6 * Copyright (C) 1991-1998 Linus Torvalds
7 * Re-organised Feb 1998 Russell King
8 */
9
10#include "check.h"
11#include "osf.h"
12
13#define MAX_OSF_PARTITIONS 18
14
15int osf_partition(struct parsed_partitions *state)
16{
17 int i;
18 int slot = 1;
19 unsigned int npartitions;
20 Sector sect;
21 unsigned char *data;
22 struct disklabel {
23 __le32 d_magic;
24 __le16 d_type,d_subtype;
25 u8 d_typename[16];
26 u8 d_packname[16];
27 __le32 d_secsize;
28 __le32 d_nsectors;
29 __le32 d_ntracks;
30 __le32 d_ncylinders;
31 __le32 d_secpercyl;
32 __le32 d_secprtunit;
33 __le16 d_sparespertrack;
34 __le16 d_sparespercyl;
35 __le32 d_acylinders;
36 __le16 d_rpm, d_interleave, d_trackskew, d_cylskew;
37 __le32 d_headswitch, d_trkseek, d_flags;
38 __le32 d_drivedata[5];
39 __le32 d_spare[5];
40 __le32 d_magic2;
41 __le16 d_checksum;
42 __le16 d_npartitions;
43 __le32 d_bbsize, d_sbsize;
44 struct d_partition {
45 __le32 p_size;
46 __le32 p_offset;
47 __le32 p_fsize;
48 u8 p_fstype;
49 u8 p_frag;
50 __le16 p_cpg;
51 } d_partitions[MAX_OSF_PARTITIONS];
52 } * label;
53 struct d_partition * partition;
54
55 data = read_part_sector(state, 0, &sect);
56 if (!data)
57 return -1;
58
59 label = (struct disklabel *) (data+64);
60 partition = label->d_partitions;
61 if (le32_to_cpu(label->d_magic) != DISKLABELMAGIC) {
62 put_dev_sector(sect);
63 return 0;
64 }
65 if (le32_to_cpu(label->d_magic2) != DISKLABELMAGIC) {
66 put_dev_sector(sect);
67 return 0;
68 }
69 npartitions = le16_to_cpu(label->d_npartitions);
70 if (npartitions > MAX_OSF_PARTITIONS) {
71 put_dev_sector(sect);
72 return 0;
73 }
74 for (i = 0 ; i < npartitions; i++, partition++) {
75 if (slot == state->limit)
76 break;
77 if (le32_to_cpu(partition->p_size))
78 put_partition(state, slot,
79 le32_to_cpu(partition->p_offset),
80 le32_to_cpu(partition->p_size));
81 slot++;
82 }
83 strlcat(state->pp_buf, "\n", PAGE_SIZE);
84 put_dev_sector(sect);
85 return 1;
86}
diff --git a/block/partitions/osf.h b/block/partitions/osf.h
deleted file mode 100644
index 20ed2315ec1..00000000000
--- a/block/partitions/osf.h
+++ /dev/null
@@ -1,7 +0,0 @@
1/*
2 * fs/partitions/osf.h
3 */
4
5#define DISKLABELMAGIC (0x82564557UL)
6
7int osf_partition(struct parsed_partitions *state);
diff --git a/block/partitions/sgi.c b/block/partitions/sgi.c
deleted file mode 100644
index ea8a86dceaf..00000000000
--- a/block/partitions/sgi.c
+++ /dev/null
@@ -1,82 +0,0 @@
1/*
2 * fs/partitions/sgi.c
3 *
4 * Code extracted from drivers/block/genhd.c
5 */
6
7#include "check.h"
8#include "sgi.h"
9
10struct sgi_disklabel {
11 __be32 magic_mushroom; /* Big fat spliff... */
12 __be16 root_part_num; /* Root partition number */
13 __be16 swap_part_num; /* Swap partition number */
14 s8 boot_file[16]; /* Name of boot file for ARCS */
15 u8 _unused0[48]; /* Device parameter useless crapola.. */
16 struct sgi_volume {
17 s8 name[8]; /* Name of volume */
18 __be32 block_num; /* Logical block number */
19 __be32 num_bytes; /* How big, in bytes */
20 } volume[15];
21 struct sgi_partition {
22 __be32 num_blocks; /* Size in logical blocks */
23 __be32 first_block; /* First logical block */
24 __be32 type; /* Type of this partition */
25 } partitions[16];
26 __be32 csum; /* Disk label checksum */
27 __be32 _unused1; /* Padding */
28};
29
30int sgi_partition(struct parsed_partitions *state)
31{
32 int i, csum;
33 __be32 magic;
34 int slot = 1;
35 unsigned int start, blocks;
36 __be32 *ui, cs;
37 Sector sect;
38 struct sgi_disklabel *label;
39 struct sgi_partition *p;
40 char b[BDEVNAME_SIZE];
41
42 label = read_part_sector(state, 0, &sect);
43 if (!label)
44 return -1;
45 p = &label->partitions[0];
46 magic = label->magic_mushroom;
47 if(be32_to_cpu(magic) != SGI_LABEL_MAGIC) {
48 /*printk("Dev %s SGI disklabel: bad magic %08x\n",
49 bdevname(bdev, b), be32_to_cpu(magic));*/
50 put_dev_sector(sect);
51 return 0;
52 }
53 ui = ((__be32 *) (label + 1)) - 1;
54 for(csum = 0; ui >= ((__be32 *) label);) {
55 cs = *ui--;
56 csum += be32_to_cpu(cs);
57 }
58 if(csum) {
59 printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n",
60 bdevname(state->bdev, b));
61 put_dev_sector(sect);
62 return 0;
63 }
64 /* All SGI disk labels have 16 partitions, disks under Linux only
65	 * have 15 minors. Luckily there are always a few zero length
66 * partitions which we don't care about so we never overflow the
67 * current_minor.
68 */
69 for(i = 0; i < 16; i++, p++) {
70 blocks = be32_to_cpu(p->num_blocks);
71 start = be32_to_cpu(p->first_block);
72 if (blocks) {
73 put_partition(state, slot, start, blocks);
74 if (be32_to_cpu(p->type) == LINUX_RAID_PARTITION)
75 state->parts[slot].flags = ADDPART_FLAG_RAID;
76 }
77 slot++;
78 }
79 strlcat(state->pp_buf, "\n", PAGE_SIZE);
80 put_dev_sector(sect);
81 return 1;
82}
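Editor's note: the integrity rule enforced above is that the 32-bit big-endian words of the whole SGI label, including the stored csum field, must sum to zero modulo 2^32. The sketch below shrinks the label to four words and keeps everything in host byte order for brevity; the word values are invented.

#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative only: the zero-sum checksum rule checked by
 * sgi_partition() above, on a tiny stand-in "label".  The kernel
 * converts each word from big-endian first; that step is omitted here.
 */
#define WORDS 4

int main(void)
{
	uint32_t label[WORDS] = { 0x0be5a941, 0x00010002, 0x12345678, 0 };
	uint32_t sum = 0;
	int i;

	/* Fill the csum slot so the total wraps to zero. */
	for (i = 0; i < WORDS - 1; i++)
		sum += label[i];
	label[WORDS - 1] = (uint32_t)(0u - sum);

	/* Verification pass over every word, including the csum itself. */
	sum = 0;
	for (i = 0; i < WORDS; i++)
		sum += label[i];
	printf("label checksum %s\n", sum == 0 ? "ok" : "bad");
	return 0;
}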
diff --git a/block/partitions/sgi.h b/block/partitions/sgi.h
deleted file mode 100644
index b9553ebdd5a..00000000000
--- a/block/partitions/sgi.h
+++ /dev/null
@@ -1,8 +0,0 @@
1/*
2 * fs/partitions/sgi.h
3 */
4
5extern int sgi_partition(struct parsed_partitions *state);
6
7#define SGI_LABEL_MAGIC 0x0be5a941
8
diff --git a/block/partitions/sun.c b/block/partitions/sun.c
deleted file mode 100644
index b5b6fcfb3d3..00000000000
--- a/block/partitions/sun.c
+++ /dev/null
@@ -1,122 +0,0 @@
1/*
2 * fs/partitions/sun.c
3 *
4 * Code extracted from drivers/block/genhd.c
5 *
6 * Copyright (C) 1991-1998 Linus Torvalds
7 * Re-organised Feb 1998 Russell King
8 */
9
10#include "check.h"
11#include "sun.h"
12
13int sun_partition(struct parsed_partitions *state)
14{
15 int i;
16 __be16 csum;
17 int slot = 1;
18 __be16 *ush;
19 Sector sect;
20 struct sun_disklabel {
21 unsigned char info[128]; /* Informative text string */
22 struct sun_vtoc {
23 __be32 version; /* Layout version */
24 char volume[8]; /* Volume name */
25 __be16 nparts; /* Number of partitions */
26 struct sun_info { /* Partition hdrs, sec 2 */
27 __be16 id;
28 __be16 flags;
29 } infos[8];
30 __be16 padding; /* Alignment padding */
31 __be32 bootinfo[3]; /* Info needed by mboot */
32 __be32 sanity; /* To verify vtoc sanity */
33 __be32 reserved[10]; /* Free space */
34 __be32 timestamp[8]; /* Partition timestamp */
35 } vtoc;
36 __be32 write_reinstruct; /* sectors to skip, writes */
37 __be32 read_reinstruct; /* sectors to skip, reads */
38 unsigned char spare[148]; /* Padding */
39 __be16 rspeed; /* Disk rotational speed */
40 __be16 pcylcount; /* Physical cylinder count */
41 __be16 sparecyl; /* extra sects per cylinder */
42 __be16 obs1; /* gap1 */
43 __be16 obs2; /* gap2 */
44 __be16 ilfact; /* Interleave factor */
45 __be16 ncyl; /* Data cylinder count */
46 __be16 nacyl; /* Alt. cylinder count */
47 __be16 ntrks; /* Tracks per cylinder */
48 __be16 nsect; /* Sectors per track */
49 __be16 obs3; /* bhead - Label head offset */
50 __be16 obs4; /* ppart - Physical Partition */
51 struct sun_partition {
52 __be32 start_cylinder;
53 __be32 num_sectors;
54 } partitions[8];
55 __be16 magic; /* Magic number */
56 __be16 csum; /* Label xor'd checksum */
57 } * label;
58 struct sun_partition *p;
59 unsigned long spc;
60 char b[BDEVNAME_SIZE];
61 int use_vtoc;
62 int nparts;
63
64 label = read_part_sector(state, 0, &sect);
65 if (!label)
66 return -1;
67
68 p = label->partitions;
69 if (be16_to_cpu(label->magic) != SUN_LABEL_MAGIC) {
70/* printk(KERN_INFO "Dev %s Sun disklabel: bad magic %04x\n",
71 bdevname(bdev, b), be16_to_cpu(label->magic)); */
72 put_dev_sector(sect);
73 return 0;
74 }
75 /* Look at the checksum */
76 ush = ((__be16 *) (label+1)) - 1;
77 for (csum = 0; ush >= ((__be16 *) label);)
78 csum ^= *ush--;
79 if (csum) {
80 printk("Dev %s Sun disklabel: Csum bad, label corrupted\n",
81 bdevname(state->bdev, b));
82 put_dev_sector(sect);
83 return 0;
84 }
85
86 /* Check to see if we can use the VTOC table */
87 use_vtoc = ((be32_to_cpu(label->vtoc.sanity) == SUN_VTOC_SANITY) &&
88 (be32_to_cpu(label->vtoc.version) == 1) &&
89 (be16_to_cpu(label->vtoc.nparts) <= 8));
90
91 /* Use 8 partition entries if not specified in validated VTOC */
92 nparts = (use_vtoc) ? be16_to_cpu(label->vtoc.nparts) : 8;
93
94 /*
95 * So that old Linux-Sun partitions continue to work,
96	 * allow the VTOC to be used under the additional condition ...
97 */
98 use_vtoc = use_vtoc || !(label->vtoc.sanity ||
99 label->vtoc.version || label->vtoc.nparts);
100 spc = be16_to_cpu(label->ntrks) * be16_to_cpu(label->nsect);
101 for (i = 0; i < nparts; i++, p++) {
102 unsigned long st_sector;
103 unsigned int num_sectors;
104
105 st_sector = be32_to_cpu(p->start_cylinder) * spc;
106 num_sectors = be32_to_cpu(p->num_sectors);
107 if (num_sectors) {
108 put_partition(state, slot, st_sector, num_sectors);
109 state->parts[slot].flags = 0;
110 if (use_vtoc) {
111 if (be16_to_cpu(label->vtoc.infos[i].id) == LINUX_RAID_PARTITION)
112 state->parts[slot].flags |= ADDPART_FLAG_RAID;
113 else if (be16_to_cpu(label->vtoc.infos[i].id) == SUN_WHOLE_DISK)
114 state->parts[slot].flags |= ADDPART_FLAG_WHOLEDISK;
115 }
116 }
117 slot++;
118 }
119 strlcat(state->pp_buf, "\n", PAGE_SIZE);
120 put_dev_sector(sect);
121 return 1;
122}
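Editor's note: two pieces of arithmetic from sun_partition() above, in a stand-alone sketch. First, the label's 16-bit words, including the stored csum, must XOR to zero; second, partition starts are recorded in cylinders and converted to sectors with spc = ntrks * nsect. Endianness conversion is omitted and all values are hypothetical.

#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative only: the XOR checksum and the cylinder-to-sector
 * conversion used by sun_partition() above, on made-up values.
 */
#define WORDS 8

int main(void)
{
	uint16_t label[WORDS] = { 0xDABE, 0x0001, 0x1234, 0x5678,
				  0x0008, 0x00FF, 0x003F, 0 };
	uint16_t csum = 0;
	int i;

	/* Fill the csum slot so the whole label XORs to zero, then re-check. */
	for (i = 0; i < WORDS - 1; i++)
		csum ^= label[i];
	label[WORDS - 1] = csum;
	for (csum = 0, i = 0; i < WORDS; i++)
		csum ^= label[i];
	printf("label checksum %s\n", csum == 0 ? "ok" : "bad");

	/* Cylinder -> sector conversion: spc = ntrks * nsect. */
	{
		unsigned long ntrks = 255, nsect = 63, start_cyl = 4;
		unsigned long spc = ntrks * nsect;

		printf("partition starts at sector %lu\n", start_cyl * spc);
	}
	return 0;
}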
diff --git a/block/partitions/sun.h b/block/partitions/sun.h
deleted file mode 100644
index 2424baa8319..00000000000
--- a/block/partitions/sun.h
+++ /dev/null
@@ -1,8 +0,0 @@
1/*
2 * fs/partitions/sun.h
3 */
4
5#define SUN_LABEL_MAGIC 0xDABE
6#define SUN_VTOC_SANITY 0x600DDEEE
7
8int sun_partition(struct parsed_partitions *state);
diff --git a/block/partitions/sysv68.c b/block/partitions/sysv68.c
deleted file mode 100644
index 9627ccffc1c..00000000000
--- a/block/partitions/sysv68.c
+++ /dev/null
@@ -1,95 +0,0 @@
1/*
2 * fs/partitions/sysv68.c
3 *
4 * Copyright (C) 2007 Philippe De Muyter <phdm@macqel.be>
5 */
6
7#include "check.h"
8#include "sysv68.h"
9
10/*
11 * Volume ID structure: on first 256-bytes sector of disk
12 */
13
14struct volumeid {
15 u8 vid_unused[248];
16 u8 vid_mac[8]; /* ASCII string "MOTOROLA" */
17};
18
19/*
20 * config block: second 256-bytes sector on disk
21 */
22
23struct dkconfig {
24 u8 ios_unused0[128];
25 __be32 ios_slcblk; /* Slice table block number */
26 __be16 ios_slccnt; /* Number of entries in slice table */
27 u8 ios_unused1[122];
28};
29
30/*
31 * combined volumeid and dkconfig block
32 */
33
34struct dkblk0 {
35 struct volumeid dk_vid;
36 struct dkconfig dk_ios;
37};
38
39/*
40 * Slice Table Structure
41 */
42
43struct slice {
44 __be32 nblocks; /* slice size (in blocks) */
45 __be32 blkoff; /* block offset of slice */
46};
47
48
49int sysv68_partition(struct parsed_partitions *state)
50{
51 int i, slices;
52 int slot = 1;
53 Sector sect;
54 unsigned char *data;
55 struct dkblk0 *b;
56 struct slice *slice;
57 char tmp[64];
58
59 data = read_part_sector(state, 0, &sect);
60 if (!data)
61 return -1;
62
63 b = (struct dkblk0 *)data;
64 if (memcmp(b->dk_vid.vid_mac, "MOTOROLA", sizeof(b->dk_vid.vid_mac))) {
65 put_dev_sector(sect);
66 return 0;
67 }
68 slices = be16_to_cpu(b->dk_ios.ios_slccnt);
69 i = be32_to_cpu(b->dk_ios.ios_slcblk);
70 put_dev_sector(sect);
71
72 data = read_part_sector(state, i, &sect);
73 if (!data)
74 return -1;
75
76 slices -= 1; /* last slice is the whole disk */
77 snprintf(tmp, sizeof(tmp), "sysV68: %s(s%u)", state->name, slices);
78 strlcat(state->pp_buf, tmp, PAGE_SIZE);
79 slice = (struct slice *)data;
80 for (i = 0; i < slices; i++, slice++) {
81 if (slot == state->limit)
82 break;
83 if (be32_to_cpu(slice->nblocks)) {
84 put_partition(state, slot,
85 be32_to_cpu(slice->blkoff),
86 be32_to_cpu(slice->nblocks));
87 snprintf(tmp, sizeof(tmp), "(s%u)", i);
88 strlcat(state->pp_buf, tmp, PAGE_SIZE);
89 }
90 slot++;
91 }
92 strlcat(state->pp_buf, "\n", PAGE_SIZE);
93 put_dev_sector(sect);
94 return 1;
95}
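Editor's note: sysv68_partition() above finds the "MOTOROLA" volume-id string in the first 512-byte sector, then takes the big-endian slice-table block number and entry count from the config half of that sector, skipping the final slice because it covers the whole disk. The sketch below hard-codes byte offsets 248, 384 and 388, which follow from the structure layout above assuming no padding; the sample sector contents are invented.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/*
 * Illustrative only: the fields pulled out of sector 0 by
 * sysv68_partition() above, on a synthetic buffer.
 */
static uint32_t be32(const uint8_t *p)
{
	return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
	       ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
}

static uint16_t be16(const uint8_t *p)
{
	return (uint16_t)((p[0] << 8) | p[1]);
}

int main(void)
{
	uint8_t sector0[512] = { 0 };

	memcpy(&sector0[248], "MOTOROLA", 8);	/* vid_mac          */
	sector0[384 + 3] = 2;			/* ios_slcblk = 2   */
	sector0[388 + 1] = 5;			/* ios_slccnt = 5   */

	if (memcmp(&sector0[248], "MOTOROLA", 8) != 0) {
		printf("not a sysV68 disk\n");
		return 0;
	}
	/* The last slice is the whole disk, so only slccnt - 1 are reported. */
	printf("slice table in block %u, %u data slices\n",
	       (unsigned int)be32(&sector0[384]),
	       (unsigned int)(be16(&sector0[388]) - 1));
	return 0;
}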
diff --git a/block/partitions/sysv68.h b/block/partitions/sysv68.h
deleted file mode 100644
index bf2f5ffa97a..00000000000
--- a/block/partitions/sysv68.h
+++ /dev/null
@@ -1 +0,0 @@
1extern int sysv68_partition(struct parsed_partitions *state);
diff --git a/block/partitions/ultrix.c b/block/partitions/ultrix.c
deleted file mode 100644
index 8dbaf9f77a9..00000000000
--- a/block/partitions/ultrix.c
+++ /dev/null
@@ -1,48 +0,0 @@
1/*
2 * fs/partitions/ultrix.c
3 *
4 * Code extracted from drivers/block/genhd.c
5 *
6 * Re-organised Jul 1999 Russell King
7 */
8
9#include "check.h"
10#include "ultrix.h"
11
12int ultrix_partition(struct parsed_partitions *state)
13{
14 int i;
15 Sector sect;
16 unsigned char *data;
17 struct ultrix_disklabel {
18		s32	pt_magic;	/* magic no. indicating part. info exists */
19 s32 pt_valid; /* set by driver if pt is current */
20 struct pt_info {
21 s32 pi_nblocks; /* no. of sectors */
22 u32 pi_blkoff; /* block offset for start */
23 } pt_part[8];
24 } *label;
25
26#define PT_MAGIC 0x032957 /* Partition magic number */
27#define PT_VALID 1 /* Indicates if struct is valid */
28
29 data = read_part_sector(state, (16384 - sizeof(*label))/512, &sect);
30 if (!data)
31 return -1;
32
33 label = (struct ultrix_disklabel *)(data + 512 - sizeof(*label));
34
35 if (label->pt_magic == PT_MAGIC && label->pt_valid == PT_VALID) {
36 for (i=0; i<8; i++)
37 if (label->pt_part[i].pi_nblocks)
38 put_partition(state, i+1,
39 label->pt_part[i].pi_blkoff,
40 label->pt_part[i].pi_nblocks);
41 put_dev_sector(sect);
42 strlcat(state->pp_buf, "\n", PAGE_SIZE);
43 return 1;
44 } else {
45 put_dev_sector(sect);
46 return 0;
47 }
48}
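Editor's note: ultrix_partition() above relies on the disklabel occupying the last sizeof(label) bytes of the first 16384 bytes of the disk, so it reads 512-byte sector (16384 - sizeof(label)) / 512 and finds the label at offset 512 - sizeof(label) within it. A stand-alone check of that arithmetic, assuming the unpadded 72-byte label size implied by the structure above:

#include <stdio.h>

/*
 * Illustrative only: where the Ultrix disklabel sits, per the placement
 * rule used by ultrix_partition() above.
 */
int main(void)
{
	/* 2 * 4 bytes of header + 8 partition entries of 2 * 4 bytes each. */
	const unsigned long label_size = 2 * 4 + 8 * (2 * 4);
	unsigned long sector = (16384 - label_size) / 512;
	unsigned long offset = 512 - label_size;

	printf("label: sector %lu, offset %lu, ends at byte %lu\n",
	       sector, offset, sector * 512 + offset + label_size);
	return 0;
}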
diff --git a/block/partitions/ultrix.h b/block/partitions/ultrix.h
deleted file mode 100644
index a3cc00b2bde..00000000000
--- a/block/partitions/ultrix.h
+++ /dev/null
@@ -1,5 +0,0 @@
1/*
2 * fs/partitions/ultrix.h
3 */
4
5int ultrix_partition(struct parsed_partitions *state);
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 9a87daa6f4f..4f4230b79bb 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -24,7 +24,6 @@
24#include <linux/capability.h> 24#include <linux/capability.h>
25#include <linux/completion.h> 25#include <linux/completion.h>
26#include <linux/cdrom.h> 26#include <linux/cdrom.h>
27#include <linux/ratelimit.h>
28#include <linux/slab.h> 27#include <linux/slab.h>
29#include <linux/times.h> 28#include <linux/times.h>
30#include <asm/uaccess.h> 29#include <asm/uaccess.h>
@@ -566,7 +565,7 @@ int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mod
566{ 565{
567 int err; 566 int err;
568 567
569 if (!q) 568 if (!q || blk_get_queue(q))
570 return -ENXIO; 569 return -ENXIO;
571 570
572 switch (cmd) { 571 switch (cmd) {
@@ -687,64 +686,11 @@ int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mod
687 err = -ENOTTY; 686 err = -ENOTTY;
688 } 687 }
689 688
689 blk_put_queue(q);
690 return err; 690 return err;
691} 691}
692EXPORT_SYMBOL(scsi_cmd_ioctl); 692EXPORT_SYMBOL(scsi_cmd_ioctl);
693 693
694int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd)
695{
696 if (bd && bd == bd->bd_contains)
697 return 0;
698
699 /* Actually none of these is particularly useful on a partition,
700 * but they are safe.
701 */
702 switch (cmd) {
703 case SCSI_IOCTL_GET_IDLUN:
704 case SCSI_IOCTL_GET_BUS_NUMBER:
705 case SCSI_IOCTL_GET_PCI:
706 case SCSI_IOCTL_PROBE_HOST:
707 case SG_GET_VERSION_NUM:
708 case SG_SET_TIMEOUT:
709 case SG_GET_TIMEOUT:
710 case SG_GET_RESERVED_SIZE:
711 case SG_SET_RESERVED_SIZE:
712 case SG_EMULATED_HOST:
713 return 0;
714 case CDROM_GET_CAPABILITY:
715 /* Keep this until we remove the printk below. udev sends it
716 * and we do not want to spam dmesg about it. CD-ROMs do
717 * not have partitions, so we get here only for disks.
718 */
719 return -ENOIOCTLCMD;
720 default:
721 break;
722 }
723
724 if (capable(CAP_SYS_RAWIO))
725 return 0;
726
727 /* In particular, rule out all resets and host-specific ioctls. */
728 printk_ratelimited(KERN_WARNING
729 "%s: sending ioctl %x to a partition!\n", current->comm, cmd);
730
731 return -ENOIOCTLCMD;
732}
733EXPORT_SYMBOL(scsi_verify_blk_ioctl);
734
735int scsi_cmd_blk_ioctl(struct block_device *bd, fmode_t mode,
736 unsigned int cmd, void __user *arg)
737{
738 int ret;
739
740 ret = scsi_verify_blk_ioctl(bd, cmd);
741 if (ret < 0)
742 return ret;
743
744 return scsi_cmd_ioctl(bd->bd_disk->queue, bd->bd_disk, mode, cmd, arg);
745}
746EXPORT_SYMBOL(scsi_cmd_blk_ioctl);
747
748static int __init blk_scsi_ioctl_init(void) 694static int __init blk_scsi_ioctl_init(void)
749{ 695{
750 blk_set_cmd_filter_defaults(&blk_default_cmd_filter); 696 blk_set_cmd_filter_defaults(&blk_default_cmd_filter);