aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/ABI/testing/sysfs-driver-xen-blkback17
-rw-r--r--Documentation/ABI/testing/sysfs-driver-xen-blkfront10
-rw-r--r--Documentation/cgroups/blkio-controller.txt29
-rw-r--r--MAINTAINERS2
-rw-r--r--block/blk-cgroup.c105
-rw-r--r--block/blk-cgroup.h38
-rw-r--r--block/blk-throttle.c1064
-rw-r--r--drivers/block/Kconfig4
-rw-r--r--drivers/block/drbd/drbd_actlog.c21
-rw-r--r--drivers/block/drbd/drbd_int.h15
-rw-r--r--drivers/block/drbd/drbd_main.c61
-rw-r--r--drivers/block/drbd/drbd_nl.c185
-rw-r--r--drivers/block/drbd/drbd_receiver.c12
-rw-r--r--drivers/block/drbd/drbd_state.c4
-rw-r--r--drivers/block/rsxx/core.c359
-rw-r--r--drivers/block/rsxx/cregs.c14
-rw-r--r--drivers/block/rsxx/dev.c33
-rw-r--r--drivers/block/rsxx/dma.c185
-rw-r--r--drivers/block/rsxx/rsxx_priv.h10
-rw-r--r--drivers/block/xen-blkback/blkback.c872
-rw-r--r--drivers/block/xen-blkback/common.h147
-rw-r--r--drivers/block/xen-blkback/xenbus.c85
-rw-r--r--drivers/block/xen-blkfront.c532
-rw-r--r--include/linux/cgroup.h2
-rw-r--r--include/linux/drbd.h6
-rw-r--r--include/linux/drbd_genl.h2
-rw-r--r--include/linux/drbd_limits.h9
-rw-r--r--include/xen/interface/io/blkif.h53
-rw-r--r--include/xen/interface/io/ring.h5
29 files changed, 2854 insertions, 1027 deletions
diff --git a/Documentation/ABI/testing/sysfs-driver-xen-blkback b/Documentation/ABI/testing/sysfs-driver-xen-blkback
new file mode 100644
index 000000000000..8bb43b66eb55
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-driver-xen-blkback
@@ -0,0 +1,17 @@
1What: /sys/module/xen_blkback/parameters/max_buffer_pages
2Date: March 2013
3KernelVersion: 3.11
4Contact: Roger Pau Monné <roger.pau@citrix.com>
5Description:
6 Maximum number of free pages to keep in each block
7 backend buffer.
8
9What: /sys/module/xen_blkback/parameters/max_persistent_grants
10Date: March 2013
11KernelVersion: 3.11
12Contact: Roger Pau Monné <roger.pau@citrix.com>
13Description:
14 Maximum number of grants to map persistently in
15 blkback. If the frontend tries to use more than
16 max_persistent_grants, the LRU kicks in and starts
17 removing 5% of max_persistent_grants every 100ms.
diff --git a/Documentation/ABI/testing/sysfs-driver-xen-blkfront b/Documentation/ABI/testing/sysfs-driver-xen-blkfront
new file mode 100644
index 000000000000..c0a6cb7eb314
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-driver-xen-blkfront
@@ -0,0 +1,10 @@
1What: /sys/module/xen_blkfront/parameters/max
2Date: June 2013
3KernelVersion: 3.11
4Contact: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
5Description:
6 Maximum number of segments that the frontend will negotiate
7 with the backend for indirect descriptors. The default value
8 is 32 - higher value means more potential throughput but more
9 memory usage. The backend picks the minimum of the frontend
10 and its default backend value.
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index da272c8f44e7..cd556b914786 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -94,11 +94,13 @@ Throttling/Upper Limit policy
94 94
95Hierarchical Cgroups 95Hierarchical Cgroups
96==================== 96====================
97- Currently only CFQ supports hierarchical groups. For throttling,
98 cgroup interface does allow creation of hierarchical cgroups and
99 internally it treats them as flat hierarchy.
100 97
101 If somebody created a hierarchy like as follows. 98Both CFQ and throttling implement hierarchy support; however,
99throttling's hierarchy support is enabled iff "sane_behavior" is
100enabled from cgroup side, which currently is a development option and
101not publicly available.
102
103If somebody created a hierarchy as follows.
102 104
103 root 105 root
104 / \ 106 / \
@@ -106,21 +108,20 @@ Hierarchical Cgroups
106 | 108 |
107 test3 109 test3
108 110
109 CFQ will handle the hierarchy correctly but and throttling will 111CFQ by default and throttling with "sane_behavior" will handle the
110 practically treat all groups at same level. For details on CFQ 112hierarchy correctly. For details on CFQ hierarchy support, refer to
111 hierarchy support, refer to Documentation/block/cfq-iosched.txt. 113Documentation/block/cfq-iosched.txt. For throttling, all limits apply
112 Throttling will treat the hierarchy as if it looks like the 114to the whole subtree while all statistics are local to the IOs
113 following. 115directly generated by tasks in that cgroup.
116
117Throttling without "sane_behavior" enabled from cgroup side will
118practically treat all groups at same level as if it looks like the
119following.
114 120
115 pivot 121 pivot
116 / / \ \ 122 / / \ \
117 root test1 test2 test3 123 root test1 test2 test3
118 124
119 Nesting cgroups, while allowed, isn't officially supported and blkio
120 genereates warning when cgroups nest. Once throttling implements
121 hierarchy support, hierarchy will be supported and the warning will
122 be removed.
123
124Various user visible config options 125Various user visible config options
125=================================== 126===================================
126CONFIG_BLK_CGROUP 127CONFIG_BLK_CGROUP
diff --git a/MAINTAINERS b/MAINTAINERS
index 58814f0f06bd..a0a76fb7323f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3297,7 +3297,7 @@ F: Documentation/firmware_class/
3297F: drivers/base/firmware*.c 3297F: drivers/base/firmware*.c
3298F: include/linux/firmware.h 3298F: include/linux/firmware.h
3299 3299
3300FLASHSYSTEM DRIVER (IBM FlashSystem 70/80 PCI SSD Flash Card) 3300FLASH ADAPTER DRIVER (IBM Flash Adapter 900GB Full Height PCI Flash Card)
3301M: Joshua Morris <josh.h.morris@us.ibm.com> 3301M: Joshua Morris <josh.h.morris@us.ibm.com>
3302M: Philip Kelleher <pjk1939@linux.vnet.ibm.com> 3302M: Philip Kelleher <pjk1939@linux.vnet.ibm.com>
3303S: Maintained 3303S: Maintained
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index e8918ffaf96d..290792a13e3c 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -32,26 +32,6 @@ EXPORT_SYMBOL_GPL(blkcg_root);
32 32
33static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; 33static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
34 34
35static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
36 struct request_queue *q, bool update_hint);
37
38/**
39 * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
40 * @d_blkg: loop cursor pointing to the current descendant
41 * @pos_cgrp: used for iteration
42 * @p_blkg: target blkg to walk descendants of
43 *
44 * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU
45 * read locked. If called under either blkcg or queue lock, the iteration
46 * is guaranteed to include all and only online blkgs. The caller may
47 * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip
48 * subtree.
49 */
50#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg) \
51 cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
52 if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \
53 (p_blkg)->q, false)))
54
55static bool blkcg_policy_enabled(struct request_queue *q, 35static bool blkcg_policy_enabled(struct request_queue *q,
56 const struct blkcg_policy *pol) 36 const struct blkcg_policy *pol)
57{ 37{
@@ -71,18 +51,8 @@ static void blkg_free(struct blkcg_gq *blkg)
71 if (!blkg) 51 if (!blkg)
72 return; 52 return;
73 53
74 for (i = 0; i < BLKCG_MAX_POLS; i++) { 54 for (i = 0; i < BLKCG_MAX_POLS; i++)
75 struct blkcg_policy *pol = blkcg_policy[i]; 55 kfree(blkg->pd[i]);
76 struct blkg_policy_data *pd = blkg->pd[i];
77
78 if (!pd)
79 continue;
80
81 if (pol && pol->pd_exit_fn)
82 pol->pd_exit_fn(blkg);
83
84 kfree(pd);
85 }
86 56
87 blk_exit_rl(&blkg->rl); 57 blk_exit_rl(&blkg->rl);
88 kfree(blkg); 58 kfree(blkg);
@@ -134,10 +104,6 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
134 blkg->pd[i] = pd; 104 blkg->pd[i] = pd;
135 pd->blkg = blkg; 105 pd->blkg = blkg;
136 pd->plid = i; 106 pd->plid = i;
137
138 /* invoke per-policy init */
139 if (pol->pd_init_fn)
140 pol->pd_init_fn(blkg);
141 } 107 }
142 108
143 return blkg; 109 return blkg;
@@ -158,8 +124,8 @@ err_free:
158 * @q's bypass state. If @update_hint is %true, the caller should be 124 * @q's bypass state. If @update_hint is %true, the caller should be
159 * holding @q->queue_lock and lookup hint is updated on success. 125 * holding @q->queue_lock and lookup hint is updated on success.
160 */ 126 */
161static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, 127struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
162 struct request_queue *q, bool update_hint) 128 bool update_hint)
163{ 129{
164 struct blkcg_gq *blkg; 130 struct blkcg_gq *blkg;
165 131
@@ -234,16 +200,25 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
234 } 200 }
235 blkg = new_blkg; 201 blkg = new_blkg;
236 202
237 /* link parent and insert */ 203 /* link parent */
238 if (blkcg_parent(blkcg)) { 204 if (blkcg_parent(blkcg)) {
239 blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); 205 blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
240 if (WARN_ON_ONCE(!blkg->parent)) { 206 if (WARN_ON_ONCE(!blkg->parent)) {
241 blkg = ERR_PTR(-EINVAL); 207 ret = -EINVAL;
242 goto err_put_css; 208 goto err_put_css;
243 } 209 }
244 blkg_get(blkg->parent); 210 blkg_get(blkg->parent);
245 } 211 }
246 212
213 /* invoke per-policy init */
214 for (i = 0; i < BLKCG_MAX_POLS; i++) {
215 struct blkcg_policy *pol = blkcg_policy[i];
216
217 if (blkg->pd[i] && pol->pd_init_fn)
218 pol->pd_init_fn(blkg);
219 }
220
221 /* insert */
247 spin_lock(&blkcg->lock); 222 spin_lock(&blkcg->lock);
248 ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); 223 ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
249 if (likely(!ret)) { 224 if (likely(!ret)) {
@@ -394,30 +369,38 @@ static void blkg_destroy_all(struct request_queue *q)
394 q->root_rl.blkg = NULL; 369 q->root_rl.blkg = NULL;
395} 370}
396 371
397static void blkg_rcu_free(struct rcu_head *rcu_head) 372/*
373 * A group is RCU protected, but having an rcu lock does not mean that one
374 * can access all the fields of blkg and assume these are valid. For
375 * example, don't try to follow throtl_data and request queue links.
376 *
377 * Having a reference to blkg under an rcu allows accesses to only values
378 * local to groups like group stats and group rate limits.
379 */
380void __blkg_release_rcu(struct rcu_head *rcu_head)
398{ 381{
399 blkg_free(container_of(rcu_head, struct blkcg_gq, rcu_head)); 382 struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head);
400} 383 int i;
384
385 /* tell policies that this one is being freed */
386 for (i = 0; i < BLKCG_MAX_POLS; i++) {
387 struct blkcg_policy *pol = blkcg_policy[i];
388
389 if (blkg->pd[i] && pol->pd_exit_fn)
390 pol->pd_exit_fn(blkg);
391 }
401 392
402void __blkg_release(struct blkcg_gq *blkg)
403{
404 /* release the blkcg and parent blkg refs this blkg has been holding */ 393 /* release the blkcg and parent blkg refs this blkg has been holding */
405 css_put(&blkg->blkcg->css); 394 css_put(&blkg->blkcg->css);
406 if (blkg->parent) 395 if (blkg->parent) {
396 spin_lock_irq(blkg->q->queue_lock);
407 blkg_put(blkg->parent); 397 blkg_put(blkg->parent);
398 spin_unlock_irq(blkg->q->queue_lock);
399 }
408 400
409 /* 401 blkg_free(blkg);
410 * A group is freed in rcu manner. But having an rcu lock does not
411 * mean that one can access all the fields of blkg and assume these
412 * are valid. For example, don't try to follow throtl_data and
413 * request queue links.
414 *
415 * Having a reference to blkg under an rcu allows acess to only
416 * values local to groups like group stats and group rate limits
417 */
418 call_rcu(&blkg->rcu_head, blkg_rcu_free);
419} 402}
420EXPORT_SYMBOL_GPL(__blkg_release); 403EXPORT_SYMBOL_GPL(__blkg_release_rcu);
421 404
422/* 405/*
423 * The next function used by blk_queue_for_each_rl(). It's a bit tricky 406 * The next function used by blk_queue_for_each_rl(). It's a bit tricky
@@ -928,14 +911,6 @@ struct cgroup_subsys blkio_subsys = {
928 .subsys_id = blkio_subsys_id, 911 .subsys_id = blkio_subsys_id,
929 .base_cftypes = blkcg_files, 912 .base_cftypes = blkcg_files,
930 .module = THIS_MODULE, 913 .module = THIS_MODULE,
931
932 /*
933 * blkio subsystem is utterly broken in terms of hierarchy support.
934 * It treats all cgroups equally regardless of where they're
935 * located in the hierarchy - all cgroups are treated as if they're
936 * right below the root. Fix it and remove the following.
937 */
938 .broken_hierarchy = true,
939}; 914};
940EXPORT_SYMBOL_GPL(blkio_subsys); 915EXPORT_SYMBOL_GPL(blkio_subsys);
941 916
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 4e595ee8c915..8056c03a3382 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -266,7 +266,7 @@ static inline void blkg_get(struct blkcg_gq *blkg)
266 blkg->refcnt++; 266 blkg->refcnt++;
267} 267}
268 268
269void __blkg_release(struct blkcg_gq *blkg); 269void __blkg_release_rcu(struct rcu_head *rcu);
270 270
271/** 271/**
272 * blkg_put - put a blkg reference 272 * blkg_put - put a blkg reference
@@ -279,9 +279,43 @@ static inline void blkg_put(struct blkcg_gq *blkg)
279 lockdep_assert_held(blkg->q->queue_lock); 279 lockdep_assert_held(blkg->q->queue_lock);
280 WARN_ON_ONCE(blkg->refcnt <= 0); 280 WARN_ON_ONCE(blkg->refcnt <= 0);
281 if (!--blkg->refcnt) 281 if (!--blkg->refcnt)
282 __blkg_release(blkg); 282 call_rcu(&blkg->rcu_head, __blkg_release_rcu);
283} 283}
284 284
285struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
286 bool update_hint);
287
288/**
289 * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
290 * @d_blkg: loop cursor pointing to the current descendant
291 * @pos_cgrp: used for iteration
292 * @p_blkg: target blkg to walk descendants of
293 *
294 * Walk @d_blkg through the descendants of @p_blkg. Must be used with RCU
295 * read locked. If called under either blkcg or queue lock, the iteration
296 * is guaranteed to include all and only online blkgs. The caller may
297 * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip
298 * subtree.
299 */
300#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg) \
301 cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
302 if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \
303 (p_blkg)->q, false)))
304
305/**
306 * blkg_for_each_descendant_post - post-order walk of a blkg's descendants
307 * @d_blkg: loop cursor pointing to the current descendant
308 * @pos_cgrp: used for iteration
309 * @p_blkg: target blkg to walk descendants of
310 *
311 * Similar to blkg_for_each_descendant_pre() but performs post-order
312 * traversal instead. Synchronization rules are the same.
313 */
314#define blkg_for_each_descendant_post(d_blkg, pos_cgrp, p_blkg) \
315 cgroup_for_each_descendant_post((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
316 if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \
317 (p_blkg)->q, false)))
318
285/** 319/**
286 * blk_get_rl - get request_list to use 320 * blk_get_rl - get request_list to use
287 * @q: request_queue of interest 321 * @q: request_queue of interest
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 31146225f3d0..08a32dfd3844 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -25,18 +25,61 @@ static struct blkcg_policy blkcg_policy_throtl;
25 25
26/* A workqueue to queue throttle related work */ 26/* A workqueue to queue throttle related work */
27static struct workqueue_struct *kthrotld_workqueue; 27static struct workqueue_struct *kthrotld_workqueue;
28static void throtl_schedule_delayed_work(struct throtl_data *td, 28
29 unsigned long delay); 29/*
30 30 * To implement hierarchical throttling, throtl_grps form a tree and bios
31struct throtl_rb_root { 31 * are dispatched upwards level by level until they reach the top and get
32 struct rb_root rb; 32 * issued. When dispatching bios from the children and local group at each
33 struct rb_node *left; 33 * level, if the bios are dispatched into a single bio_list, there's a risk
34 unsigned int count; 34 * of a local or child group which can queue many bios at once filling up
35 unsigned long min_disptime; 35 * the list starving others.
36 *
37 * To avoid such starvation, dispatched bios are queued separately
38 * according to where they came from. When they are again dispatched to
39 * the parent, they're popped in round-robin order so that no single source
40 * hogs the dispatch window.
41 *
42 * throtl_qnode is used to keep the queued bios separated by their sources.
43 * Bios are queued to throtl_qnode which in turn is queued to
44 * throtl_service_queue and then dispatched in round-robin order.
45 *
46 * It's also used to track the reference counts on blkg's. A qnode always
47 * belongs to a throtl_grp and gets queued on itself or the parent, so
48 * incrementing the reference of the associated throtl_grp when a qnode is
49 * queued and decrementing when dequeued is enough to keep the whole blkg
50 * tree pinned while bios are in flight.
51 */
52struct throtl_qnode {
53 struct list_head node; /* service_queue->queued[] */
54 struct bio_list bios; /* queued bios */
55 struct throtl_grp *tg; /* tg this qnode belongs to */
36}; 56};
37 57
38#define THROTL_RB_ROOT (struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \ 58struct throtl_service_queue {
39 .count = 0, .min_disptime = 0} 59 struct throtl_service_queue *parent_sq; /* the parent service_queue */
60
61 /*
62 * Bios queued directly to this service_queue or dispatched from
63 * children throtl_grp's.
64 */
65 struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */
66 unsigned int nr_queued[2]; /* number of queued bios */
67
68 /*
69 * RB tree of active children throtl_grp's, which are sorted by
70 * their ->disptime.
71 */
72 struct rb_root pending_tree; /* RB tree of active tgs */
73 struct rb_node *first_pending; /* first node in the tree */
74 unsigned int nr_pending; /* # queued in the tree */
75 unsigned long first_pending_disptime; /* disptime of the first tg */
76 struct timer_list pending_timer; /* fires on first_pending_disptime */
77};
78
79enum tg_state_flags {
80 THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
81 THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */
82};
40 83
41#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) 84#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
42 85
@@ -52,9 +95,26 @@ struct throtl_grp {
52 /* must be the first member */ 95 /* must be the first member */
53 struct blkg_policy_data pd; 96 struct blkg_policy_data pd;
54 97
55 /* active throtl group service_tree member */ 98 /* active throtl group service_queue member */
56 struct rb_node rb_node; 99 struct rb_node rb_node;
57 100
101 /* throtl_data this group belongs to */
102 struct throtl_data *td;
103
104 /* this group's service queue */
105 struct throtl_service_queue service_queue;
106
107 /*
108 * qnode_on_self is used when bios are directly queued to this
109 * throtl_grp so that local bios compete fairly with bios
110 * dispatched from children. qnode_on_parent is used when bios are
111 * dispatched from this throtl_grp into its parent and will compete
112 * with the sibling qnode_on_parents and the parent's
113 * qnode_on_self.
114 */
115 struct throtl_qnode qnode_on_self[2];
116 struct throtl_qnode qnode_on_parent[2];
117
58 /* 118 /*
59 * Dispatch time in jiffies. This is the estimated time when group 119 * Dispatch time in jiffies. This is the estimated time when group
60 * will unthrottle and is ready to dispatch more bio. It is used as 120 * will unthrottle and is ready to dispatch more bio. It is used as
@@ -64,11 +124,8 @@ struct throtl_grp {
64 124
65 unsigned int flags; 125 unsigned int flags;
66 126
67 /* Two lists for READ and WRITE */ 127 /* are there any throtl rules between this group and td? */
68 struct bio_list bio_lists[2]; 128 bool has_rules[2];
69
70 /* Number of queued bios on READ and WRITE lists */
71 unsigned int nr_queued[2];
72 129
73 /* bytes per second rate limits */ 130 /* bytes per second rate limits */
74 uint64_t bps[2]; 131 uint64_t bps[2];
@@ -85,9 +142,6 @@ struct throtl_grp {
85 unsigned long slice_start[2]; 142 unsigned long slice_start[2];
86 unsigned long slice_end[2]; 143 unsigned long slice_end[2];
87 144
88 /* Some throttle limits got updated for the group */
89 int limits_changed;
90
91 /* Per cpu stats pointer */ 145 /* Per cpu stats pointer */
92 struct tg_stats_cpu __percpu *stats_cpu; 146 struct tg_stats_cpu __percpu *stats_cpu;
93 147
@@ -98,7 +152,7 @@ struct throtl_grp {
98struct throtl_data 152struct throtl_data
99{ 153{
100 /* service tree for active throtl groups */ 154 /* service tree for active throtl groups */
101 struct throtl_rb_root tg_service_tree; 155 struct throtl_service_queue service_queue;
102 156
103 struct request_queue *queue; 157 struct request_queue *queue;
104 158
@@ -111,9 +165,7 @@ struct throtl_data
111 unsigned int nr_undestroyed_grps; 165 unsigned int nr_undestroyed_grps;
112 166
113 /* Work for dispatching throttled bios */ 167 /* Work for dispatching throttled bios */
114 struct delayed_work throtl_work; 168 struct work_struct dispatch_work;
115
116 int limits_changed;
117}; 169};
118 170
119/* list and work item to allocate percpu group stats */ 171/* list and work item to allocate percpu group stats */
@@ -123,6 +175,8 @@ static LIST_HEAD(tg_stats_alloc_list);
123static void tg_stats_alloc_fn(struct work_struct *); 175static void tg_stats_alloc_fn(struct work_struct *);
124static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn); 176static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);
125 177
178static void throtl_pending_timer_fn(unsigned long arg);
179
126static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) 180static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
127{ 181{
128 return pd ? container_of(pd, struct throtl_grp, pd) : NULL; 182 return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
@@ -143,41 +197,65 @@ static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
143 return blkg_to_tg(td->queue->root_blkg); 197 return blkg_to_tg(td->queue->root_blkg);
144} 198}
145 199
146enum tg_state_flags { 200/**
147 THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */ 201 * sq_to_tg - return the throtl_grp the specified service queue belongs to
148}; 202 * @sq: the throtl_service_queue of interest
149 203 *
150#define THROTL_TG_FNS(name) \ 204 * Return the throtl_grp @sq belongs to. If @sq is the top-level one
151static inline void throtl_mark_tg_##name(struct throtl_grp *tg) \ 205 * embedded in throtl_data, %NULL is returned.
152{ \ 206 */
153 (tg)->flags |= (1 << THROTL_TG_FLAG_##name); \ 207static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq)
154} \ 208{
155static inline void throtl_clear_tg_##name(struct throtl_grp *tg) \ 209 if (sq && sq->parent_sq)
156{ \ 210 return container_of(sq, struct throtl_grp, service_queue);
157 (tg)->flags &= ~(1 << THROTL_TG_FLAG_##name); \ 211 else
158} \ 212 return NULL;
159static inline int throtl_tg_##name(const struct throtl_grp *tg) \
160{ \
161 return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0; \
162} 213}
163 214
164THROTL_TG_FNS(on_rr); 215/**
216 * sq_to_td - return throtl_data the specified service queue belongs to
217 * @sq: the throtl_service_queue of interest
218 *
219 * A service_queue can be embedded in either a throtl_grp or throtl_data.
220 * Determine the associated throtl_data accordingly and return it.
221 */
222static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
223{
224 struct throtl_grp *tg = sq_to_tg(sq);
165 225
166#define throtl_log_tg(td, tg, fmt, args...) do { \ 226 if (tg)
167 char __pbuf[128]; \ 227 return tg->td;
228 else
229 return container_of(sq, struct throtl_data, service_queue);
230}
231
232/**
233 * throtl_log - log debug message via blktrace
234 * @sq: the service_queue being reported
235 * @fmt: printf format string
236 * @args: printf args
237 *
238 * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a
239 * throtl_grp; otherwise, just "throtl".
240 *
241 * TODO: this should be made a function and name formatting should happen
242 * after testing whether blktrace is enabled.
243 */
244#define throtl_log(sq, fmt, args...) do { \
245 struct throtl_grp *__tg = sq_to_tg((sq)); \
246 struct throtl_data *__td = sq_to_td((sq)); \
247 \
248 (void)__td; \
249 if ((__tg)) { \
250 char __pbuf[128]; \
168 \ 251 \
169 blkg_path(tg_to_blkg(tg), __pbuf, sizeof(__pbuf)); \ 252 blkg_path(tg_to_blkg(__tg), __pbuf, sizeof(__pbuf)); \
170 blk_add_trace_msg((td)->queue, "throtl %s " fmt, __pbuf, ##args); \ 253 blk_add_trace_msg(__td->queue, "throtl %s " fmt, __pbuf, ##args); \
254 } else { \
255 blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \
256 } \
171} while (0) 257} while (0)
172 258
173#define throtl_log(td, fmt, args...) \
174 blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
175
176static inline unsigned int total_nr_queued(struct throtl_data *td)
177{
178 return td->nr_queued[0] + td->nr_queued[1];
179}
180
181/* 259/*
182 * Worker for allocating per cpu stat for tgs. This is scheduled on the 260 * Worker for allocating per cpu stat for tgs. This is scheduled on the
183 * system_wq once there are some groups on the alloc_list waiting for 261 * system_wq once there are some groups on the alloc_list waiting for
@@ -215,15 +293,141 @@ alloc_stats:
215 goto alloc_stats; 293 goto alloc_stats;
216} 294}
217 295
296static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
297{
298 INIT_LIST_HEAD(&qn->node);
299 bio_list_init(&qn->bios);
300 qn->tg = tg;
301}
302
303/**
304 * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it
305 * @bio: bio being added
306 * @qn: qnode to add bio to
307 * @queued: the service_queue->queued[] list @qn belongs to
308 *
309 * Add @bio to @qn and put @qn on @queued if it's not already on.
310 * @qn->tg's reference count is bumped when @qn is activated. See the
311 * comment on top of throtl_qnode definition for details.
312 */
313static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
314 struct list_head *queued)
315{
316 bio_list_add(&qn->bios, bio);
317 if (list_empty(&qn->node)) {
318 list_add_tail(&qn->node, queued);
319 blkg_get(tg_to_blkg(qn->tg));
320 }
321}
322
323/**
324 * throtl_peek_queued - peek the first bio on a qnode list
325 * @queued: the qnode list to peek
326 */
327static struct bio *throtl_peek_queued(struct list_head *queued)
328{
329 struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
330 struct bio *bio;
331
332 if (list_empty(queued))
333 return NULL;
334
335 bio = bio_list_peek(&qn->bios);
336 WARN_ON_ONCE(!bio);
337 return bio;
338}
339
340/**
341 * throtl_pop_queued - pop the first bio from a qnode list
342 * @queued: the qnode list to pop a bio from
343 * @tg_to_put: optional out argument for throtl_grp to put
344 *
345 * Pop the first bio from the qnode list @queued. After popping, the first
346 * qnode is removed from @queued if empty or moved to the end of @queued so
347 * that the popping order is round-robin.
348 *
349 * When the first qnode is removed, its associated throtl_grp should be put
350 * too. If @tg_to_put is NULL, this function automatically puts it;
351 * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is
352 * responsible for putting it.
353 */
354static struct bio *throtl_pop_queued(struct list_head *queued,
355 struct throtl_grp **tg_to_put)
356{
357 struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
358 struct bio *bio;
359
360 if (list_empty(queued))
361 return NULL;
362
363 bio = bio_list_pop(&qn->bios);
364 WARN_ON_ONCE(!bio);
365
366 if (bio_list_empty(&qn->bios)) {
367 list_del_init(&qn->node);
368 if (tg_to_put)
369 *tg_to_put = qn->tg;
370 else
371 blkg_put(tg_to_blkg(qn->tg));
372 } else {
373 list_move_tail(&qn->node, queued);
374 }
375
376 return bio;
377}
378
379/* init a service_queue, assumes the caller zeroed it */
380static void throtl_service_queue_init(struct throtl_service_queue *sq,
381 struct throtl_service_queue *parent_sq)
382{
383 INIT_LIST_HEAD(&sq->queued[0]);
384 INIT_LIST_HEAD(&sq->queued[1]);
385 sq->pending_tree = RB_ROOT;
386 sq->parent_sq = parent_sq;
387 setup_timer(&sq->pending_timer, throtl_pending_timer_fn,
388 (unsigned long)sq);
389}
390
391static void throtl_service_queue_exit(struct throtl_service_queue *sq)
392{
393 del_timer_sync(&sq->pending_timer);
394}
395
218static void throtl_pd_init(struct blkcg_gq *blkg) 396static void throtl_pd_init(struct blkcg_gq *blkg)
219{ 397{
220 struct throtl_grp *tg = blkg_to_tg(blkg); 398 struct throtl_grp *tg = blkg_to_tg(blkg);
399 struct throtl_data *td = blkg->q->td;
400 struct throtl_service_queue *parent_sq;
221 unsigned long flags; 401 unsigned long flags;
402 int rw;
403
404 /*
405 * If sane_behavior is enabled, we switch to properly hierarchical
406 * behavior where limits on a given throtl_grp are applied to the
407 * whole subtree rather than just the group itself. e.g. If 16M
408 * read_bps limit is set on the root group, the whole system can't
409 * exceed 16M for the device.
410 *
411 * If sane_behavior is not enabled, the broken flat hierarchy
412 * behavior is retained where all throtl_grps are treated as if
413 * they're all separate root groups right below throtl_data.
414 * Limits of a group don't interact with limits of other groups
415 * regardless of the position of the group in the hierarchy.
416 */
417 parent_sq = &td->service_queue;
418
419 if (cgroup_sane_behavior(blkg->blkcg->css.cgroup) && blkg->parent)
420 parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
421
422 throtl_service_queue_init(&tg->service_queue, parent_sq);
423
424 for (rw = READ; rw <= WRITE; rw++) {
425 throtl_qnode_init(&tg->qnode_on_self[rw], tg);
426 throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
427 }
222 428
223 RB_CLEAR_NODE(&tg->rb_node); 429 RB_CLEAR_NODE(&tg->rb_node);
224 bio_list_init(&tg->bio_lists[0]); 430 tg->td = td;
225 bio_list_init(&tg->bio_lists[1]);
226 tg->limits_changed = false;
227 431
228 tg->bps[READ] = -1; 432 tg->bps[READ] = -1;
229 tg->bps[WRITE] = -1; 433 tg->bps[WRITE] = -1;
@@ -241,6 +445,30 @@ static void throtl_pd_init(struct blkcg_gq *blkg)
241 spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); 445 spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
242} 446}
243 447
448/*
449 * Set has_rules[] if @tg or any of its parents have limits configured.
450 * This doesn't require walking up to the top of the hierarchy as the
451 * parent's has_rules[] is guaranteed to be correct.
452 */
453static void tg_update_has_rules(struct throtl_grp *tg)
454{
455 struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq);
456 int rw;
457
458 for (rw = READ; rw <= WRITE; rw++)
459 tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) ||
460 (tg->bps[rw] != -1 || tg->iops[rw] != -1);
461}
462
463static void throtl_pd_online(struct blkcg_gq *blkg)
464{
465 /*
466 * We don't want new groups to escape the limits of its ancestors.
467 * Update has_rules[] after a new group is brought online.
468 */
469 tg_update_has_rules(blkg_to_tg(blkg));
470}
471
244static void throtl_pd_exit(struct blkcg_gq *blkg) 472static void throtl_pd_exit(struct blkcg_gq *blkg)
245{ 473{
246 struct throtl_grp *tg = blkg_to_tg(blkg); 474 struct throtl_grp *tg = blkg_to_tg(blkg);
@@ -251,6 +479,8 @@ static void throtl_pd_exit(struct blkcg_gq *blkg)
251 spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); 479 spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
252 480
253 free_percpu(tg->stats_cpu); 481 free_percpu(tg->stats_cpu);
482
483 throtl_service_queue_exit(&tg->service_queue);
254} 484}
255 485
256static void throtl_pd_reset_stats(struct blkcg_gq *blkg) 486static void throtl_pd_reset_stats(struct blkcg_gq *blkg)
@@ -309,17 +539,18 @@ static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
309 return tg; 539 return tg;
310} 540}
311 541
312static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root) 542static struct throtl_grp *
543throtl_rb_first(struct throtl_service_queue *parent_sq)
313{ 544{
314 /* Service tree is empty */ 545 /* Service tree is empty */
315 if (!root->count) 546 if (!parent_sq->nr_pending)
316 return NULL; 547 return NULL;
317 548
318 if (!root->left) 549 if (!parent_sq->first_pending)
319 root->left = rb_first(&root->rb); 550 parent_sq->first_pending = rb_first(&parent_sq->pending_tree);
320 551
321 if (root->left) 552 if (parent_sq->first_pending)
322 return rb_entry_tg(root->left); 553 return rb_entry_tg(parent_sq->first_pending);
323 554
324 return NULL; 555 return NULL;
325} 556}
@@ -330,29 +561,30 @@ static void rb_erase_init(struct rb_node *n, struct rb_root *root)
330 RB_CLEAR_NODE(n); 561 RB_CLEAR_NODE(n);
331} 562}
332 563
333static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root) 564static void throtl_rb_erase(struct rb_node *n,
565 struct throtl_service_queue *parent_sq)
334{ 566{
335 if (root->left == n) 567 if (parent_sq->first_pending == n)
336 root->left = NULL; 568 parent_sq->first_pending = NULL;
337 rb_erase_init(n, &root->rb); 569 rb_erase_init(n, &parent_sq->pending_tree);
338 --root->count; 570 --parent_sq->nr_pending;
339} 571}
340 572
341static void update_min_dispatch_time(struct throtl_rb_root *st) 573static void update_min_dispatch_time(struct throtl_service_queue *parent_sq)
342{ 574{
343 struct throtl_grp *tg; 575 struct throtl_grp *tg;
344 576
345 tg = throtl_rb_first(st); 577 tg = throtl_rb_first(parent_sq);
346 if (!tg) 578 if (!tg)
347 return; 579 return;
348 580
349 st->min_disptime = tg->disptime; 581 parent_sq->first_pending_disptime = tg->disptime;
350} 582}
351 583
352static void 584static void tg_service_queue_add(struct throtl_grp *tg)
353tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
354{ 585{
355 struct rb_node **node = &st->rb.rb_node; 586 struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq;
587 struct rb_node **node = &parent_sq->pending_tree.rb_node;
356 struct rb_node *parent = NULL; 588 struct rb_node *parent = NULL;
357 struct throtl_grp *__tg; 589 struct throtl_grp *__tg;
358 unsigned long key = tg->disptime; 590 unsigned long key = tg->disptime;
@@ -371,89 +603,135 @@ tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
371 } 603 }
372 604
373 if (left) 605 if (left)
374 st->left = &tg->rb_node; 606 parent_sq->first_pending = &tg->rb_node;
375 607
376 rb_link_node(&tg->rb_node, parent, node); 608 rb_link_node(&tg->rb_node, parent, node);
377 rb_insert_color(&tg->rb_node, &st->rb); 609 rb_insert_color(&tg->rb_node, &parent_sq->pending_tree);
378} 610}
379 611
380static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg) 612static void __throtl_enqueue_tg(struct throtl_grp *tg)
381{ 613{
382 struct throtl_rb_root *st = &td->tg_service_tree; 614 tg_service_queue_add(tg);
615 tg->flags |= THROTL_TG_PENDING;
616 tg->service_queue.parent_sq->nr_pending++;
617}
383 618
384 tg_service_tree_add(st, tg); 619static void throtl_enqueue_tg(struct throtl_grp *tg)
385 throtl_mark_tg_on_rr(tg); 620{
386 st->count++; 621 if (!(tg->flags & THROTL_TG_PENDING))
622 __throtl_enqueue_tg(tg);
387} 623}
388 624
389static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg) 625static void __throtl_dequeue_tg(struct throtl_grp *tg)
390{ 626{
391 if (!throtl_tg_on_rr(tg)) 627 throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq);
392 __throtl_enqueue_tg(td, tg); 628 tg->flags &= ~THROTL_TG_PENDING;
393} 629}
394 630
395static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg) 631static void throtl_dequeue_tg(struct throtl_grp *tg)
396{ 632{
397 throtl_rb_erase(&tg->rb_node, &td->tg_service_tree); 633 if (tg->flags & THROTL_TG_PENDING)
398 throtl_clear_tg_on_rr(tg); 634 __throtl_dequeue_tg(tg);
399} 635}
400 636
401static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg) 637/* Call with queue lock held */
638static void throtl_schedule_pending_timer(struct throtl_service_queue *sq,
639 unsigned long expires)
402{ 640{
403 if (throtl_tg_on_rr(tg)) 641 mod_timer(&sq->pending_timer, expires);
404 __throtl_dequeue_tg(td, tg); 642 throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu",
643 expires - jiffies, jiffies);
405} 644}
406 645
407static void throtl_schedule_next_dispatch(struct throtl_data *td) 646/**
647 * throtl_schedule_next_dispatch - schedule the next dispatch cycle
648 * @sq: the service_queue to schedule dispatch for
649 * @force: force scheduling
650 *
651 * Arm @sq->pending_timer so that the next dispatch cycle starts on the
652 * dispatch time of the first pending child. Returns %true if either timer
653 * is armed or there's no pending child left. %false if the current
654 * dispatch window is still open and the caller should continue
655 * dispatching.
656 *
657 * If @force is %true, the dispatch timer is always scheduled and this
658 * function is guaranteed to return %true. This is to be used when the
659 * caller can't dispatch itself and needs to invoke pending_timer
660 * unconditionally. Note that forced scheduling is likely to induce short
661 * delay before dispatch starts even if @sq->first_pending_disptime is not
662 * in the future and thus shouldn't be used in hot paths.
663 */
664static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq,
665 bool force)
408{ 666{
409 struct throtl_rb_root *st = &td->tg_service_tree; 667 /* any pending children left? */
668 if (!sq->nr_pending)
669 return true;
410 670
411 /* 671 update_min_dispatch_time(sq);
412 * If there are more bios pending, schedule more work.
413 */
414 if (!total_nr_queued(td))
415 return;
416 672
417 BUG_ON(!st->count); 673 /* is the next dispatch time in the future? */
674 if (force || time_after(sq->first_pending_disptime, jiffies)) {
675 throtl_schedule_pending_timer(sq, sq->first_pending_disptime);
676 return true;
677 }
418 678
419 update_min_dispatch_time(st); 679 /* tell the caller to continue dispatching */
680 return false;
681}
420 682
421 if (time_before_eq(st->min_disptime, jiffies)) 683static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
422 throtl_schedule_delayed_work(td, 0); 684 bool rw, unsigned long start)
423 else 685{
424 throtl_schedule_delayed_work(td, (st->min_disptime - jiffies)); 686 tg->bytes_disp[rw] = 0;
687 tg->io_disp[rw] = 0;
688
689 /*
690 * Previous slice has expired. We must have trimmed it after last
691 * bio dispatch. That means since start of last slice, we never used
692 * that bandwidth. Do try to make use of that bandwidth while giving
693 * credit.
694 */
695 if (time_after_eq(start, tg->slice_start[rw]))
696 tg->slice_start[rw] = start;
697
698 tg->slice_end[rw] = jiffies + throtl_slice;
699 throtl_log(&tg->service_queue,
700 "[%c] new slice with credit start=%lu end=%lu jiffies=%lu",
701 rw == READ ? 'R' : 'W', tg->slice_start[rw],
702 tg->slice_end[rw], jiffies);
425} 703}
426 704
427static inline void 705static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
428throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
429{ 706{
430 tg->bytes_disp[rw] = 0; 707 tg->bytes_disp[rw] = 0;
431 tg->io_disp[rw] = 0; 708 tg->io_disp[rw] = 0;
432 tg->slice_start[rw] = jiffies; 709 tg->slice_start[rw] = jiffies;
433 tg->slice_end[rw] = jiffies + throtl_slice; 710 tg->slice_end[rw] = jiffies + throtl_slice;
434 throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu", 711 throtl_log(&tg->service_queue,
435 rw == READ ? 'R' : 'W', tg->slice_start[rw], 712 "[%c] new slice start=%lu end=%lu jiffies=%lu",
436 tg->slice_end[rw], jiffies); 713 rw == READ ? 'R' : 'W', tg->slice_start[rw],
714 tg->slice_end[rw], jiffies);
437} 715}
438 716
439static inline void throtl_set_slice_end(struct throtl_data *td, 717static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
440 struct throtl_grp *tg, bool rw, unsigned long jiffy_end) 718 unsigned long jiffy_end)
441{ 719{
442 tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); 720 tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
443} 721}
444 722
445static inline void throtl_extend_slice(struct throtl_data *td, 723static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
446 struct throtl_grp *tg, bool rw, unsigned long jiffy_end) 724 unsigned long jiffy_end)
447{ 725{
448 tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); 726 tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
449 throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu", 727 throtl_log(&tg->service_queue,
450 rw == READ ? 'R' : 'W', tg->slice_start[rw], 728 "[%c] extend slice start=%lu end=%lu jiffies=%lu",
451 tg->slice_end[rw], jiffies); 729 rw == READ ? 'R' : 'W', tg->slice_start[rw],
730 tg->slice_end[rw], jiffies);
452} 731}
453 732
454/* Determine if previously allocated or extended slice is complete or not */ 733/* Determine if previously allocated or extended slice is complete or not */
455static bool 734static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
456throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
457{ 735{
458 if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) 736 if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
459 return 0; 737 return 0;
@@ -462,8 +740,7 @@ throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
462} 740}
463 741
464/* Trim the used slices and adjust slice start accordingly */ 742/* Trim the used slices and adjust slice start accordingly */
465static inline void 743static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
466throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
467{ 744{
468 unsigned long nr_slices, time_elapsed, io_trim; 745 unsigned long nr_slices, time_elapsed, io_trim;
469 u64 bytes_trim, tmp; 746 u64 bytes_trim, tmp;
@@ -475,7 +752,7 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
475 * renewed. Don't try to trim the slice if slice is used. A new 752 * renewed. Don't try to trim the slice if slice is used. A new
476 * slice will start when appropriate. 753 * slice will start when appropriate.
477 */ 754 */
478 if (throtl_slice_used(td, tg, rw)) 755 if (throtl_slice_used(tg, rw))
479 return; 756 return;
480 757
481 /* 758 /*
@@ -486,7 +763,7 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
486 * is bad because it does not allow new slice to start. 763 * is bad because it does not allow new slice to start.
487 */ 764 */
488 765
489 throtl_set_slice_end(td, tg, rw, jiffies + throtl_slice); 766 throtl_set_slice_end(tg, rw, jiffies + throtl_slice);
490 767
491 time_elapsed = jiffies - tg->slice_start[rw]; 768 time_elapsed = jiffies - tg->slice_start[rw];
492 769
@@ -515,14 +792,14 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
515 792
516 tg->slice_start[rw] += nr_slices * throtl_slice; 793 tg->slice_start[rw] += nr_slices * throtl_slice;
517 794
518 throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu" 795 throtl_log(&tg->service_queue,
519 " start=%lu end=%lu jiffies=%lu", 796 "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu",
520 rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim, 797 rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
521 tg->slice_start[rw], tg->slice_end[rw], jiffies); 798 tg->slice_start[rw], tg->slice_end[rw], jiffies);
522} 799}
523 800
524static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg, 801static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
525 struct bio *bio, unsigned long *wait) 802 unsigned long *wait)
526{ 803{
527 bool rw = bio_data_dir(bio); 804 bool rw = bio_data_dir(bio);
528 unsigned int io_allowed; 805 unsigned int io_allowed;
@@ -571,8 +848,8 @@ static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
571 return 0; 848 return 0;
572} 849}
573 850
574static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, 851static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
575 struct bio *bio, unsigned long *wait) 852 unsigned long *wait)
576{ 853{
577 bool rw = bio_data_dir(bio); 854 bool rw = bio_data_dir(bio);
578 u64 bytes_allowed, extra_bytes, tmp; 855 u64 bytes_allowed, extra_bytes, tmp;
@@ -613,18 +890,12 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
613 return 0; 890 return 0;
614} 891}
615 892
616static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) {
617 if (tg->bps[rw] == -1 && tg->iops[rw] == -1)
618 return 1;
619 return 0;
620}
621
622/* 893/*
623 * Returns whether one can dispatch a bio or not. Also returns approx number 894 * Returns whether one can dispatch a bio or not. Also returns approx number
624 * of jiffies to wait before this bio is with-in IO rate and can be dispatched 895 * of jiffies to wait before this bio is with-in IO rate and can be dispatched
625 */ 896 */
626static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, 897static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
627 struct bio *bio, unsigned long *wait) 898 unsigned long *wait)
628{ 899{
629 bool rw = bio_data_dir(bio); 900 bool rw = bio_data_dir(bio);
630 unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; 901 unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
@@ -635,7 +906,8 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
635 * this function with a different bio if there are other bios 906 * this function with a different bio if there are other bios
636 * queued. 907 * queued.
637 */ 908 */
638 BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw])); 909 BUG_ON(tg->service_queue.nr_queued[rw] &&
910 bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
639 911
640 /* If tg->bps = -1, then BW is unlimited */ 912 /* If tg->bps = -1, then BW is unlimited */
641 if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { 913 if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
@@ -649,15 +921,15 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
649 * existing slice to make sure it is at least throtl_slice interval 921 * existing slice to make sure it is at least throtl_slice interval
650 * long since now. 922 * long since now.
651 */ 923 */
652 if (throtl_slice_used(td, tg, rw)) 924 if (throtl_slice_used(tg, rw))
653 throtl_start_new_slice(td, tg, rw); 925 throtl_start_new_slice(tg, rw);
654 else { 926 else {
655 if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) 927 if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
656 throtl_extend_slice(td, tg, rw, jiffies + throtl_slice); 928 throtl_extend_slice(tg, rw, jiffies + throtl_slice);
657 } 929 }
658 930
659 if (tg_with_in_bps_limit(td, tg, bio, &bps_wait) 931 if (tg_with_in_bps_limit(tg, bio, &bps_wait) &&
660 && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) { 932 tg_with_in_iops_limit(tg, bio, &iops_wait)) {
661 if (wait) 933 if (wait)
662 *wait = 0; 934 *wait = 0;
663 return 1; 935 return 1;
@@ -669,7 +941,7 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
669 *wait = max_wait; 941 *wait = max_wait;
670 942
671 if (time_before(tg->slice_end[rw], jiffies + max_wait)) 943 if (time_before(tg->slice_end[rw], jiffies + max_wait))
672 throtl_extend_slice(td, tg, rw, jiffies + max_wait); 944 throtl_extend_slice(tg, rw, jiffies + max_wait);
673 945
674 return 0; 946 return 0;
675} 947}
@@ -708,65 +980,136 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
708 tg->bytes_disp[rw] += bio->bi_size; 980 tg->bytes_disp[rw] += bio->bi_size;
709 tg->io_disp[rw]++; 981 tg->io_disp[rw]++;
710 982
711 throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw); 983 /*
984 * REQ_THROTTLED is used to prevent the same bio from being throttled
985 * more than once as a throttled bio will go through blk-throtl the
986 * second time when it eventually gets issued. Set it when a bio
987 * is being charged to a tg.
988 *
989 * Dispatch stats aren't recursive and each @bio should only be
990 * accounted by the @tg it was originally associated with. Let's
991 * update the stats when setting REQ_THROTTLED for the first time
992 * which is guaranteed to be for the @bio's original tg.
993 */
994 if (!(bio->bi_rw & REQ_THROTTLED)) {
995 bio->bi_rw |= REQ_THROTTLED;
996 throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size,
997 bio->bi_rw);
998 }
712} 999}
713 1000
714static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, 1001/**
715 struct bio *bio) 1002 * throtl_add_bio_tg - add a bio to the specified throtl_grp
1003 * @bio: bio to add
1004 * @qn: qnode to use
1005 * @tg: the target throtl_grp
1006 *
1007 * Add @bio to @tg's service_queue using @qn. If @qn is not specified,
1008 * tg->qnode_on_self[] is used.
1009 */
1010static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
1011 struct throtl_grp *tg)
716{ 1012{
1013 struct throtl_service_queue *sq = &tg->service_queue;
717 bool rw = bio_data_dir(bio); 1014 bool rw = bio_data_dir(bio);
718 1015
719 bio_list_add(&tg->bio_lists[rw], bio); 1016 if (!qn)
720 /* Take a bio reference on tg */ 1017 qn = &tg->qnode_on_self[rw];
721 blkg_get(tg_to_blkg(tg)); 1018
722 tg->nr_queued[rw]++; 1019 /*
723 td->nr_queued[rw]++; 1020 * If @tg doesn't currently have any bios queued in the same
724 throtl_enqueue_tg(td, tg); 1021 * direction, queueing @bio can change when @tg should be
1022 * dispatched. Mark that @tg was empty. This is automatically
1023 * cleared on the next tg_update_disptime().
1024 */
1025 if (!sq->nr_queued[rw])
1026 tg->flags |= THROTL_TG_WAS_EMPTY;
1027
1028 throtl_qnode_add_bio(bio, qn, &sq->queued[rw]);
1029
1030 sq->nr_queued[rw]++;
1031 throtl_enqueue_tg(tg);
725} 1032}
726 1033
727static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg) 1034static void tg_update_disptime(struct throtl_grp *tg)
728{ 1035{
1036 struct throtl_service_queue *sq = &tg->service_queue;
729 unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; 1037 unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
730 struct bio *bio; 1038 struct bio *bio;
731 1039
732 if ((bio = bio_list_peek(&tg->bio_lists[READ]))) 1040 if ((bio = throtl_peek_queued(&sq->queued[READ])))
733 tg_may_dispatch(td, tg, bio, &read_wait); 1041 tg_may_dispatch(tg, bio, &read_wait);
734 1042
735 if ((bio = bio_list_peek(&tg->bio_lists[WRITE]))) 1043 if ((bio = throtl_peek_queued(&sq->queued[WRITE])))
736 tg_may_dispatch(td, tg, bio, &write_wait); 1044 tg_may_dispatch(tg, bio, &write_wait);
737 1045
738 min_wait = min(read_wait, write_wait); 1046 min_wait = min(read_wait, write_wait);
739 disptime = jiffies + min_wait; 1047 disptime = jiffies + min_wait;
740 1048
741 /* Update dispatch time */ 1049 /* Update dispatch time */
742 throtl_dequeue_tg(td, tg); 1050 throtl_dequeue_tg(tg);
743 tg->disptime = disptime; 1051 tg->disptime = disptime;
744 throtl_enqueue_tg(td, tg); 1052 throtl_enqueue_tg(tg);
1053
1054 /* see throtl_add_bio_tg() */
1055 tg->flags &= ~THROTL_TG_WAS_EMPTY;
745} 1056}
746 1057
747static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg, 1058static void start_parent_slice_with_credit(struct throtl_grp *child_tg,
748 bool rw, struct bio_list *bl) 1059 struct throtl_grp *parent_tg, bool rw)
749{ 1060{
750 struct bio *bio; 1061 if (throtl_slice_used(parent_tg, rw)) {
1062 throtl_start_new_slice_with_credit(parent_tg, rw,
1063 child_tg->slice_start[rw]);
1064 }
1065
1066}
751 1067
752 bio = bio_list_pop(&tg->bio_lists[rw]); 1068static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
753 tg->nr_queued[rw]--; 1069{
754 /* Drop bio reference on blkg */ 1070 struct throtl_service_queue *sq = &tg->service_queue;
755 blkg_put(tg_to_blkg(tg)); 1071 struct throtl_service_queue *parent_sq = sq->parent_sq;
1072 struct throtl_grp *parent_tg = sq_to_tg(parent_sq);
1073 struct throtl_grp *tg_to_put = NULL;
1074 struct bio *bio;
756 1075
757 BUG_ON(td->nr_queued[rw] <= 0); 1076 /*
758 td->nr_queued[rw]--; 1077 * @bio is being transferred from @tg to @parent_sq. Popping a bio
1078 * from @tg may put its reference and @parent_sq might end up
1079 * getting released prematurely. Remember the tg to put and put it
1080 * after @bio is transferred to @parent_sq.
1081 */
1082 bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put);
1083 sq->nr_queued[rw]--;
759 1084
760 throtl_charge_bio(tg, bio); 1085 throtl_charge_bio(tg, bio);
761 bio_list_add(bl, bio);
762 bio->bi_rw |= REQ_THROTTLED;
763 1086
764 throtl_trim_slice(td, tg, rw); 1087 /*
1088 * If our parent is another tg, we just need to transfer @bio to
1089 * the parent using throtl_add_bio_tg(). If our parent is
1090 * @td->service_queue, @bio is ready to be issued. Put it on its
1091 * bio_lists[] and decrease total number queued. The caller is
1092 * responsible for issuing these bios.
1093 */
1094 if (parent_tg) {
1095 throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg);
1096 start_parent_slice_with_credit(tg, parent_tg, rw);
1097 } else {
1098 throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
1099 &parent_sq->queued[rw]);
1100 BUG_ON(tg->td->nr_queued[rw] <= 0);
1101 tg->td->nr_queued[rw]--;
1102 }
1103
1104 throtl_trim_slice(tg, rw);
1105
1106 if (tg_to_put)
1107 blkg_put(tg_to_blkg(tg_to_put));
765} 1108}
766 1109
767static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg, 1110static int throtl_dispatch_tg(struct throtl_grp *tg)
768 struct bio_list *bl)
769{ 1111{
1112 struct throtl_service_queue *sq = &tg->service_queue;
770 unsigned int nr_reads = 0, nr_writes = 0; 1113 unsigned int nr_reads = 0, nr_writes = 0;
771 unsigned int max_nr_reads = throtl_grp_quantum*3/4; 1114 unsigned int max_nr_reads = throtl_grp_quantum*3/4;
772 unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads; 1115 unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
@@ -774,20 +1117,20 @@ static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
774 1117
775 /* Try to dispatch 75% READS and 25% WRITES */ 1118 /* Try to dispatch 75% READS and 25% WRITES */
776 1119
777 while ((bio = bio_list_peek(&tg->bio_lists[READ])) 1120 while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
778 && tg_may_dispatch(td, tg, bio, NULL)) { 1121 tg_may_dispatch(tg, bio, NULL)) {
779 1122
780 tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl); 1123 tg_dispatch_one_bio(tg, bio_data_dir(bio));
781 nr_reads++; 1124 nr_reads++;
782 1125
783 if (nr_reads >= max_nr_reads) 1126 if (nr_reads >= max_nr_reads)
784 break; 1127 break;
785 } 1128 }
786 1129
787 while ((bio = bio_list_peek(&tg->bio_lists[WRITE])) 1130 while ((bio = throtl_peek_queued(&sq->queued[WRITE])) &&
788 && tg_may_dispatch(td, tg, bio, NULL)) { 1131 tg_may_dispatch(tg, bio, NULL)) {
789 1132
790 tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl); 1133 tg_dispatch_one_bio(tg, bio_data_dir(bio));
791 nr_writes++; 1134 nr_writes++;
792 1135
793 if (nr_writes >= max_nr_writes) 1136 if (nr_writes >= max_nr_writes)
@@ -797,14 +1140,13 @@ static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
797 return nr_reads + nr_writes; 1140 return nr_reads + nr_writes;
798} 1141}
799 1142
800static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl) 1143static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
801{ 1144{
802 unsigned int nr_disp = 0; 1145 unsigned int nr_disp = 0;
803 struct throtl_grp *tg;
804 struct throtl_rb_root *st = &td->tg_service_tree;
805 1146
806 while (1) { 1147 while (1) {
807 tg = throtl_rb_first(st); 1148 struct throtl_grp *tg = throtl_rb_first(parent_sq);
1149 struct throtl_service_queue *sq = &tg->service_queue;
808 1150
809 if (!tg) 1151 if (!tg)
810 break; 1152 break;
@@ -812,14 +1154,12 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
812 if (time_before(jiffies, tg->disptime)) 1154 if (time_before(jiffies, tg->disptime))
813 break; 1155 break;
814 1156
815 throtl_dequeue_tg(td, tg); 1157 throtl_dequeue_tg(tg);
816 1158
817 nr_disp += throtl_dispatch_tg(td, tg, bl); 1159 nr_disp += throtl_dispatch_tg(tg);
818 1160
819 if (tg->nr_queued[0] || tg->nr_queued[1]) { 1161 if (sq->nr_queued[0] || sq->nr_queued[1])
820 tg_update_disptime(td, tg); 1162 tg_update_disptime(tg);
821 throtl_enqueue_tg(td, tg);
822 }
823 1163
824 if (nr_disp >= throtl_quantum) 1164 if (nr_disp >= throtl_quantum)
825 break; 1165 break;
@@ -828,111 +1168,111 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
828 return nr_disp; 1168 return nr_disp;
829} 1169}
830 1170
831static void throtl_process_limit_change(struct throtl_data *td) 1171/**
1172 * throtl_pending_timer_fn - timer function for service_queue->pending_timer
1173 * @arg: the throtl_service_queue being serviced
1174 *
1175 * This timer is armed when a child throtl_grp with active bio's becomes
1176 * pending and queued on the service_queue's pending_tree and expires when
1177 * the first child throtl_grp should be dispatched. This function
1178 * dispatches bio's from the children throtl_grps to the parent
1179 * service_queue.
1180 *
1181 * If the parent's parent is another throtl_grp, dispatching is propagated
1182 * by either arming its pending_timer or repeating dispatch directly. If
1183 * the top-level service_tree is reached, throtl_data->dispatch_work is
1184 * kicked so that the ready bio's are issued.
1185 */
1186static void throtl_pending_timer_fn(unsigned long arg)
832{ 1187{
1188 struct throtl_service_queue *sq = (void *)arg;
1189 struct throtl_grp *tg = sq_to_tg(sq);
1190 struct throtl_data *td = sq_to_td(sq);
833 struct request_queue *q = td->queue; 1191 struct request_queue *q = td->queue;
834 struct blkcg_gq *blkg, *n; 1192 struct throtl_service_queue *parent_sq;
835 1193 bool dispatched;
836 if (!td->limits_changed) 1194 int ret;
837 return;
838
839 xchg(&td->limits_changed, false);
840
841 throtl_log(td, "limits changed");
842
843 list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
844 struct throtl_grp *tg = blkg_to_tg(blkg);
845 1195
846 if (!tg->limits_changed) 1196 spin_lock_irq(q->queue_lock);
847 continue; 1197again:
1198 parent_sq = sq->parent_sq;
1199 dispatched = false;
1200
1201 while (true) {
1202 throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u",
1203 sq->nr_queued[READ] + sq->nr_queued[WRITE],
1204 sq->nr_queued[READ], sq->nr_queued[WRITE]);
1205
1206 ret = throtl_select_dispatch(sq);
1207 if (ret) {
1208 throtl_log(sq, "bios disp=%u", ret);
1209 dispatched = true;
1210 }
848 1211
849 if (!xchg(&tg->limits_changed, false)) 1212 if (throtl_schedule_next_dispatch(sq, false))
850 continue; 1213 break;
851 1214
852 throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu" 1215 /* this dispatch window is still open, relax and repeat */
853 " riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE], 1216 spin_unlock_irq(q->queue_lock);
854 tg->iops[READ], tg->iops[WRITE]); 1217 cpu_relax();
1218 spin_lock_irq(q->queue_lock);
1219 }
855 1220
856 /* 1221 if (!dispatched)
857 * Restart the slices for both READ and WRITES. It 1222 goto out_unlock;
858 * might happen that a group's limit are dropped
859 * suddenly and we don't want to account recently
860 * dispatched IO with new low rate
861 */
862 throtl_start_new_slice(td, tg, 0);
863 throtl_start_new_slice(td, tg, 1);
864 1223
865 if (throtl_tg_on_rr(tg)) 1224 if (parent_sq) {
866 tg_update_disptime(td, tg); 1225 /* @parent_sq is another throtl_grp, propagate dispatch */
1226 if (tg->flags & THROTL_TG_WAS_EMPTY) {
1227 tg_update_disptime(tg);
1228 if (!throtl_schedule_next_dispatch(parent_sq, false)) {
1229 /* window is already open, repeat dispatching */
1230 sq = parent_sq;
1231 tg = sq_to_tg(sq);
1232 goto again;
1233 }
1234 }
1235 } else {
1236 /* reached the top-level, queue issuing */
1237 queue_work(kthrotld_workqueue, &td->dispatch_work);
867 } 1238 }
1239out_unlock:
1240 spin_unlock_irq(q->queue_lock);
868} 1241}
869 1242
870/* Dispatch throttled bios. Should be called without queue lock held. */ 1243/**
871static int throtl_dispatch(struct request_queue *q) 1244 * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work
1245 * @work: work item being executed
1246 *
1247 * This function is queued for execution when bio's reach the bio_lists[]
1248 * of throtl_data->service_queue. Those bio's are ready and issued by this
1249 * function.
1250 */
1251void blk_throtl_dispatch_work_fn(struct work_struct *work)
872{ 1252{
873 struct throtl_data *td = q->td; 1253 struct throtl_data *td = container_of(work, struct throtl_data,
874 unsigned int nr_disp = 0; 1254 dispatch_work);
1255 struct throtl_service_queue *td_sq = &td->service_queue;
1256 struct request_queue *q = td->queue;
875 struct bio_list bio_list_on_stack; 1257 struct bio_list bio_list_on_stack;
876 struct bio *bio; 1258 struct bio *bio;
877 struct blk_plug plug; 1259 struct blk_plug plug;
878 1260 int rw;
879 spin_lock_irq(q->queue_lock);
880
881 throtl_process_limit_change(td);
882
883 if (!total_nr_queued(td))
884 goto out;
885 1261
886 bio_list_init(&bio_list_on_stack); 1262 bio_list_init(&bio_list_on_stack);
887 1263
888 throtl_log(td, "dispatch nr_queued=%u read=%u write=%u", 1264 spin_lock_irq(q->queue_lock);
889 total_nr_queued(td), td->nr_queued[READ], 1265 for (rw = READ; rw <= WRITE; rw++)
890 td->nr_queued[WRITE]); 1266 while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
891 1267 bio_list_add(&bio_list_on_stack, bio);
892 nr_disp = throtl_select_dispatch(td, &bio_list_on_stack);
893
894 if (nr_disp)
895 throtl_log(td, "bios disp=%u", nr_disp);
896
897 throtl_schedule_next_dispatch(td);
898out:
899 spin_unlock_irq(q->queue_lock); 1268 spin_unlock_irq(q->queue_lock);
900 1269
901 /* 1270 if (!bio_list_empty(&bio_list_on_stack)) {
902 * If we dispatched some requests, unplug the queue to make sure
903 * immediate dispatch
904 */
905 if (nr_disp) {
906 blk_start_plug(&plug); 1271 blk_start_plug(&plug);
907 while((bio = bio_list_pop(&bio_list_on_stack))) 1272 while((bio = bio_list_pop(&bio_list_on_stack)))
908 generic_make_request(bio); 1273 generic_make_request(bio);
909 blk_finish_plug(&plug); 1274 blk_finish_plug(&plug);
910 } 1275 }
911 return nr_disp;
912}
913
914void blk_throtl_work(struct work_struct *work)
915{
916 struct throtl_data *td = container_of(work, struct throtl_data,
917 throtl_work.work);
918 struct request_queue *q = td->queue;
919
920 throtl_dispatch(q);
921}
922
923/* Call with queue lock held */
924static void
925throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
926{
927
928 struct delayed_work *dwork = &td->throtl_work;
929
930 /* schedule work if limits changed even if no bio is queued */
931 if (total_nr_queued(td) || td->limits_changed) {
932 mod_delayed_work(kthrotld_workqueue, dwork, delay);
933 throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
934 delay, jiffies);
935 }
936} 1276}
937 1277
938static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, 1278static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
@@ -1007,7 +1347,9 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
1007 struct blkcg *blkcg = cgroup_to_blkcg(cgrp); 1347 struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
1008 struct blkg_conf_ctx ctx; 1348 struct blkg_conf_ctx ctx;
1009 struct throtl_grp *tg; 1349 struct throtl_grp *tg;
1010 struct throtl_data *td; 1350 struct throtl_service_queue *sq;
1351 struct blkcg_gq *blkg;
1352 struct cgroup *pos_cgrp;
1011 int ret; 1353 int ret;
1012 1354
1013 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); 1355 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
@@ -1015,7 +1357,7 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
1015 return ret; 1357 return ret;
1016 1358
1017 tg = blkg_to_tg(ctx.blkg); 1359 tg = blkg_to_tg(ctx.blkg);
1018 td = ctx.blkg->q->td; 1360 sq = &tg->service_queue;
1019 1361
1020 if (!ctx.v) 1362 if (!ctx.v)
1021 ctx.v = -1; 1363 ctx.v = -1;
@@ -1025,10 +1367,37 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
1025 else 1367 else
1026 *(unsigned int *)((void *)tg + cft->private) = ctx.v; 1368 *(unsigned int *)((void *)tg + cft->private) = ctx.v;
1027 1369
1028 /* XXX: we don't need the following deferred processing */ 1370 throtl_log(&tg->service_queue,
1029 xchg(&tg->limits_changed, true); 1371 "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
1030 xchg(&td->limits_changed, true); 1372 tg->bps[READ], tg->bps[WRITE],
1031 throtl_schedule_delayed_work(td, 0); 1373 tg->iops[READ], tg->iops[WRITE]);
1374
1375 /*
1376 * Update has_rules[] flags for the updated tg's subtree. A tg is
1377 * considered to have rules if either the tg itself or any of its
1378 * ancestors has rules. This identifies groups without any
1379 * restrictions in the whole hierarchy and allows them to bypass
1380 * blk-throttle.
1381 */
1382 tg_update_has_rules(tg);
1383 blkg_for_each_descendant_pre(blkg, pos_cgrp, ctx.blkg)
1384 tg_update_has_rules(blkg_to_tg(blkg));
1385
1386 /*
1387 * We're already holding queue_lock and know @tg is valid. Let's
1388 * apply the new config directly.
1389 *
1390 * Restart the slices for both READ and WRITE. It might happen
1391 * that a group's limits are dropped suddenly and we don't want to
1392 * account recently dispatched IO with new low rate.
1393 */
1394 throtl_start_new_slice(tg, 0);
1395 throtl_start_new_slice(tg, 1);
1396
1397 if (tg->flags & THROTL_TG_PENDING) {
1398 tg_update_disptime(tg);
1399 throtl_schedule_next_dispatch(sq->parent_sq, true);
1400 }
1032 1401
1033 blkg_conf_finish(&ctx); 1402 blkg_conf_finish(&ctx);
1034 return 0; 1403 return 0;
@@ -1092,7 +1461,7 @@ static void throtl_shutdown_wq(struct request_queue *q)
1092{ 1461{
1093 struct throtl_data *td = q->td; 1462 struct throtl_data *td = q->td;
1094 1463
1095 cancel_delayed_work_sync(&td->throtl_work); 1464 cancel_work_sync(&td->dispatch_work);
1096} 1465}
1097 1466
1098static struct blkcg_policy blkcg_policy_throtl = { 1467static struct blkcg_policy blkcg_policy_throtl = {
@@ -1100,6 +1469,7 @@ static struct blkcg_policy blkcg_policy_throtl = {
1100 .cftypes = throtl_files, 1469 .cftypes = throtl_files,
1101 1470
1102 .pd_init_fn = throtl_pd_init, 1471 .pd_init_fn = throtl_pd_init,
1472 .pd_online_fn = throtl_pd_online,
1103 .pd_exit_fn = throtl_pd_exit, 1473 .pd_exit_fn = throtl_pd_exit,
1104 .pd_reset_stats_fn = throtl_pd_reset_stats, 1474 .pd_reset_stats_fn = throtl_pd_reset_stats,
1105}; 1475};
@@ -1107,15 +1477,16 @@ static struct blkcg_policy blkcg_policy_throtl = {
1107bool blk_throtl_bio(struct request_queue *q, struct bio *bio) 1477bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
1108{ 1478{
1109 struct throtl_data *td = q->td; 1479 struct throtl_data *td = q->td;
1480 struct throtl_qnode *qn = NULL;
1110 struct throtl_grp *tg; 1481 struct throtl_grp *tg;
1111 bool rw = bio_data_dir(bio), update_disptime = true; 1482 struct throtl_service_queue *sq;
1483 bool rw = bio_data_dir(bio);
1112 struct blkcg *blkcg; 1484 struct blkcg *blkcg;
1113 bool throttled = false; 1485 bool throttled = false;
1114 1486
1115 if (bio->bi_rw & REQ_THROTTLED) { 1487 /* see throtl_charge_bio() */
1116 bio->bi_rw &= ~REQ_THROTTLED; 1488 if (bio->bi_rw & REQ_THROTTLED)
1117 goto out; 1489 goto out;
1118 }
1119 1490
1120 /* 1491 /*
1121 * A throtl_grp pointer retrieved under rcu can be used to access 1492 * A throtl_grp pointer retrieved under rcu can be used to access
@@ -1126,7 +1497,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
1126 blkcg = bio_blkcg(bio); 1497 blkcg = bio_blkcg(bio);
1127 tg = throtl_lookup_tg(td, blkcg); 1498 tg = throtl_lookup_tg(td, blkcg);
1128 if (tg) { 1499 if (tg) {
1129 if (tg_no_rule_group(tg, rw)) { 1500 if (!tg->has_rules[rw]) {
1130 throtl_update_dispatch_stats(tg_to_blkg(tg), 1501 throtl_update_dispatch_stats(tg_to_blkg(tg),
1131 bio->bi_size, bio->bi_rw); 1502 bio->bi_size, bio->bi_rw);
1132 goto out_unlock_rcu; 1503 goto out_unlock_rcu;
@@ -1142,18 +1513,18 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
1142 if (unlikely(!tg)) 1513 if (unlikely(!tg))
1143 goto out_unlock; 1514 goto out_unlock;
1144 1515
1145 if (tg->nr_queued[rw]) { 1516 sq = &tg->service_queue;
1146 /*
1147 * There is already another bio queued in same dir. No
1148 * need to update dispatch time.
1149 */
1150 update_disptime = false;
1151 goto queue_bio;
1152 1517
1153 } 1518 while (true) {
1519 /* throtl is FIFO - if bios are already queued, should queue */
1520 if (sq->nr_queued[rw])
1521 break;
1522
1523 /* if above limits, break to queue */
1524 if (!tg_may_dispatch(tg, bio, NULL))
1525 break;
1154 1526
1155 /* Bio is with-in rate limit of group */ 1527 /* within limits, let's charge and dispatch directly */
1156 if (tg_may_dispatch(td, tg, bio, NULL)) {
1157 throtl_charge_bio(tg, bio); 1528 throtl_charge_bio(tg, bio);
1158 1529
1159 /* 1530 /*
@@ -1167,25 +1538,41 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
1167 * 1538 *
1168 * So keep on trimming slice even if bio is not queued. 1539 * So keep on trimming slice even if bio is not queued.
1169 */ 1540 */
1170 throtl_trim_slice(td, tg, rw); 1541 throtl_trim_slice(tg, rw);
1171 goto out_unlock; 1542
1543 /*
1544 * @bio passed through this layer without being throttled.
1545 * Climb up the ladder. If we're already at the top, it
1546 * can be executed directly.
1547 */
1548 qn = &tg->qnode_on_parent[rw];
1549 sq = sq->parent_sq;
1550 tg = sq_to_tg(sq);
1551 if (!tg)
1552 goto out_unlock;
1172 } 1553 }
1173 1554
1174queue_bio: 1555 /* out-of-limit, queue to @tg */
1175 throtl_log_tg(td, tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu" 1556 throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d",
1176 " iodisp=%u iops=%u queued=%d/%d", 1557 rw == READ ? 'R' : 'W',
1177 rw == READ ? 'R' : 'W', 1558 tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
1178 tg->bytes_disp[rw], bio->bi_size, tg->bps[rw], 1559 tg->io_disp[rw], tg->iops[rw],
1179 tg->io_disp[rw], tg->iops[rw], 1560 sq->nr_queued[READ], sq->nr_queued[WRITE]);
1180 tg->nr_queued[READ], tg->nr_queued[WRITE]);
1181 1561
1182 bio_associate_current(bio); 1562 bio_associate_current(bio);
1183 throtl_add_bio_tg(q->td, tg, bio); 1563 tg->td->nr_queued[rw]++;
1564 throtl_add_bio_tg(bio, qn, tg);
1184 throttled = true; 1565 throttled = true;
1185 1566
1186 if (update_disptime) { 1567 /*
1187 tg_update_disptime(td, tg); 1568 * Update @tg's dispatch time and force schedule dispatch if @tg
1188 throtl_schedule_next_dispatch(td); 1569 * was empty before @bio. The forced scheduling isn't likely to
1570 * cause undue delay as @bio is likely to be dispatched directly if
1571 * its @tg's disptime is not in the future.
1572 */
1573 if (tg->flags & THROTL_TG_WAS_EMPTY) {
1574 tg_update_disptime(tg);
1575 throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true);
1189 } 1576 }
1190 1577
1191out_unlock: 1578out_unlock:
@@ -1193,9 +1580,38 @@ out_unlock:
1193out_unlock_rcu: 1580out_unlock_rcu:
1194 rcu_read_unlock(); 1581 rcu_read_unlock();
1195out: 1582out:
1583 /*
1584 * As multiple blk-throtls may stack in the same issue path, we
1585 * don't want bios to leave with the flag set. Clear the flag if
1586 * being issued.
1587 */
1588 if (!throttled)
1589 bio->bi_rw &= ~REQ_THROTTLED;
1196 return throttled; 1590 return throttled;
1197} 1591}
1198 1592
1593/*
1594 * Dispatch all bios from all children tg's queued on @parent_sq. On
1595 * return, @parent_sq is guaranteed to not have any active children tg's
1596 * and all bios from previously active tg's are on @parent_sq->bio_lists[].
1597 */
1598static void tg_drain_bios(struct throtl_service_queue *parent_sq)
1599{
1600 struct throtl_grp *tg;
1601
1602 while ((tg = throtl_rb_first(parent_sq))) {
1603 struct throtl_service_queue *sq = &tg->service_queue;
1604 struct bio *bio;
1605
1606 throtl_dequeue_tg(tg);
1607
1608 while ((bio = throtl_peek_queued(&sq->queued[READ])))
1609 tg_dispatch_one_bio(tg, bio_data_dir(bio));
1610 while ((bio = throtl_peek_queued(&sq->queued[WRITE])))
1611 tg_dispatch_one_bio(tg, bio_data_dir(bio));
1612 }
1613}
1614
1199/** 1615/**
1200 * blk_throtl_drain - drain throttled bios 1616 * blk_throtl_drain - drain throttled bios
1201 * @q: request_queue to drain throttled bios for 1617 * @q: request_queue to drain throttled bios for
@@ -1206,27 +1622,36 @@ void blk_throtl_drain(struct request_queue *q)
1206 __releases(q->queue_lock) __acquires(q->queue_lock) 1622 __releases(q->queue_lock) __acquires(q->queue_lock)
1207{ 1623{
1208 struct throtl_data *td = q->td; 1624 struct throtl_data *td = q->td;
1209 struct throtl_rb_root *st = &td->tg_service_tree; 1625 struct blkcg_gq *blkg;
1210 struct throtl_grp *tg; 1626 struct cgroup *pos_cgrp;
1211 struct bio_list bl;
1212 struct bio *bio; 1627 struct bio *bio;
1628 int rw;
1213 1629
1214 queue_lockdep_assert_held(q); 1630 queue_lockdep_assert_held(q);
1631 rcu_read_lock();
1632
1633 /*
1634 * Drain each tg while doing post-order walk on the blkg tree, so
1635 * that all bios are propagated to td->service_queue. It'd be
1636 * better to walk service_queue tree directly but blkg walk is
1637 * easier.
1638 */
1639 blkg_for_each_descendant_post(blkg, pos_cgrp, td->queue->root_blkg)
1640 tg_drain_bios(&blkg_to_tg(blkg)->service_queue);
1215 1641
1216 bio_list_init(&bl); 1642 tg_drain_bios(&td_root_tg(td)->service_queue);
1217 1643
1218 while ((tg = throtl_rb_first(st))) { 1644 /* finally, transfer bios from top-level tg's into the td */
1219 throtl_dequeue_tg(td, tg); 1645 tg_drain_bios(&td->service_queue);
1220 1646
1221 while ((bio = bio_list_peek(&tg->bio_lists[READ]))) 1647 rcu_read_unlock();
1222 tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
1223 while ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
1224 tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
1225 }
1226 spin_unlock_irq(q->queue_lock); 1648 spin_unlock_irq(q->queue_lock);
1227 1649
1228 while ((bio = bio_list_pop(&bl))) 1650 /* all bios now should be in td->service_queue, issue them */
1229 generic_make_request(bio); 1651 for (rw = READ; rw <= WRITE; rw++)
1652 while ((bio = throtl_pop_queued(&td->service_queue.queued[rw],
1653 NULL)))
1654 generic_make_request(bio);
1230 1655
1231 spin_lock_irq(q->queue_lock); 1656 spin_lock_irq(q->queue_lock);
1232} 1657}
@@ -1240,9 +1665,8 @@ int blk_throtl_init(struct request_queue *q)
1240 if (!td) 1665 if (!td)
1241 return -ENOMEM; 1666 return -ENOMEM;
1242 1667
1243 td->tg_service_tree = THROTL_RB_ROOT; 1668 INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
1244 td->limits_changed = false; 1669 throtl_service_queue_init(&td->service_queue, NULL);
1245 INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
1246 1670
1247 q->td = td; 1671 q->td = td;
1248 td->queue = q; 1672 td->queue = q;
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index b81ddfea1da0..e07a5fd58ad7 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -532,11 +532,11 @@ config BLK_DEV_RBD
532 If unsure, say N. 532 If unsure, say N.
533 533
534config BLK_DEV_RSXX 534config BLK_DEV_RSXX
535 tristate "IBM FlashSystem 70/80 PCIe SSD Device Driver" 535 tristate "IBM Flash Adapter 900GB Full Height PCIe Device Driver"
536 depends on PCI 536 depends on PCI
537 help 537 help
538 Device driver for IBM's high speed PCIe SSD 538 Device driver for IBM's high speed PCIe SSD
539 storage devices: FlashSystem-70 and FlashSystem-80. 539 storage device: Flash Adapter 900GB Full Height.
540 540
541 To compile this driver as a module, choose M here: the 541 To compile this driver as a module, choose M here: the
542 module will be called rsxx. 542 module will be called rsxx.
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 6608076dc39e..28c73ca320a8 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -659,6 +659,27 @@ void drbd_al_shrink(struct drbd_conf *mdev)
659 wake_up(&mdev->al_wait); 659 wake_up(&mdev->al_wait);
660} 660}
661 661
662int drbd_initialize_al(struct drbd_conf *mdev, void *buffer)
663{
664 struct al_transaction_on_disk *al = buffer;
665 struct drbd_md *md = &mdev->ldev->md;
666 sector_t al_base = md->md_offset + md->al_offset;
667 int al_size_4k = md->al_stripes * md->al_stripe_size_4k;
668 int i;
669
670 memset(al, 0, 4096);
671 al->magic = cpu_to_be32(DRBD_AL_MAGIC);
672 al->transaction_type = cpu_to_be16(AL_TR_INITIALIZED);
673 al->crc32c = cpu_to_be32(crc32c(0, al, 4096));
674
675 for (i = 0; i < al_size_4k; i++) {
676 int err = drbd_md_sync_page_io(mdev, mdev->ldev, al_base + i * 8, WRITE);
677 if (err)
678 return err;
679 }
680 return 0;
681}
682
662static int w_update_odbm(struct drbd_work *w, int unused) 683static int w_update_odbm(struct drbd_work *w, int unused)
663{ 684{
664 struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w); 685 struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w);
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index f943aacfdad8..2d7f608d181c 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -832,6 +832,7 @@ struct drbd_tconn { /* is a resource from the config file */
832 unsigned susp_nod:1; /* IO suspended because no data */ 832 unsigned susp_nod:1; /* IO suspended because no data */
833 unsigned susp_fen:1; /* IO suspended because fence peer handler runs */ 833 unsigned susp_fen:1; /* IO suspended because fence peer handler runs */
834 struct mutex cstate_mutex; /* Protects graceful disconnects */ 834 struct mutex cstate_mutex; /* Protects graceful disconnects */
835 unsigned int connect_cnt; /* Inc each time a connection is established */
835 836
836 unsigned long flags; 837 unsigned long flags;
837 struct net_conf *net_conf; /* content protected by rcu */ 838 struct net_conf *net_conf; /* content protected by rcu */
@@ -1132,6 +1133,7 @@ extern void drbd_mdev_cleanup(struct drbd_conf *mdev);
1132void drbd_print_uuids(struct drbd_conf *mdev, const char *text); 1133void drbd_print_uuids(struct drbd_conf *mdev, const char *text);
1133 1134
1134extern void conn_md_sync(struct drbd_tconn *tconn); 1135extern void conn_md_sync(struct drbd_tconn *tconn);
1136extern void drbd_md_write(struct drbd_conf *mdev, void *buffer);
1135extern void drbd_md_sync(struct drbd_conf *mdev); 1137extern void drbd_md_sync(struct drbd_conf *mdev);
1136extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev); 1138extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev);
1137extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); 1139extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
@@ -1466,8 +1468,16 @@ extern void drbd_suspend_io(struct drbd_conf *mdev);
1466extern void drbd_resume_io(struct drbd_conf *mdev); 1468extern void drbd_resume_io(struct drbd_conf *mdev);
1467extern char *ppsize(char *buf, unsigned long long size); 1469extern char *ppsize(char *buf, unsigned long long size);
1468extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, sector_t, int); 1470extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, sector_t, int);
1469enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 }; 1471enum determine_dev_size {
1470extern enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local); 1472 DS_ERROR_SHRINK = -3,
1473 DS_ERROR_SPACE_MD = -2,
1474 DS_ERROR = -1,
1475 DS_UNCHANGED = 0,
1476 DS_SHRUNK = 1,
1477 DS_GREW = 2
1478};
1479extern enum determine_dev_size
1480drbd_determine_dev_size(struct drbd_conf *, enum dds_flags, struct resize_parms *) __must_hold(local);
1471extern void resync_after_online_grow(struct drbd_conf *); 1481extern void resync_after_online_grow(struct drbd_conf *);
1472extern void drbd_reconsider_max_bio_size(struct drbd_conf *mdev); 1482extern void drbd_reconsider_max_bio_size(struct drbd_conf *mdev);
1473extern enum drbd_state_rv drbd_set_role(struct drbd_conf *mdev, 1483extern enum drbd_state_rv drbd_set_role(struct drbd_conf *mdev,
@@ -1633,6 +1643,7 @@ extern int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector,
1633#define drbd_set_out_of_sync(mdev, sector, size) \ 1643#define drbd_set_out_of_sync(mdev, sector, size) \
1634 __drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__) 1644 __drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__)
1635extern void drbd_al_shrink(struct drbd_conf *mdev); 1645extern void drbd_al_shrink(struct drbd_conf *mdev);
1646extern int drbd_initialize_al(struct drbd_conf *, void *);
1636 1647
1637/* drbd_nl.c */ 1648/* drbd_nl.c */
1638/* state info broadcast */ 1649/* state info broadcast */
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index a5dca6affcbb..55635edf563b 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2762,8 +2762,6 @@ int __init drbd_init(void)
2762 /* 2762 /*
2763 * allocate all necessary structs 2763 * allocate all necessary structs
2764 */ 2764 */
2765 err = -ENOMEM;
2766
2767 init_waitqueue_head(&drbd_pp_wait); 2765 init_waitqueue_head(&drbd_pp_wait);
2768 2766
2769 drbd_proc = NULL; /* play safe for drbd_cleanup */ 2767 drbd_proc = NULL; /* play safe for drbd_cleanup */
@@ -2773,6 +2771,7 @@ int __init drbd_init(void)
2773 if (err) 2771 if (err)
2774 goto fail; 2772 goto fail;
2775 2773
2774 err = -ENOMEM;
2776 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL); 2775 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
2777 if (!drbd_proc) { 2776 if (!drbd_proc) {
2778 printk(KERN_ERR "drbd: unable to register proc file\n"); 2777 printk(KERN_ERR "drbd: unable to register proc file\n");
@@ -2803,7 +2802,6 @@ int __init drbd_init(void)
2803fail: 2802fail:
2804 drbd_cleanup(); 2803 drbd_cleanup();
2805 if (err == -ENOMEM) 2804 if (err == -ENOMEM)
2806 /* currently always the case */
2807 printk(KERN_ERR "drbd: ran out of memory\n"); 2805 printk(KERN_ERR "drbd: ran out of memory\n");
2808 else 2806 else
2809 printk(KERN_ERR "drbd: initialization failure\n"); 2807 printk(KERN_ERR "drbd: initialization failure\n");
@@ -2881,34 +2879,14 @@ struct meta_data_on_disk {
2881 u8 reserved_u8[4096 - (7*8 + 10*4)]; 2879 u8 reserved_u8[4096 - (7*8 + 10*4)];
2882} __packed; 2880} __packed;
2883 2881
2884/** 2882
2885 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set 2883
2886 * @mdev: DRBD device. 2884void drbd_md_write(struct drbd_conf *mdev, void *b)
2887 */
2888void drbd_md_sync(struct drbd_conf *mdev)
2889{ 2885{
2890 struct meta_data_on_disk *buffer; 2886 struct meta_data_on_disk *buffer = b;
2891 sector_t sector; 2887 sector_t sector;
2892 int i; 2888 int i;
2893 2889
2894 /* Don't accidentally change the DRBD meta data layout. */
2895 BUILD_BUG_ON(UI_SIZE != 4);
2896 BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096);
2897
2898 del_timer(&mdev->md_sync_timer);
2899 /* timer may be rearmed by drbd_md_mark_dirty() now. */
2900 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
2901 return;
2902
2903 /* We use here D_FAILED and not D_ATTACHING because we try to write
2904 * metadata even if we detach due to a disk failure! */
2905 if (!get_ldev_if_state(mdev, D_FAILED))
2906 return;
2907
2908 buffer = drbd_md_get_buffer(mdev);
2909 if (!buffer)
2910 goto out;
2911
2912 memset(buffer, 0, sizeof(*buffer)); 2890 memset(buffer, 0, sizeof(*buffer));
2913 2891
2914 buffer->la_size_sect = cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); 2892 buffer->la_size_sect = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
@@ -2937,6 +2915,35 @@ void drbd_md_sync(struct drbd_conf *mdev)
2937 dev_err(DEV, "meta data update failed!\n"); 2915 dev_err(DEV, "meta data update failed!\n");
2938 drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); 2916 drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
2939 } 2917 }
2918}
2919
2920/**
2921 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
2922 * @mdev: DRBD device.
2923 */
2924void drbd_md_sync(struct drbd_conf *mdev)
2925{
2926 struct meta_data_on_disk *buffer;
2927
2928 /* Don't accidentally change the DRBD meta data layout. */
2929 BUILD_BUG_ON(UI_SIZE != 4);
2930 BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096);
2931
2932 del_timer(&mdev->md_sync_timer);
2933 /* timer may be rearmed by drbd_md_mark_dirty() now. */
2934 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
2935 return;
2936
2937 /* We use here D_FAILED and not D_ATTACHING because we try to write
2938 * metadata even if we detach due to a disk failure! */
2939 if (!get_ldev_if_state(mdev, D_FAILED))
2940 return;
2941
2942 buffer = drbd_md_get_buffer(mdev);
2943 if (!buffer)
2944 goto out;
2945
2946 drbd_md_write(mdev, buffer);
2940 2947
2941 /* Update mdev->ldev->md.la_size_sect, 2948 /* Update mdev->ldev->md.la_size_sect,
2942 * since we updated it on metadata. */ 2949 * since we updated it on metadata. */
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 9e3f441e7e84..8cc1e640f485 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -417,6 +417,7 @@ static enum drbd_fencing_p highest_fencing_policy(struct drbd_tconn *tconn)
417 417
418bool conn_try_outdate_peer(struct drbd_tconn *tconn) 418bool conn_try_outdate_peer(struct drbd_tconn *tconn)
419{ 419{
420 unsigned int connect_cnt;
420 union drbd_state mask = { }; 421 union drbd_state mask = { };
421 union drbd_state val = { }; 422 union drbd_state val = { };
422 enum drbd_fencing_p fp; 423 enum drbd_fencing_p fp;
@@ -428,6 +429,10 @@ bool conn_try_outdate_peer(struct drbd_tconn *tconn)
428 return false; 429 return false;
429 } 430 }
430 431
432 spin_lock_irq(&tconn->req_lock);
433 connect_cnt = tconn->connect_cnt;
434 spin_unlock_irq(&tconn->req_lock);
435
431 fp = highest_fencing_policy(tconn); 436 fp = highest_fencing_policy(tconn);
432 switch (fp) { 437 switch (fp) {
433 case FP_NOT_AVAIL: 438 case FP_NOT_AVAIL:
@@ -492,8 +497,14 @@ bool conn_try_outdate_peer(struct drbd_tconn *tconn)
492 here, because we might have been able to re-establish the connection in the 497 here, because we might have been able to re-establish the connection in the
493 meantime. */ 498 meantime. */
494 spin_lock_irq(&tconn->req_lock); 499 spin_lock_irq(&tconn->req_lock);
495 if (tconn->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &tconn->flags)) 500 if (tconn->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &tconn->flags)) {
496 _conn_request_state(tconn, mask, val, CS_VERBOSE); 501 if (tconn->connect_cnt != connect_cnt)
502 /* In case the connection was established and dropped
503 while the fence-peer handler was running, ignore it */
504 conn_info(tconn, "Ignoring fence-peer exit code\n");
505 else
506 _conn_request_state(tconn, mask, val, CS_VERBOSE);
507 }
497 spin_unlock_irq(&tconn->req_lock); 508 spin_unlock_irq(&tconn->req_lock);
498 509
499 return conn_highest_pdsk(tconn) <= D_OUTDATED; 510 return conn_highest_pdsk(tconn) <= D_OUTDATED;
@@ -816,15 +827,20 @@ void drbd_resume_io(struct drbd_conf *mdev)
816 * Returns 0 on success, negative return values indicate errors. 827 * Returns 0 on success, negative return values indicate errors.
817 * You should call drbd_md_sync() after calling this function. 828 * You should call drbd_md_sync() after calling this function.
818 */ 829 */
819enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local) 830enum determine_dev_size
831drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags, struct resize_parms *rs) __must_hold(local)
820{ 832{
821 sector_t prev_first_sect, prev_size; /* previous meta location */ 833 sector_t prev_first_sect, prev_size; /* previous meta location */
822 sector_t la_size_sect, u_size; 834 sector_t la_size_sect, u_size;
835 struct drbd_md *md = &mdev->ldev->md;
836 u32 prev_al_stripe_size_4k;
837 u32 prev_al_stripes;
823 sector_t size; 838 sector_t size;
824 char ppb[10]; 839 char ppb[10];
840 void *buffer;
825 841
826 int md_moved, la_size_changed; 842 int md_moved, la_size_changed;
827 enum determine_dev_size rv = unchanged; 843 enum determine_dev_size rv = DS_UNCHANGED;
828 844
829 /* race: 845 /* race:
830 * application request passes inc_ap_bio, 846 * application request passes inc_ap_bio,
@@ -836,6 +852,11 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
836 * still lock the act_log to not trigger ASSERTs there. 852 * still lock the act_log to not trigger ASSERTs there.
837 */ 853 */
838 drbd_suspend_io(mdev); 854 drbd_suspend_io(mdev);
855 buffer = drbd_md_get_buffer(mdev); /* Lock meta-data IO */
856 if (!buffer) {
857 drbd_resume_io(mdev);
858 return DS_ERROR;
859 }
839 860
840 /* no wait necessary anymore, actually we could assert that */ 861 /* no wait necessary anymore, actually we could assert that */
841 wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); 862 wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
@@ -844,7 +865,17 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
844 prev_size = mdev->ldev->md.md_size_sect; 865 prev_size = mdev->ldev->md.md_size_sect;
845 la_size_sect = mdev->ldev->md.la_size_sect; 866 la_size_sect = mdev->ldev->md.la_size_sect;
846 867
847 /* TODO: should only be some assert here, not (re)init... */ 868 if (rs) {
869 /* rs is non NULL if we should change the AL layout only */
870
871 prev_al_stripes = md->al_stripes;
872 prev_al_stripe_size_4k = md->al_stripe_size_4k;
873
874 md->al_stripes = rs->al_stripes;
875 md->al_stripe_size_4k = rs->al_stripe_size / 4;
876 md->al_size_4k = (u64)rs->al_stripes * rs->al_stripe_size / 4;
877 }
878
848 drbd_md_set_sector_offsets(mdev, mdev->ldev); 879 drbd_md_set_sector_offsets(mdev, mdev->ldev);
849 880
850 rcu_read_lock(); 881 rcu_read_lock();
@@ -852,6 +883,21 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
852 rcu_read_unlock(); 883 rcu_read_unlock();
853 size = drbd_new_dev_size(mdev, mdev->ldev, u_size, flags & DDSF_FORCED); 884 size = drbd_new_dev_size(mdev, mdev->ldev, u_size, flags & DDSF_FORCED);
854 885
886 if (size < la_size_sect) {
887 if (rs && u_size == 0) {
888 /* Remove "rs &&" later. This check should always be active, but
889 right now the receiver expects the permissive behavior */
890 dev_warn(DEV, "Implicit shrink not allowed. "
891 "Use --size=%llus for explicit shrink.\n",
892 (unsigned long long)size);
893 rv = DS_ERROR_SHRINK;
894 }
895 if (u_size > size)
896 rv = DS_ERROR_SPACE_MD;
897 if (rv != DS_UNCHANGED)
898 goto err_out;
899 }
900
855 if (drbd_get_capacity(mdev->this_bdev) != size || 901 if (drbd_get_capacity(mdev->this_bdev) != size ||
856 drbd_bm_capacity(mdev) != size) { 902 drbd_bm_capacity(mdev) != size) {
857 int err; 903 int err;
@@ -867,7 +913,7 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
867 "Leaving size unchanged at size = %lu KB\n", 913 "Leaving size unchanged at size = %lu KB\n",
868 (unsigned long)size); 914 (unsigned long)size);
869 } 915 }
870 rv = dev_size_error; 916 rv = DS_ERROR;
871 } 917 }
872 /* racy, see comments above. */ 918 /* racy, see comments above. */
873 drbd_set_my_capacity(mdev, size); 919 drbd_set_my_capacity(mdev, size);
@@ -875,38 +921,57 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
875 dev_info(DEV, "size = %s (%llu KB)\n", ppsize(ppb, size>>1), 921 dev_info(DEV, "size = %s (%llu KB)\n", ppsize(ppb, size>>1),
876 (unsigned long long)size>>1); 922 (unsigned long long)size>>1);
877 } 923 }
878 if (rv == dev_size_error) 924 if (rv <= DS_ERROR)
879 goto out; 925 goto err_out;
880 926
881 la_size_changed = (la_size_sect != mdev->ldev->md.la_size_sect); 927 la_size_changed = (la_size_sect != mdev->ldev->md.la_size_sect);
882 928
883 md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev) 929 md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev)
884 || prev_size != mdev->ldev->md.md_size_sect; 930 || prev_size != mdev->ldev->md.md_size_sect;
885 931
886 if (la_size_changed || md_moved) { 932 if (la_size_changed || md_moved || rs) {
887 int err; 933 u32 prev_flags;
888 934
889 drbd_al_shrink(mdev); /* All extents inactive. */ 935 drbd_al_shrink(mdev); /* All extents inactive. */
936
937 prev_flags = md->flags;
938 md->flags &= ~MDF_PRIMARY_IND;
939 drbd_md_write(mdev, buffer);
940
890 dev_info(DEV, "Writing the whole bitmap, %s\n", 941 dev_info(DEV, "Writing the whole bitmap, %s\n",
891 la_size_changed && md_moved ? "size changed and md moved" : 942 la_size_changed && md_moved ? "size changed and md moved" :
892 la_size_changed ? "size changed" : "md moved"); 943 la_size_changed ? "size changed" : "md moved");
893 /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */ 944 /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */
894 err = drbd_bitmap_io(mdev, md_moved ? &drbd_bm_write_all : &drbd_bm_write, 945 drbd_bitmap_io(mdev, md_moved ? &drbd_bm_write_all : &drbd_bm_write,
895 "size changed", BM_LOCKED_MASK); 946 "size changed", BM_LOCKED_MASK);
896 if (err) { 947 drbd_initialize_al(mdev, buffer);
897 rv = dev_size_error; 948
898 goto out; 949 md->flags = prev_flags;
899 } 950 drbd_md_write(mdev, buffer);
900 drbd_md_mark_dirty(mdev); 951
952 if (rs)
953 dev_info(DEV, "Changed AL layout to al-stripes = %d, al-stripe-size-kB = %d\n",
954 md->al_stripes, md->al_stripe_size_4k * 4);
901 } 955 }
902 956
903 if (size > la_size_sect) 957 if (size > la_size_sect)
904 rv = grew; 958 rv = DS_GREW;
905 if (size < la_size_sect) 959 if (size < la_size_sect)
906 rv = shrunk; 960 rv = DS_SHRUNK;
907out: 961
962 if (0) {
963 err_out:
964 if (rs) {
965 md->al_stripes = prev_al_stripes;
966 md->al_stripe_size_4k = prev_al_stripe_size_4k;
967 md->al_size_4k = (u64)prev_al_stripes * prev_al_stripe_size_4k;
968
969 drbd_md_set_sector_offsets(mdev, mdev->ldev);
970 }
971 }
908 lc_unlock(mdev->act_log); 972 lc_unlock(mdev->act_log);
909 wake_up(&mdev->al_wait); 973 wake_up(&mdev->al_wait);
974 drbd_md_put_buffer(mdev);
910 drbd_resume_io(mdev); 975 drbd_resume_io(mdev);
911 976
912 return rv; 977 return rv;
@@ -1607,11 +1672,11 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
1607 !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND)) 1672 !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND))
1608 set_bit(USE_DEGR_WFC_T, &mdev->flags); 1673 set_bit(USE_DEGR_WFC_T, &mdev->flags);
1609 1674
1610 dd = drbd_determine_dev_size(mdev, 0); 1675 dd = drbd_determine_dev_size(mdev, 0, NULL);
1611 if (dd == dev_size_error) { 1676 if (dd <= DS_ERROR) {
1612 retcode = ERR_NOMEM_BITMAP; 1677 retcode = ERR_NOMEM_BITMAP;
1613 goto force_diskless_dec; 1678 goto force_diskless_dec;
1614 } else if (dd == grew) 1679 } else if (dd == DS_GREW)
1615 set_bit(RESYNC_AFTER_NEG, &mdev->flags); 1680 set_bit(RESYNC_AFTER_NEG, &mdev->flags);
1616 1681
1617 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC) || 1682 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC) ||
@@ -2305,6 +2370,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
2305 struct drbd_conf *mdev; 2370 struct drbd_conf *mdev;
2306 enum drbd_ret_code retcode; 2371 enum drbd_ret_code retcode;
2307 enum determine_dev_size dd; 2372 enum determine_dev_size dd;
2373 bool change_al_layout = false;
2308 enum dds_flags ddsf; 2374 enum dds_flags ddsf;
2309 sector_t u_size; 2375 sector_t u_size;
2310 int err; 2376 int err;
@@ -2315,31 +2381,33 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
2315 if (retcode != NO_ERROR) 2381 if (retcode != NO_ERROR)
2316 goto fail; 2382 goto fail;
2317 2383
2384 mdev = adm_ctx.mdev;
2385 if (!get_ldev(mdev)) {
2386 retcode = ERR_NO_DISK;
2387 goto fail;
2388 }
2389
2318 memset(&rs, 0, sizeof(struct resize_parms)); 2390 memset(&rs, 0, sizeof(struct resize_parms));
2391 rs.al_stripes = mdev->ldev->md.al_stripes;
2392 rs.al_stripe_size = mdev->ldev->md.al_stripe_size_4k * 4;
2319 if (info->attrs[DRBD_NLA_RESIZE_PARMS]) { 2393 if (info->attrs[DRBD_NLA_RESIZE_PARMS]) {
2320 err = resize_parms_from_attrs(&rs, info); 2394 err = resize_parms_from_attrs(&rs, info);
2321 if (err) { 2395 if (err) {
2322 retcode = ERR_MANDATORY_TAG; 2396 retcode = ERR_MANDATORY_TAG;
2323 drbd_msg_put_info(from_attrs_err_to_txt(err)); 2397 drbd_msg_put_info(from_attrs_err_to_txt(err));
2324 goto fail; 2398 goto fail_ldev;
2325 } 2399 }
2326 } 2400 }
2327 2401
2328 mdev = adm_ctx.mdev;
2329 if (mdev->state.conn > C_CONNECTED) { 2402 if (mdev->state.conn > C_CONNECTED) {
2330 retcode = ERR_RESIZE_RESYNC; 2403 retcode = ERR_RESIZE_RESYNC;
2331 goto fail; 2404 goto fail_ldev;
2332 } 2405 }
2333 2406
2334 if (mdev->state.role == R_SECONDARY && 2407 if (mdev->state.role == R_SECONDARY &&
2335 mdev->state.peer == R_SECONDARY) { 2408 mdev->state.peer == R_SECONDARY) {
2336 retcode = ERR_NO_PRIMARY; 2409 retcode = ERR_NO_PRIMARY;
2337 goto fail; 2410 goto fail_ldev;
2338 }
2339
2340 if (!get_ldev(mdev)) {
2341 retcode = ERR_NO_DISK;
2342 goto fail;
2343 } 2411 }
2344 2412
2345 if (rs.no_resync && mdev->tconn->agreed_pro_version < 93) { 2413 if (rs.no_resync && mdev->tconn->agreed_pro_version < 93) {
@@ -2358,6 +2426,28 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
2358 } 2426 }
2359 } 2427 }
2360 2428
2429 if (mdev->ldev->md.al_stripes != rs.al_stripes ||
2430 mdev->ldev->md.al_stripe_size_4k != rs.al_stripe_size / 4) {
2431 u32 al_size_k = rs.al_stripes * rs.al_stripe_size;
2432
2433 if (al_size_k > (16 * 1024 * 1024)) {
2434 retcode = ERR_MD_LAYOUT_TOO_BIG;
2435 goto fail_ldev;
2436 }
2437
2438 if (al_size_k < MD_32kB_SECT/2) {
2439 retcode = ERR_MD_LAYOUT_TOO_SMALL;
2440 goto fail_ldev;
2441 }
2442
2443 if (mdev->state.conn != C_CONNECTED) {
2444 retcode = ERR_MD_LAYOUT_CONNECTED;
2445 goto fail_ldev;
2446 }
2447
2448 change_al_layout = true;
2449 }
2450
2361 if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) 2451 if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev))
2362 mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); 2452 mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
2363 2453
@@ -2373,16 +2463,22 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
2373 } 2463 }
2374 2464
2375 ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0); 2465 ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0);
2376 dd = drbd_determine_dev_size(mdev, ddsf); 2466 dd = drbd_determine_dev_size(mdev, ddsf, change_al_layout ? &rs : NULL);
2377 drbd_md_sync(mdev); 2467 drbd_md_sync(mdev);
2378 put_ldev(mdev); 2468 put_ldev(mdev);
2379 if (dd == dev_size_error) { 2469 if (dd == DS_ERROR) {
2380 retcode = ERR_NOMEM_BITMAP; 2470 retcode = ERR_NOMEM_BITMAP;
2381 goto fail; 2471 goto fail;
2472 } else if (dd == DS_ERROR_SPACE_MD) {
2473 retcode = ERR_MD_LAYOUT_NO_FIT;
2474 goto fail;
2475 } else if (dd == DS_ERROR_SHRINK) {
2476 retcode = ERR_IMPLICIT_SHRINK;
2477 goto fail;
2382 } 2478 }
2383 2479
2384 if (mdev->state.conn == C_CONNECTED) { 2480 if (mdev->state.conn == C_CONNECTED) {
2385 if (dd == grew) 2481 if (dd == DS_GREW)
2386 set_bit(RESIZE_PENDING, &mdev->flags); 2482 set_bit(RESIZE_PENDING, &mdev->flags);
2387 2483
2388 drbd_send_uuids(mdev); 2484 drbd_send_uuids(mdev);
@@ -2658,7 +2754,6 @@ int nla_put_status_info(struct sk_buff *skb, struct drbd_conf *mdev,
2658 const struct sib_info *sib) 2754 const struct sib_info *sib)
2659{ 2755{
2660 struct state_info *si = NULL; /* for sizeof(si->member); */ 2756 struct state_info *si = NULL; /* for sizeof(si->member); */
2661 struct net_conf *nc;
2662 struct nlattr *nla; 2757 struct nlattr *nla;
2663 int got_ldev; 2758 int got_ldev;
2664 int err = 0; 2759 int err = 0;
@@ -2688,13 +2783,19 @@ int nla_put_status_info(struct sk_buff *skb, struct drbd_conf *mdev,
2688 goto nla_put_failure; 2783 goto nla_put_failure;
2689 2784
2690 rcu_read_lock(); 2785 rcu_read_lock();
2691 if (got_ldev) 2786 if (got_ldev) {
2692 if (disk_conf_to_skb(skb, rcu_dereference(mdev->ldev->disk_conf), exclude_sensitive)) 2787 struct disk_conf *disk_conf;
2693 goto nla_put_failure;
2694 2788
2695 nc = rcu_dereference(mdev->tconn->net_conf); 2789 disk_conf = rcu_dereference(mdev->ldev->disk_conf);
2696 if (nc) 2790 err = disk_conf_to_skb(skb, disk_conf, exclude_sensitive);
2697 err = net_conf_to_skb(skb, nc, exclude_sensitive); 2791 }
2792 if (!err) {
2793 struct net_conf *nc;
2794
2795 nc = rcu_dereference(mdev->tconn->net_conf);
2796 if (nc)
2797 err = net_conf_to_skb(skb, nc, exclude_sensitive);
2798 }
2698 rcu_read_unlock(); 2799 rcu_read_unlock();
2699 if (err) 2800 if (err)
2700 goto nla_put_failure; 2801 goto nla_put_failure;
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 4222affff488..cc29cd3bf78b 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1039,6 +1039,8 @@ randomize:
1039 rcu_read_lock(); 1039 rcu_read_lock();
1040 idr_for_each_entry(&tconn->volumes, mdev, vnr) { 1040 idr_for_each_entry(&tconn->volumes, mdev, vnr) {
1041 kref_get(&mdev->kref); 1041 kref_get(&mdev->kref);
1042 rcu_read_unlock();
1043
1042 /* Prevent a race between resync-handshake and 1044 /* Prevent a race between resync-handshake and
1043 * being promoted to Primary. 1045 * being promoted to Primary.
1044 * 1046 *
@@ -1049,8 +1051,6 @@ randomize:
1049 mutex_lock(mdev->state_mutex); 1051 mutex_lock(mdev->state_mutex);
1050 mutex_unlock(mdev->state_mutex); 1052 mutex_unlock(mdev->state_mutex);
1051 1053
1052 rcu_read_unlock();
1053
1054 if (discard_my_data) 1054 if (discard_my_data)
1055 set_bit(DISCARD_MY_DATA, &mdev->flags); 1055 set_bit(DISCARD_MY_DATA, &mdev->flags);
1056 else 1056 else
@@ -3545,7 +3545,7 @@ static int receive_sizes(struct drbd_tconn *tconn, struct packet_info *pi)
3545{ 3545{
3546 struct drbd_conf *mdev; 3546 struct drbd_conf *mdev;
3547 struct p_sizes *p = pi->data; 3547 struct p_sizes *p = pi->data;
3548 enum determine_dev_size dd = unchanged; 3548 enum determine_dev_size dd = DS_UNCHANGED;
3549 sector_t p_size, p_usize, my_usize; 3549 sector_t p_size, p_usize, my_usize;
3550 int ldsc = 0; /* local disk size changed */ 3550 int ldsc = 0; /* local disk size changed */
3551 enum dds_flags ddsf; 3551 enum dds_flags ddsf;
@@ -3617,9 +3617,9 @@ static int receive_sizes(struct drbd_tconn *tconn, struct packet_info *pi)
3617 3617
3618 ddsf = be16_to_cpu(p->dds_flags); 3618 ddsf = be16_to_cpu(p->dds_flags);
3619 if (get_ldev(mdev)) { 3619 if (get_ldev(mdev)) {
3620 dd = drbd_determine_dev_size(mdev, ddsf); 3620 dd = drbd_determine_dev_size(mdev, ddsf, NULL);
3621 put_ldev(mdev); 3621 put_ldev(mdev);
3622 if (dd == dev_size_error) 3622 if (dd == DS_ERROR)
3623 return -EIO; 3623 return -EIO;
3624 drbd_md_sync(mdev); 3624 drbd_md_sync(mdev);
3625 } else { 3625 } else {
@@ -3647,7 +3647,7 @@ static int receive_sizes(struct drbd_tconn *tconn, struct packet_info *pi)
3647 drbd_send_sizes(mdev, 0, ddsf); 3647 drbd_send_sizes(mdev, 0, ddsf);
3648 } 3648 }
3649 if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) || 3649 if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) ||
3650 (dd == grew && mdev->state.conn == C_CONNECTED)) { 3650 (dd == DS_GREW && mdev->state.conn == C_CONNECTED)) {
3651 if (mdev->state.pdsk >= D_INCONSISTENT && 3651 if (mdev->state.pdsk >= D_INCONSISTENT &&
3652 mdev->state.disk >= D_INCONSISTENT) { 3652 mdev->state.disk >= D_INCONSISTENT) {
3653 if (ddsf & DDSF_NO_RESYNC) 3653 if (ddsf & DDSF_NO_RESYNC)
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index 90c5be2b1d30..216d47b7e88b 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -1115,8 +1115,10 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
1115 drbd_thread_restart_nowait(&mdev->tconn->receiver); 1115 drbd_thread_restart_nowait(&mdev->tconn->receiver);
1116 1116
1117 /* Resume AL writing if we get a connection */ 1117 /* Resume AL writing if we get a connection */
1118 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) 1118 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1119 drbd_resume_al(mdev); 1119 drbd_resume_al(mdev);
1120 mdev->tconn->connect_cnt++;
1121 }
1120 1122
1121 /* remember last attach time so request_timer_fn() won't 1123 /* remember last attach time so request_timer_fn() won't
1122 * kill newly established sessions while we are still trying to thaw 1124 * kill newly established sessions while we are still trying to thaw
diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c
index 5af21f2db29c..6e85e21445eb 100644
--- a/drivers/block/rsxx/core.c
+++ b/drivers/block/rsxx/core.c
@@ -31,6 +31,8 @@
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/bitops.h> 32#include <linux/bitops.h>
33#include <linux/delay.h> 33#include <linux/delay.h>
34#include <linux/debugfs.h>
35#include <linux/seq_file.h>
34 36
35#include <linux/genhd.h> 37#include <linux/genhd.h>
36#include <linux/idr.h> 38#include <linux/idr.h>
@@ -39,8 +41,9 @@
39#include "rsxx_cfg.h" 41#include "rsxx_cfg.h"
40 42
41#define NO_LEGACY 0 43#define NO_LEGACY 0
44#define SYNC_START_TIMEOUT (10 * 60) /* 10 minutes */
42 45
43MODULE_DESCRIPTION("IBM FlashSystem 70/80 PCIe SSD Device Driver"); 46MODULE_DESCRIPTION("IBM Flash Adapter 900GB Full Height Device Driver");
44MODULE_AUTHOR("Joshua Morris/Philip Kelleher, IBM"); 47MODULE_AUTHOR("Joshua Morris/Philip Kelleher, IBM");
45MODULE_LICENSE("GPL"); 48MODULE_LICENSE("GPL");
46MODULE_VERSION(DRIVER_VERSION); 49MODULE_VERSION(DRIVER_VERSION);
@@ -49,9 +52,282 @@ static unsigned int force_legacy = NO_LEGACY;
49module_param(force_legacy, uint, 0444); 52module_param(force_legacy, uint, 0444);
50MODULE_PARM_DESC(force_legacy, "Force the use of legacy type PCI interrupts"); 53MODULE_PARM_DESC(force_legacy, "Force the use of legacy type PCI interrupts");
51 54
55static unsigned int sync_start = 1;
56module_param(sync_start, uint, 0444);
57MODULE_PARM_DESC(sync_start, "On by Default: Driver load will not complete "
58 "until the card startup has completed.");
59
52static DEFINE_IDA(rsxx_disk_ida); 60static DEFINE_IDA(rsxx_disk_ida);
53static DEFINE_SPINLOCK(rsxx_ida_lock); 61static DEFINE_SPINLOCK(rsxx_ida_lock);
54 62
63/* --------------------Debugfs Setup ------------------- */
64
65struct rsxx_cram {
66 u32 f_pos;
67 u32 offset;
68 void *i_private;
69};
70
71static int rsxx_attr_pci_regs_show(struct seq_file *m, void *p)
72{
73 struct rsxx_cardinfo *card = m->private;
74
75 seq_printf(m, "HWID 0x%08x\n",
76 ioread32(card->regmap + HWID));
77 seq_printf(m, "SCRATCH 0x%08x\n",
78 ioread32(card->regmap + SCRATCH));
79 seq_printf(m, "IER 0x%08x\n",
80 ioread32(card->regmap + IER));
81 seq_printf(m, "IPR 0x%08x\n",
82 ioread32(card->regmap + IPR));
83 seq_printf(m, "CREG_CMD 0x%08x\n",
84 ioread32(card->regmap + CREG_CMD));
85 seq_printf(m, "CREG_ADD 0x%08x\n",
86 ioread32(card->regmap + CREG_ADD));
87 seq_printf(m, "CREG_CNT 0x%08x\n",
88 ioread32(card->regmap + CREG_CNT));
89 seq_printf(m, "CREG_STAT 0x%08x\n",
90 ioread32(card->regmap + CREG_STAT));
91 seq_printf(m, "CREG_DATA0 0x%08x\n",
92 ioread32(card->regmap + CREG_DATA0));
93 seq_printf(m, "CREG_DATA1 0x%08x\n",
94 ioread32(card->regmap + CREG_DATA1));
95 seq_printf(m, "CREG_DATA2 0x%08x\n",
96 ioread32(card->regmap + CREG_DATA2));
97 seq_printf(m, "CREG_DATA3 0x%08x\n",
98 ioread32(card->regmap + CREG_DATA3));
99 seq_printf(m, "CREG_DATA4 0x%08x\n",
100 ioread32(card->regmap + CREG_DATA4));
101 seq_printf(m, "CREG_DATA5 0x%08x\n",
102 ioread32(card->regmap + CREG_DATA5));
103 seq_printf(m, "CREG_DATA6 0x%08x\n",
104 ioread32(card->regmap + CREG_DATA6));
105 seq_printf(m, "CREG_DATA7 0x%08x\n",
106 ioread32(card->regmap + CREG_DATA7));
107 seq_printf(m, "INTR_COAL 0x%08x\n",
108 ioread32(card->regmap + INTR_COAL));
109 seq_printf(m, "HW_ERROR 0x%08x\n",
110 ioread32(card->regmap + HW_ERROR));
111 seq_printf(m, "DEBUG0 0x%08x\n",
112 ioread32(card->regmap + PCI_DEBUG0));
113 seq_printf(m, "DEBUG1 0x%08x\n",
114 ioread32(card->regmap + PCI_DEBUG1));
115 seq_printf(m, "DEBUG2 0x%08x\n",
116 ioread32(card->regmap + PCI_DEBUG2));
117 seq_printf(m, "DEBUG3 0x%08x\n",
118 ioread32(card->regmap + PCI_DEBUG3));
119 seq_printf(m, "DEBUG4 0x%08x\n",
120 ioread32(card->regmap + PCI_DEBUG4));
121 seq_printf(m, "DEBUG5 0x%08x\n",
122 ioread32(card->regmap + PCI_DEBUG5));
123 seq_printf(m, "DEBUG6 0x%08x\n",
124 ioread32(card->regmap + PCI_DEBUG6));
125 seq_printf(m, "DEBUG7 0x%08x\n",
126 ioread32(card->regmap + PCI_DEBUG7));
127 seq_printf(m, "RECONFIG 0x%08x\n",
128 ioread32(card->regmap + PCI_RECONFIG));
129
130 return 0;
131}
132
133static int rsxx_attr_stats_show(struct seq_file *m, void *p)
134{
135 struct rsxx_cardinfo *card = m->private;
136 int i;
137
138 for (i = 0; i < card->n_targets; i++) {
139 seq_printf(m, "Ctrl %d CRC Errors = %d\n",
140 i, card->ctrl[i].stats.crc_errors);
141 seq_printf(m, "Ctrl %d Hard Errors = %d\n",
142 i, card->ctrl[i].stats.hard_errors);
143 seq_printf(m, "Ctrl %d Soft Errors = %d\n",
144 i, card->ctrl[i].stats.soft_errors);
145 seq_printf(m, "Ctrl %d Writes Issued = %d\n",
146 i, card->ctrl[i].stats.writes_issued);
147 seq_printf(m, "Ctrl %d Writes Failed = %d\n",
148 i, card->ctrl[i].stats.writes_failed);
149 seq_printf(m, "Ctrl %d Reads Issued = %d\n",
150 i, card->ctrl[i].stats.reads_issued);
151 seq_printf(m, "Ctrl %d Reads Failed = %d\n",
152 i, card->ctrl[i].stats.reads_failed);
153 seq_printf(m, "Ctrl %d Reads Retried = %d\n",
154 i, card->ctrl[i].stats.reads_retried);
155 seq_printf(m, "Ctrl %d Discards Issued = %d\n",
156 i, card->ctrl[i].stats.discards_issued);
157 seq_printf(m, "Ctrl %d Discards Failed = %d\n",
158 i, card->ctrl[i].stats.discards_failed);
159 seq_printf(m, "Ctrl %d DMA SW Errors = %d\n",
160 i, card->ctrl[i].stats.dma_sw_err);
161 seq_printf(m, "Ctrl %d DMA HW Faults = %d\n",
162 i, card->ctrl[i].stats.dma_hw_fault);
163 seq_printf(m, "Ctrl %d DMAs Cancelled = %d\n",
164 i, card->ctrl[i].stats.dma_cancelled);
165 seq_printf(m, "Ctrl %d SW Queue Depth = %d\n",
166 i, card->ctrl[i].stats.sw_q_depth);
167 seq_printf(m, "Ctrl %d HW Queue Depth = %d\n",
168 i, atomic_read(&card->ctrl[i].stats.hw_q_depth));
169 }
170
171 return 0;
172}
173
174static int rsxx_attr_stats_open(struct inode *inode, struct file *file)
175{
176 return single_open(file, rsxx_attr_stats_show, inode->i_private);
177}
178
179static int rsxx_attr_pci_regs_open(struct inode *inode, struct file *file)
180{
181 return single_open(file, rsxx_attr_pci_regs_show, inode->i_private);
182}
183
184static ssize_t rsxx_cram_read(struct file *fp, char __user *ubuf,
185 size_t cnt, loff_t *ppos)
186{
187 struct rsxx_cram *info = fp->private_data;
188 struct rsxx_cardinfo *card = info->i_private;
189 char *buf;
190 int st;
191
192 buf = kzalloc(sizeof(*buf) * cnt, GFP_KERNEL);
193 if (!buf)
194 return -ENOMEM;
195
196 info->f_pos = (u32)*ppos + info->offset;
197
198 st = rsxx_creg_read(card, CREG_ADD_CRAM + info->f_pos, cnt, buf, 1);
199 if (st)
200 return st;
201
202 st = copy_to_user(ubuf, buf, cnt);
203 if (st)
204 return st;
205
206 info->offset += cnt;
207
208 kfree(buf);
209
210 return cnt;
211}
212
213static ssize_t rsxx_cram_write(struct file *fp, const char __user *ubuf,
214 size_t cnt, loff_t *ppos)
215{
216 struct rsxx_cram *info = fp->private_data;
217 struct rsxx_cardinfo *card = info->i_private;
218 char *buf;
219 int st;
220
221 buf = kzalloc(sizeof(*buf) * cnt, GFP_KERNEL);
222 if (!buf)
223 return -ENOMEM;
224
225 st = copy_from_user(buf, ubuf, cnt);
226 if (st)
227 return st;
228
229 info->f_pos = (u32)*ppos + info->offset;
230
231 st = rsxx_creg_write(card, CREG_ADD_CRAM + info->f_pos, cnt, buf, 1);
232 if (st)
233 return st;
234
235 info->offset += cnt;
236
237 kfree(buf);
238
239 return cnt;
240}
241
242static int rsxx_cram_open(struct inode *inode, struct file *file)
243{
244 struct rsxx_cram *info = kzalloc(sizeof(*info), GFP_KERNEL);
245 if (!info)
246 return -ENOMEM;
247
248 info->i_private = inode->i_private;
249 info->f_pos = file->f_pos;
250 file->private_data = info;
251
252 return 0;
253}
254
255static int rsxx_cram_release(struct inode *inode, struct file *file)
256{
257 struct rsxx_cram *info = file->private_data;
258
259 if (!info)
260 return 0;
261
262 kfree(info);
263 file->private_data = NULL;
264
265 return 0;
266}
267
268static const struct file_operations debugfs_cram_fops = {
269 .owner = THIS_MODULE,
270 .open = rsxx_cram_open,
271 .read = rsxx_cram_read,
272 .write = rsxx_cram_write,
273 .release = rsxx_cram_release,
274};
275
276static const struct file_operations debugfs_stats_fops = {
277 .owner = THIS_MODULE,
278 .open = rsxx_attr_stats_open,
279 .read = seq_read,
280 .llseek = seq_lseek,
281 .release = single_release,
282};
283
284static const struct file_operations debugfs_pci_regs_fops = {
285 .owner = THIS_MODULE,
286 .open = rsxx_attr_pci_regs_open,
287 .read = seq_read,
288 .llseek = seq_lseek,
289 .release = single_release,
290};
291
292static void rsxx_debugfs_dev_new(struct rsxx_cardinfo *card)
293{
294 struct dentry *debugfs_stats;
295 struct dentry *debugfs_pci_regs;
296 struct dentry *debugfs_cram;
297
298 card->debugfs_dir = debugfs_create_dir(card->gendisk->disk_name, NULL);
299 if (IS_ERR_OR_NULL(card->debugfs_dir))
300 goto failed_debugfs_dir;
301
302 debugfs_stats = debugfs_create_file("stats", S_IRUGO,
303 card->debugfs_dir, card,
304 &debugfs_stats_fops);
305 if (IS_ERR_OR_NULL(debugfs_stats))
306 goto failed_debugfs_stats;
307
308 debugfs_pci_regs = debugfs_create_file("pci_regs", S_IRUGO,
309 card->debugfs_dir, card,
310 &debugfs_pci_regs_fops);
311 if (IS_ERR_OR_NULL(debugfs_pci_regs))
312 goto failed_debugfs_pci_regs;
313
314 debugfs_cram = debugfs_create_file("cram", S_IRUGO | S_IWUSR,
315 card->debugfs_dir, card,
316 &debugfs_cram_fops);
317 if (IS_ERR_OR_NULL(debugfs_cram))
318 goto failed_debugfs_cram;
319
320 return;
321failed_debugfs_cram:
322 debugfs_remove(debugfs_pci_regs);
323failed_debugfs_pci_regs:
324 debugfs_remove(debugfs_stats);
325failed_debugfs_stats:
326 debugfs_remove(card->debugfs_dir);
327failed_debugfs_dir:
328 card->debugfs_dir = NULL;
329}
330
55/*----------------- Interrupt Control & Handling -------------------*/ 331/*----------------- Interrupt Control & Handling -------------------*/
56 332
57static void rsxx_mask_interrupts(struct rsxx_cardinfo *card) 333static void rsxx_mask_interrupts(struct rsxx_cardinfo *card)
@@ -163,12 +439,13 @@ static irqreturn_t rsxx_isr(int irq, void *pdata)
163 } 439 }
164 440
165 if (isr & CR_INTR_CREG) { 441 if (isr & CR_INTR_CREG) {
166 schedule_work(&card->creg_ctrl.done_work); 442 queue_work(card->creg_ctrl.creg_wq,
443 &card->creg_ctrl.done_work);
167 handled++; 444 handled++;
168 } 445 }
169 446
170 if (isr & CR_INTR_EVENT) { 447 if (isr & CR_INTR_EVENT) {
171 schedule_work(&card->event_work); 448 queue_work(card->event_wq, &card->event_work);
172 rsxx_disable_ier_and_isr(card, CR_INTR_EVENT); 449 rsxx_disable_ier_and_isr(card, CR_INTR_EVENT);
173 handled++; 450 handled++;
174 } 451 }
@@ -329,7 +606,7 @@ static int rsxx_eeh_frozen(struct pci_dev *dev)
329 int i; 606 int i;
330 int st; 607 int st;
331 608
332 dev_warn(&dev->dev, "IBM FlashSystem PCI: preparing for slot reset.\n"); 609 dev_warn(&dev->dev, "IBM Flash Adapter PCI: preparing for slot reset.\n");
333 610
334 card->eeh_state = 1; 611 card->eeh_state = 1;
335 rsxx_mask_interrupts(card); 612 rsxx_mask_interrupts(card);
@@ -367,15 +644,26 @@ static void rsxx_eeh_failure(struct pci_dev *dev)
367{ 644{
368 struct rsxx_cardinfo *card = pci_get_drvdata(dev); 645 struct rsxx_cardinfo *card = pci_get_drvdata(dev);
369 int i; 646 int i;
647 int cnt = 0;
370 648
371 dev_err(&dev->dev, "IBM FlashSystem PCI: disabling failed card.\n"); 649 dev_err(&dev->dev, "IBM Flash Adapter PCI: disabling failed card.\n");
372 650
373 card->eeh_state = 1; 651 card->eeh_state = 1;
652 card->halt = 1;
374 653
375 for (i = 0; i < card->n_targets; i++) 654 for (i = 0; i < card->n_targets; i++) {
376 del_timer_sync(&card->ctrl[i].activity_timer); 655 spin_lock_bh(&card->ctrl[i].queue_lock);
656 cnt = rsxx_cleanup_dma_queue(&card->ctrl[i],
657 &card->ctrl[i].queue);
658 spin_unlock_bh(&card->ctrl[i].queue_lock);
659
660 cnt += rsxx_dma_cancel(&card->ctrl[i]);
377 661
378 rsxx_eeh_cancel_dmas(card); 662 if (cnt)
663 dev_info(CARD_TO_DEV(card),
664 "Freed %d queued DMAs on channel %d\n",
665 cnt, card->ctrl[i].id);
666 }
379} 667}
380 668
381static int rsxx_eeh_fifo_flush_poll(struct rsxx_cardinfo *card) 669static int rsxx_eeh_fifo_flush_poll(struct rsxx_cardinfo *card)
@@ -432,7 +720,7 @@ static pci_ers_result_t rsxx_slot_reset(struct pci_dev *dev)
432 int st; 720 int st;
433 721
434 dev_warn(&dev->dev, 722 dev_warn(&dev->dev,
435 "IBM FlashSystem PCI: recovering from slot reset.\n"); 723 "IBM Flash Adapter PCI: recovering from slot reset.\n");
436 724
437 st = pci_enable_device(dev); 725 st = pci_enable_device(dev);
438 if (st) 726 if (st)
@@ -485,7 +773,7 @@ static pci_ers_result_t rsxx_slot_reset(struct pci_dev *dev)
485 &card->ctrl[i].issue_dma_work); 773 &card->ctrl[i].issue_dma_work);
486 } 774 }
487 775
488 dev_info(&dev->dev, "IBM FlashSystem PCI: recovery complete.\n"); 776 dev_info(&dev->dev, "IBM Flash Adapter PCI: recovery complete.\n");
489 777
490 return PCI_ERS_RESULT_RECOVERED; 778 return PCI_ERS_RESULT_RECOVERED;
491 779
@@ -528,6 +816,7 @@ static int rsxx_pci_probe(struct pci_dev *dev,
528{ 816{
529 struct rsxx_cardinfo *card; 817 struct rsxx_cardinfo *card;
530 int st; 818 int st;
819 unsigned int sync_timeout;
531 820
532 dev_info(&dev->dev, "PCI-Flash SSD discovered\n"); 821 dev_info(&dev->dev, "PCI-Flash SSD discovered\n");
533 822
@@ -610,7 +899,11 @@ static int rsxx_pci_probe(struct pci_dev *dev,
610 } 899 }
611 900
612 /************* Setup Processor Command Interface *************/ 901 /************* Setup Processor Command Interface *************/
613 rsxx_creg_setup(card); 902 st = rsxx_creg_setup(card);
903 if (st) {
904 dev_err(CARD_TO_DEV(card), "Failed to setup creg interface.\n");
905 goto failed_creg_setup;
906 }
614 907
615 spin_lock_irq(&card->irq_lock); 908 spin_lock_irq(&card->irq_lock);
616 rsxx_enable_ier_and_isr(card, CR_INTR_CREG); 909 rsxx_enable_ier_and_isr(card, CR_INTR_CREG);
@@ -650,6 +943,12 @@ static int rsxx_pci_probe(struct pci_dev *dev,
650 } 943 }
651 944
652 /************* Setup Card Event Handler *************/ 945 /************* Setup Card Event Handler *************/
946 card->event_wq = create_singlethread_workqueue(DRIVER_NAME"_event");
947 if (!card->event_wq) {
948 dev_err(CARD_TO_DEV(card), "Failed card event setup.\n");
949 goto failed_event_handler;
950 }
951
653 INIT_WORK(&card->event_work, card_event_handler); 952 INIT_WORK(&card->event_work, card_event_handler);
654 953
655 st = rsxx_setup_dev(card); 954 st = rsxx_setup_dev(card);
@@ -676,6 +975,33 @@ static int rsxx_pci_probe(struct pci_dev *dev,
676 if (st) 975 if (st)
677 dev_crit(CARD_TO_DEV(card), 976 dev_crit(CARD_TO_DEV(card),
678 "Failed issuing card startup\n"); 977 "Failed issuing card startup\n");
978 if (sync_start) {
979 sync_timeout = SYNC_START_TIMEOUT;
980
981 dev_info(CARD_TO_DEV(card),
982 "Waiting for card to startup\n");
983
984 do {
985 ssleep(1);
986 sync_timeout--;
987
988 rsxx_get_card_state(card, &card->state);
989 } while (sync_timeout &&
990 (card->state == CARD_STATE_STARTING));
991
992 if (card->state == CARD_STATE_STARTING) {
993 dev_warn(CARD_TO_DEV(card),
994 "Card startup timed out\n");
995 card->size8 = 0;
996 } else {
997 dev_info(CARD_TO_DEV(card),
998 "card state: %s\n",
999 rsxx_card_state_to_str(card->state));
1000 st = rsxx_get_card_size8(card, &card->size8);
1001 if (st)
1002 card->size8 = 0;
1003 }
1004 }
679 } else if (card->state == CARD_STATE_GOOD || 1005 } else if (card->state == CARD_STATE_GOOD ||
680 card->state == CARD_STATE_RD_ONLY_FAULT) { 1006 card->state == CARD_STATE_RD_ONLY_FAULT) {
681 st = rsxx_get_card_size8(card, &card->size8); 1007 st = rsxx_get_card_size8(card, &card->size8);
@@ -685,12 +1011,21 @@ static int rsxx_pci_probe(struct pci_dev *dev,
685 1011
686 rsxx_attach_dev(card); 1012 rsxx_attach_dev(card);
687 1013
1014 /************* Setup Debugfs *************/
1015 rsxx_debugfs_dev_new(card);
1016
688 return 0; 1017 return 0;
689 1018
690failed_create_dev: 1019failed_create_dev:
1020 destroy_workqueue(card->event_wq);
1021 card->event_wq = NULL;
1022failed_event_handler:
691 rsxx_dma_destroy(card); 1023 rsxx_dma_destroy(card);
692failed_dma_setup: 1024failed_dma_setup:
693failed_compatiblity_check: 1025failed_compatiblity_check:
1026 destroy_workqueue(card->creg_ctrl.creg_wq);
1027 card->creg_ctrl.creg_wq = NULL;
1028failed_creg_setup:
694 spin_lock_irq(&card->irq_lock); 1029 spin_lock_irq(&card->irq_lock);
695 rsxx_disable_ier_and_isr(card, CR_INTR_ALL); 1030 rsxx_disable_ier_and_isr(card, CR_INTR_ALL);
696 spin_unlock_irq(&card->irq_lock); 1031 spin_unlock_irq(&card->irq_lock);
@@ -756,6 +1091,8 @@ static void rsxx_pci_remove(struct pci_dev *dev)
756 /* Prevent work_structs from re-queuing themselves. */ 1091 /* Prevent work_structs from re-queuing themselves. */
757 card->halt = 1; 1092 card->halt = 1;
758 1093
1094 debugfs_remove_recursive(card->debugfs_dir);
1095
759 free_irq(dev->irq, card); 1096 free_irq(dev->irq, card);
760 1097
761 if (!force_legacy) 1098 if (!force_legacy)
diff --git a/drivers/block/rsxx/cregs.c b/drivers/block/rsxx/cregs.c
index 4b5c020a0a65..926dce9c452f 100644
--- a/drivers/block/rsxx/cregs.c
+++ b/drivers/block/rsxx/cregs.c
@@ -431,6 +431,15 @@ static int __issue_creg_rw(struct rsxx_cardinfo *card,
431 *hw_stat = completion.creg_status; 431 *hw_stat = completion.creg_status;
432 432
433 if (completion.st) { 433 if (completion.st) {
434 /*
435 * This read is needed to verify that there has not been any
436 * extreme errors that might have occurred, i.e. EEH. The
437 * function iowrite32 will not detect EEH errors, so it is
438 * necessary that we recover if such an error is the reason
439 * for the timeout. This is a dummy read.
440 */
441 ioread32(card->regmap + SCRATCH);
442
434 dev_warn(CARD_TO_DEV(card), 443 dev_warn(CARD_TO_DEV(card),
435 "creg command failed(%d x%08x)\n", 444 "creg command failed(%d x%08x)\n",
436 completion.st, addr); 445 completion.st, addr);
@@ -727,6 +736,11 @@ int rsxx_creg_setup(struct rsxx_cardinfo *card)
727{ 736{
728 card->creg_ctrl.active_cmd = NULL; 737 card->creg_ctrl.active_cmd = NULL;
729 738
739 card->creg_ctrl.creg_wq =
740 create_singlethread_workqueue(DRIVER_NAME"_creg");
741 if (!card->creg_ctrl.creg_wq)
742 return -ENOMEM;
743
730 INIT_WORK(&card->creg_ctrl.done_work, creg_cmd_done); 744 INIT_WORK(&card->creg_ctrl.done_work, creg_cmd_done);
731 mutex_init(&card->creg_ctrl.reset_lock); 745 mutex_init(&card->creg_ctrl.reset_lock);
732 INIT_LIST_HEAD(&card->creg_ctrl.queue); 746 INIT_LIST_HEAD(&card->creg_ctrl.queue);
diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c
index 4346d17d2949..d7af441880be 100644
--- a/drivers/block/rsxx/dev.c
+++ b/drivers/block/rsxx/dev.c
@@ -155,7 +155,8 @@ static void bio_dma_done_cb(struct rsxx_cardinfo *card,
155 atomic_set(&meta->error, 1); 155 atomic_set(&meta->error, 1);
156 156
157 if (atomic_dec_and_test(&meta->pending_dmas)) { 157 if (atomic_dec_and_test(&meta->pending_dmas)) {
158 disk_stats_complete(card, meta->bio, meta->start_time); 158 if (!card->eeh_state && card->gendisk)
159 disk_stats_complete(card, meta->bio, meta->start_time);
159 160
160 bio_endio(meta->bio, atomic_read(&meta->error) ? -EIO : 0); 161 bio_endio(meta->bio, atomic_read(&meta->error) ? -EIO : 0);
161 kmem_cache_free(bio_meta_pool, meta); 162 kmem_cache_free(bio_meta_pool, meta);
@@ -170,6 +171,12 @@ static void rsxx_make_request(struct request_queue *q, struct bio *bio)
170 171
171 might_sleep(); 172 might_sleep();
172 173
174 if (!card)
175 goto req_err;
176
177 if (bio->bi_sector + (bio->bi_size >> 9) > get_capacity(card->gendisk))
178 goto req_err;
179
173 if (unlikely(card->halt)) { 180 if (unlikely(card->halt)) {
174 st = -EFAULT; 181 st = -EFAULT;
175 goto req_err; 182 goto req_err;
@@ -196,7 +203,8 @@ static void rsxx_make_request(struct request_queue *q, struct bio *bio)
196 atomic_set(&bio_meta->pending_dmas, 0); 203 atomic_set(&bio_meta->pending_dmas, 0);
197 bio_meta->start_time = jiffies; 204 bio_meta->start_time = jiffies;
198 205
199 disk_stats_start(card, bio); 206 if (!unlikely(card->halt))
207 disk_stats_start(card, bio);
200 208
201 dev_dbg(CARD_TO_DEV(card), "BIO[%c]: meta: %p addr8: x%llx size: %d\n", 209 dev_dbg(CARD_TO_DEV(card), "BIO[%c]: meta: %p addr8: x%llx size: %d\n",
202 bio_data_dir(bio) ? 'W' : 'R', bio_meta, 210 bio_data_dir(bio) ? 'W' : 'R', bio_meta,
@@ -225,24 +233,6 @@ static bool rsxx_discard_supported(struct rsxx_cardinfo *card)
225 return (pci_rev >= RSXX_DISCARD_SUPPORT); 233 return (pci_rev >= RSXX_DISCARD_SUPPORT);
226} 234}
227 235
228static unsigned short rsxx_get_logical_block_size(
229 struct rsxx_cardinfo *card)
230{
231 u32 capabilities = 0;
232 int st;
233
234 st = rsxx_get_card_capabilities(card, &capabilities);
235 if (st)
236 dev_warn(CARD_TO_DEV(card),
237 "Failed reading card capabilities register\n");
238
239 /* Earlier firmware did not have support for 512 byte accesses */
240 if (capabilities & CARD_CAP_SUBPAGE_WRITES)
241 return 512;
242 else
243 return RSXX_HW_BLK_SIZE;
244}
245
246int rsxx_attach_dev(struct rsxx_cardinfo *card) 236int rsxx_attach_dev(struct rsxx_cardinfo *card)
247{ 237{
248 mutex_lock(&card->dev_lock); 238 mutex_lock(&card->dev_lock);
@@ -305,7 +295,7 @@ int rsxx_setup_dev(struct rsxx_cardinfo *card)
305 return -ENOMEM; 295 return -ENOMEM;
306 } 296 }
307 297
308 blk_size = rsxx_get_logical_block_size(card); 298 blk_size = card->config.data.block_size;
309 299
310 blk_queue_make_request(card->queue, rsxx_make_request); 300 blk_queue_make_request(card->queue, rsxx_make_request);
311 blk_queue_bounce_limit(card->queue, BLK_BOUNCE_ANY); 301 blk_queue_bounce_limit(card->queue, BLK_BOUNCE_ANY);
@@ -347,6 +337,7 @@ void rsxx_destroy_dev(struct rsxx_cardinfo *card)
347 card->gendisk = NULL; 337 card->gendisk = NULL;
348 338
349 blk_cleanup_queue(card->queue); 339 blk_cleanup_queue(card->queue);
340 card->queue->queuedata = NULL;
350 unregister_blkdev(card->major, DRIVER_NAME); 341 unregister_blkdev(card->major, DRIVER_NAME);
351} 342}
352 343
diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c
index 0607513cfb41..bed32f16b084 100644
--- a/drivers/block/rsxx/dma.c
+++ b/drivers/block/rsxx/dma.c
@@ -245,6 +245,22 @@ static void rsxx_complete_dma(struct rsxx_dma_ctrl *ctrl,
245 kmem_cache_free(rsxx_dma_pool, dma); 245 kmem_cache_free(rsxx_dma_pool, dma);
246} 246}
247 247
248int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl,
249 struct list_head *q)
250{
251 struct rsxx_dma *dma;
252 struct rsxx_dma *tmp;
253 int cnt = 0;
254
255 list_for_each_entry_safe(dma, tmp, q, list) {
256 list_del(&dma->list);
257 rsxx_complete_dma(ctrl, dma, DMA_CANCELLED);
258 cnt++;
259 }
260
261 return cnt;
262}
263
248static void rsxx_requeue_dma(struct rsxx_dma_ctrl *ctrl, 264static void rsxx_requeue_dma(struct rsxx_dma_ctrl *ctrl,
249 struct rsxx_dma *dma) 265 struct rsxx_dma *dma)
250{ 266{
@@ -252,9 +268,10 @@ static void rsxx_requeue_dma(struct rsxx_dma_ctrl *ctrl,
252 * Requeued DMAs go to the front of the queue so they are issued 268 * Requeued DMAs go to the front of the queue so they are issued
253 * first. 269 * first.
254 */ 270 */
255 spin_lock(&ctrl->queue_lock); 271 spin_lock_bh(&ctrl->queue_lock);
272 ctrl->stats.sw_q_depth++;
256 list_add(&dma->list, &ctrl->queue); 273 list_add(&dma->list, &ctrl->queue);
257 spin_unlock(&ctrl->queue_lock); 274 spin_unlock_bh(&ctrl->queue_lock);
258} 275}
259 276
260static void rsxx_handle_dma_error(struct rsxx_dma_ctrl *ctrl, 277static void rsxx_handle_dma_error(struct rsxx_dma_ctrl *ctrl,
@@ -329,6 +346,7 @@ static void rsxx_handle_dma_error(struct rsxx_dma_ctrl *ctrl,
329static void dma_engine_stalled(unsigned long data) 346static void dma_engine_stalled(unsigned long data)
330{ 347{
331 struct rsxx_dma_ctrl *ctrl = (struct rsxx_dma_ctrl *)data; 348 struct rsxx_dma_ctrl *ctrl = (struct rsxx_dma_ctrl *)data;
349 int cnt;
332 350
333 if (atomic_read(&ctrl->stats.hw_q_depth) == 0 || 351 if (atomic_read(&ctrl->stats.hw_q_depth) == 0 ||
334 unlikely(ctrl->card->eeh_state)) 352 unlikely(ctrl->card->eeh_state))
@@ -349,18 +367,28 @@ static void dma_engine_stalled(unsigned long data)
349 "DMA channel %d has stalled, faulting interface.\n", 367 "DMA channel %d has stalled, faulting interface.\n",
350 ctrl->id); 368 ctrl->id);
351 ctrl->card->dma_fault = 1; 369 ctrl->card->dma_fault = 1;
370
371 /* Clean up the DMA queue */
372 spin_lock(&ctrl->queue_lock);
373 cnt = rsxx_cleanup_dma_queue(ctrl, &ctrl->queue);
374 spin_unlock(&ctrl->queue_lock);
375
376 cnt += rsxx_dma_cancel(ctrl);
377
378 if (cnt)
379 dev_info(CARD_TO_DEV(ctrl->card),
380 "Freed %d queued DMAs on channel %d\n",
381 cnt, ctrl->id);
352 } 382 }
353} 383}
354 384
355static void rsxx_issue_dmas(struct work_struct *work) 385static void rsxx_issue_dmas(struct rsxx_dma_ctrl *ctrl)
356{ 386{
357 struct rsxx_dma_ctrl *ctrl;
358 struct rsxx_dma *dma; 387 struct rsxx_dma *dma;
359 int tag; 388 int tag;
360 int cmds_pending = 0; 389 int cmds_pending = 0;
361 struct hw_cmd *hw_cmd_buf; 390 struct hw_cmd *hw_cmd_buf;
362 391
363 ctrl = container_of(work, struct rsxx_dma_ctrl, issue_dma_work);
364 hw_cmd_buf = ctrl->cmd.buf; 392 hw_cmd_buf = ctrl->cmd.buf;
365 393
366 if (unlikely(ctrl->card->halt) || 394 if (unlikely(ctrl->card->halt) ||
@@ -368,22 +396,22 @@ static void rsxx_issue_dmas(struct work_struct *work)
368 return; 396 return;
369 397
370 while (1) { 398 while (1) {
371 spin_lock(&ctrl->queue_lock); 399 spin_lock_bh(&ctrl->queue_lock);
372 if (list_empty(&ctrl->queue)) { 400 if (list_empty(&ctrl->queue)) {
373 spin_unlock(&ctrl->queue_lock); 401 spin_unlock_bh(&ctrl->queue_lock);
374 break; 402 break;
375 } 403 }
376 spin_unlock(&ctrl->queue_lock); 404 spin_unlock_bh(&ctrl->queue_lock);
377 405
378 tag = pop_tracker(ctrl->trackers); 406 tag = pop_tracker(ctrl->trackers);
379 if (tag == -1) 407 if (tag == -1)
380 break; 408 break;
381 409
382 spin_lock(&ctrl->queue_lock); 410 spin_lock_bh(&ctrl->queue_lock);
383 dma = list_entry(ctrl->queue.next, struct rsxx_dma, list); 411 dma = list_entry(ctrl->queue.next, struct rsxx_dma, list);
384 list_del(&dma->list); 412 list_del(&dma->list);
385 ctrl->stats.sw_q_depth--; 413 ctrl->stats.sw_q_depth--;
386 spin_unlock(&ctrl->queue_lock); 414 spin_unlock_bh(&ctrl->queue_lock);
387 415
388 /* 416 /*
389 * This will catch any DMAs that slipped in right before the 417 * This will catch any DMAs that slipped in right before the
@@ -440,9 +468,8 @@ static void rsxx_issue_dmas(struct work_struct *work)
440 } 468 }
441} 469}
442 470
443static void rsxx_dma_done(struct work_struct *work) 471static void rsxx_dma_done(struct rsxx_dma_ctrl *ctrl)
444{ 472{
445 struct rsxx_dma_ctrl *ctrl;
446 struct rsxx_dma *dma; 473 struct rsxx_dma *dma;
447 unsigned long flags; 474 unsigned long flags;
448 u16 count; 475 u16 count;
@@ -450,7 +477,6 @@ static void rsxx_dma_done(struct work_struct *work)
450 u8 tag; 477 u8 tag;
451 struct hw_status *hw_st_buf; 478 struct hw_status *hw_st_buf;
452 479
453 ctrl = container_of(work, struct rsxx_dma_ctrl, dma_done_work);
454 hw_st_buf = ctrl->status.buf; 480 hw_st_buf = ctrl->status.buf;
455 481
456 if (unlikely(ctrl->card->halt) || 482 if (unlikely(ctrl->card->halt) ||
@@ -520,33 +546,32 @@ static void rsxx_dma_done(struct work_struct *work)
520 rsxx_enable_ier(ctrl->card, CR_INTR_DMA(ctrl->id)); 546 rsxx_enable_ier(ctrl->card, CR_INTR_DMA(ctrl->id));
521 spin_unlock_irqrestore(&ctrl->card->irq_lock, flags); 547 spin_unlock_irqrestore(&ctrl->card->irq_lock, flags);
522 548
523 spin_lock(&ctrl->queue_lock); 549 spin_lock_bh(&ctrl->queue_lock);
524 if (ctrl->stats.sw_q_depth) 550 if (ctrl->stats.sw_q_depth)
525 queue_work(ctrl->issue_wq, &ctrl->issue_dma_work); 551 queue_work(ctrl->issue_wq, &ctrl->issue_dma_work);
526 spin_unlock(&ctrl->queue_lock); 552 spin_unlock_bh(&ctrl->queue_lock);
527} 553}
528 554
529static int rsxx_cleanup_dma_queue(struct rsxx_cardinfo *card, 555static void rsxx_schedule_issue(struct work_struct *work)
530 struct list_head *q)
531{ 556{
532 struct rsxx_dma *dma; 557 struct rsxx_dma_ctrl *ctrl;
533 struct rsxx_dma *tmp;
534 int cnt = 0;
535 558
536 list_for_each_entry_safe(dma, tmp, q, list) { 559 ctrl = container_of(work, struct rsxx_dma_ctrl, issue_dma_work);
537 list_del(&dma->list);
538 560
539 if (dma->dma_addr) 561 mutex_lock(&ctrl->work_lock);
540 pci_unmap_page(card->dev, dma->dma_addr, 562 rsxx_issue_dmas(ctrl);
541 get_dma_size(dma), 563 mutex_unlock(&ctrl->work_lock);
542 (dma->cmd == HW_CMD_BLK_WRITE) ? 564}
543 PCI_DMA_TODEVICE :
544 PCI_DMA_FROMDEVICE);
545 kmem_cache_free(rsxx_dma_pool, dma);
546 cnt++;
547 }
548 565
549 return cnt; 566static void rsxx_schedule_done(struct work_struct *work)
567{
568 struct rsxx_dma_ctrl *ctrl;
569
570 ctrl = container_of(work, struct rsxx_dma_ctrl, dma_done_work);
571
572 mutex_lock(&ctrl->work_lock);
573 rsxx_dma_done(ctrl);
574 mutex_unlock(&ctrl->work_lock);
550} 575}
551 576
552static int rsxx_queue_discard(struct rsxx_cardinfo *card, 577static int rsxx_queue_discard(struct rsxx_cardinfo *card,
@@ -698,10 +723,10 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card,
698 723
699 for (i = 0; i < card->n_targets; i++) { 724 for (i = 0; i < card->n_targets; i++) {
700 if (!list_empty(&dma_list[i])) { 725 if (!list_empty(&dma_list[i])) {
701 spin_lock(&card->ctrl[i].queue_lock); 726 spin_lock_bh(&card->ctrl[i].queue_lock);
702 card->ctrl[i].stats.sw_q_depth += dma_cnt[i]; 727 card->ctrl[i].stats.sw_q_depth += dma_cnt[i];
703 list_splice_tail(&dma_list[i], &card->ctrl[i].queue); 728 list_splice_tail(&dma_list[i], &card->ctrl[i].queue);
704 spin_unlock(&card->ctrl[i].queue_lock); 729 spin_unlock_bh(&card->ctrl[i].queue_lock);
705 730
706 queue_work(card->ctrl[i].issue_wq, 731 queue_work(card->ctrl[i].issue_wq,
707 &card->ctrl[i].issue_dma_work); 732 &card->ctrl[i].issue_dma_work);
@@ -711,8 +736,11 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card,
711 return 0; 736 return 0;
712 737
713bvec_err: 738bvec_err:
714 for (i = 0; i < card->n_targets; i++) 739 for (i = 0; i < card->n_targets; i++) {
715 rsxx_cleanup_dma_queue(card, &dma_list[i]); 740 spin_lock_bh(&card->ctrl[i].queue_lock);
741 rsxx_cleanup_dma_queue(&card->ctrl[i], &dma_list[i]);
742 spin_unlock_bh(&card->ctrl[i].queue_lock);
743 }
716 744
717 return st; 745 return st;
718} 746}
@@ -780,6 +808,7 @@ static int rsxx_dma_ctrl_init(struct pci_dev *dev,
780 spin_lock_init(&ctrl->trackers->lock); 808 spin_lock_init(&ctrl->trackers->lock);
781 809
782 spin_lock_init(&ctrl->queue_lock); 810 spin_lock_init(&ctrl->queue_lock);
811 mutex_init(&ctrl->work_lock);
783 INIT_LIST_HEAD(&ctrl->queue); 812 INIT_LIST_HEAD(&ctrl->queue);
784 813
785 setup_timer(&ctrl->activity_timer, dma_engine_stalled, 814 setup_timer(&ctrl->activity_timer, dma_engine_stalled,
@@ -793,8 +822,8 @@ static int rsxx_dma_ctrl_init(struct pci_dev *dev,
793 if (!ctrl->done_wq) 822 if (!ctrl->done_wq)
794 return -ENOMEM; 823 return -ENOMEM;
795 824
796 INIT_WORK(&ctrl->issue_dma_work, rsxx_issue_dmas); 825 INIT_WORK(&ctrl->issue_dma_work, rsxx_schedule_issue);
797 INIT_WORK(&ctrl->dma_done_work, rsxx_dma_done); 826 INIT_WORK(&ctrl->dma_done_work, rsxx_schedule_done);
798 827
799 st = rsxx_hw_buffers_init(dev, ctrl); 828 st = rsxx_hw_buffers_init(dev, ctrl);
800 if (st) 829 if (st)
@@ -918,13 +947,30 @@ failed_dma_setup:
918 return st; 947 return st;
919} 948}
920 949
950int rsxx_dma_cancel(struct rsxx_dma_ctrl *ctrl)
951{
952 struct rsxx_dma *dma;
953 int i;
954 int cnt = 0;
955
956 /* Clean up issued DMAs */
957 for (i = 0; i < RSXX_MAX_OUTSTANDING_CMDS; i++) {
958 dma = get_tracker_dma(ctrl->trackers, i);
959 if (dma) {
960 atomic_dec(&ctrl->stats.hw_q_depth);
961 rsxx_complete_dma(ctrl, dma, DMA_CANCELLED);
962 push_tracker(ctrl->trackers, i);
963 cnt++;
964 }
965 }
966
967 return cnt;
968}
921 969
922void rsxx_dma_destroy(struct rsxx_cardinfo *card) 970void rsxx_dma_destroy(struct rsxx_cardinfo *card)
923{ 971{
924 struct rsxx_dma_ctrl *ctrl; 972 struct rsxx_dma_ctrl *ctrl;
925 struct rsxx_dma *dma; 973 int i;
926 int i, j;
927 int cnt = 0;
928 974
929 for (i = 0; i < card->n_targets; i++) { 975 for (i = 0; i < card->n_targets; i++) {
930 ctrl = &card->ctrl[i]; 976 ctrl = &card->ctrl[i];
@@ -943,33 +989,11 @@ void rsxx_dma_destroy(struct rsxx_cardinfo *card)
943 del_timer_sync(&ctrl->activity_timer); 989 del_timer_sync(&ctrl->activity_timer);
944 990
945 /* Clean up the DMA queue */ 991 /* Clean up the DMA queue */
946 spin_lock(&ctrl->queue_lock); 992 spin_lock_bh(&ctrl->queue_lock);
947 cnt = rsxx_cleanup_dma_queue(card, &ctrl->queue); 993 rsxx_cleanup_dma_queue(ctrl, &ctrl->queue);
948 spin_unlock(&ctrl->queue_lock); 994 spin_unlock_bh(&ctrl->queue_lock);
949
950 if (cnt)
951 dev_info(CARD_TO_DEV(card),
952 "Freed %d queued DMAs on channel %d\n",
953 cnt, i);
954
955 /* Clean up issued DMAs */
956 for (j = 0; j < RSXX_MAX_OUTSTANDING_CMDS; j++) {
957 dma = get_tracker_dma(ctrl->trackers, j);
958 if (dma) {
959 pci_unmap_page(card->dev, dma->dma_addr,
960 get_dma_size(dma),
961 (dma->cmd == HW_CMD_BLK_WRITE) ?
962 PCI_DMA_TODEVICE :
963 PCI_DMA_FROMDEVICE);
964 kmem_cache_free(rsxx_dma_pool, dma);
965 cnt++;
966 }
967 }
968 995
969 if (cnt) 996 rsxx_dma_cancel(ctrl);
970 dev_info(CARD_TO_DEV(card),
971 "Freed %d pending DMAs on channel %d\n",
972 cnt, i);
973 997
974 vfree(ctrl->trackers); 998 vfree(ctrl->trackers);
975 999
@@ -1013,7 +1037,7 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card)
1013 cnt++; 1037 cnt++;
1014 } 1038 }
1015 1039
1016 spin_lock(&card->ctrl[i].queue_lock); 1040 spin_lock_bh(&card->ctrl[i].queue_lock);
1017 list_splice(&issued_dmas[i], &card->ctrl[i].queue); 1041 list_splice(&issued_dmas[i], &card->ctrl[i].queue);
1018 1042
1019 atomic_sub(cnt, &card->ctrl[i].stats.hw_q_depth); 1043 atomic_sub(cnt, &card->ctrl[i].stats.hw_q_depth);
@@ -1028,7 +1052,7 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card)
1028 PCI_DMA_TODEVICE : 1052 PCI_DMA_TODEVICE :
1029 PCI_DMA_FROMDEVICE); 1053 PCI_DMA_FROMDEVICE);
1030 } 1054 }
1031 spin_unlock(&card->ctrl[i].queue_lock); 1055 spin_unlock_bh(&card->ctrl[i].queue_lock);
1032 } 1056 }
1033 1057
1034 kfree(issued_dmas); 1058 kfree(issued_dmas);
@@ -1036,30 +1060,13 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card)
1036 return 0; 1060 return 0;
1037} 1061}
1038 1062
1039void rsxx_eeh_cancel_dmas(struct rsxx_cardinfo *card)
1040{
1041 struct rsxx_dma *dma;
1042 struct rsxx_dma *tmp;
1043 int i;
1044
1045 for (i = 0; i < card->n_targets; i++) {
1046 spin_lock(&card->ctrl[i].queue_lock);
1047 list_for_each_entry_safe(dma, tmp, &card->ctrl[i].queue, list) {
1048 list_del(&dma->list);
1049
1050 rsxx_complete_dma(&card->ctrl[i], dma, DMA_CANCELLED);
1051 }
1052 spin_unlock(&card->ctrl[i].queue_lock);
1053 }
1054}
1055
1056int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card) 1063int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card)
1057{ 1064{
1058 struct rsxx_dma *dma; 1065 struct rsxx_dma *dma;
1059 int i; 1066 int i;
1060 1067
1061 for (i = 0; i < card->n_targets; i++) { 1068 for (i = 0; i < card->n_targets; i++) {
1062 spin_lock(&card->ctrl[i].queue_lock); 1069 spin_lock_bh(&card->ctrl[i].queue_lock);
1063 list_for_each_entry(dma, &card->ctrl[i].queue, list) { 1070 list_for_each_entry(dma, &card->ctrl[i].queue, list) {
1064 dma->dma_addr = pci_map_page(card->dev, dma->page, 1071 dma->dma_addr = pci_map_page(card->dev, dma->page,
1065 dma->pg_off, get_dma_size(dma), 1072 dma->pg_off, get_dma_size(dma),
@@ -1067,12 +1074,12 @@ int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card)
1067 PCI_DMA_TODEVICE : 1074 PCI_DMA_TODEVICE :
1068 PCI_DMA_FROMDEVICE); 1075 PCI_DMA_FROMDEVICE);
1069 if (!dma->dma_addr) { 1076 if (!dma->dma_addr) {
1070 spin_unlock(&card->ctrl[i].queue_lock); 1077 spin_unlock_bh(&card->ctrl[i].queue_lock);
1071 kmem_cache_free(rsxx_dma_pool, dma); 1078 kmem_cache_free(rsxx_dma_pool, dma);
1072 return -ENOMEM; 1079 return -ENOMEM;
1073 } 1080 }
1074 } 1081 }
1075 spin_unlock(&card->ctrl[i].queue_lock); 1082 spin_unlock_bh(&card->ctrl[i].queue_lock);
1076 } 1083 }
1077 1084
1078 return 0; 1085 return 0;
diff --git a/drivers/block/rsxx/rsxx_priv.h b/drivers/block/rsxx/rsxx_priv.h
index 382e8bf5c03b..5ad5055a4104 100644
--- a/drivers/block/rsxx/rsxx_priv.h
+++ b/drivers/block/rsxx/rsxx_priv.h
@@ -39,6 +39,7 @@
39#include <linux/vmalloc.h> 39#include <linux/vmalloc.h>
40#include <linux/timer.h> 40#include <linux/timer.h>
41#include <linux/ioctl.h> 41#include <linux/ioctl.h>
42#include <linux/delay.h>
42 43
43#include "rsxx.h" 44#include "rsxx.h"
44#include "rsxx_cfg.h" 45#include "rsxx_cfg.h"
@@ -114,6 +115,7 @@ struct rsxx_dma_ctrl {
114 struct timer_list activity_timer; 115 struct timer_list activity_timer;
115 struct dma_tracker_list *trackers; 116 struct dma_tracker_list *trackers;
116 struct rsxx_dma_stats stats; 117 struct rsxx_dma_stats stats;
118 struct mutex work_lock;
117}; 119};
118 120
119struct rsxx_cardinfo { 121struct rsxx_cardinfo {
@@ -134,6 +136,7 @@ struct rsxx_cardinfo {
134 spinlock_t lock; 136 spinlock_t lock;
135 bool active; 137 bool active;
136 struct creg_cmd *active_cmd; 138 struct creg_cmd *active_cmd;
139 struct workqueue_struct *creg_wq;
137 struct work_struct done_work; 140 struct work_struct done_work;
138 struct list_head queue; 141 struct list_head queue;
139 unsigned int q_depth; 142 unsigned int q_depth;
@@ -154,6 +157,7 @@ struct rsxx_cardinfo {
154 int buf_len; 157 int buf_len;
155 } log; 158 } log;
156 159
160 struct workqueue_struct *event_wq;
157 struct work_struct event_work; 161 struct work_struct event_work;
158 unsigned int state; 162 unsigned int state;
159 u64 size8; 163 u64 size8;
@@ -181,6 +185,8 @@ struct rsxx_cardinfo {
181 185
182 int n_targets; 186 int n_targets;
183 struct rsxx_dma_ctrl *ctrl; 187 struct rsxx_dma_ctrl *ctrl;
188
189 struct dentry *debugfs_dir;
184}; 190};
185 191
186enum rsxx_pci_regmap { 192enum rsxx_pci_regmap {
@@ -283,6 +289,7 @@ enum rsxx_creg_addr {
283 CREG_ADD_CAPABILITIES = 0x80001050, 289 CREG_ADD_CAPABILITIES = 0x80001050,
284 CREG_ADD_LOG = 0x80002000, 290 CREG_ADD_LOG = 0x80002000,
285 CREG_ADD_NUM_TARGETS = 0x80003000, 291 CREG_ADD_NUM_TARGETS = 0x80003000,
292 CREG_ADD_CRAM = 0xA0000000,
286 CREG_ADD_CONFIG = 0xB0000000, 293 CREG_ADD_CONFIG = 0xB0000000,
287}; 294};
288 295
@@ -372,6 +379,8 @@ typedef void (*rsxx_dma_cb)(struct rsxx_cardinfo *card,
372int rsxx_dma_setup(struct rsxx_cardinfo *card); 379int rsxx_dma_setup(struct rsxx_cardinfo *card);
373void rsxx_dma_destroy(struct rsxx_cardinfo *card); 380void rsxx_dma_destroy(struct rsxx_cardinfo *card);
374int rsxx_dma_init(void); 381int rsxx_dma_init(void);
382int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl, struct list_head *q);
383int rsxx_dma_cancel(struct rsxx_dma_ctrl *ctrl);
375void rsxx_dma_cleanup(void); 384void rsxx_dma_cleanup(void);
376void rsxx_dma_queue_reset(struct rsxx_cardinfo *card); 385void rsxx_dma_queue_reset(struct rsxx_cardinfo *card);
377int rsxx_dma_configure(struct rsxx_cardinfo *card); 386int rsxx_dma_configure(struct rsxx_cardinfo *card);
@@ -382,7 +391,6 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card,
382 void *cb_data); 391 void *cb_data);
383int rsxx_hw_buffers_init(struct pci_dev *dev, struct rsxx_dma_ctrl *ctrl); 392int rsxx_hw_buffers_init(struct pci_dev *dev, struct rsxx_dma_ctrl *ctrl);
384int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card); 393int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card);
385void rsxx_eeh_cancel_dmas(struct rsxx_cardinfo *card);
386int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card); 394int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card);
387 395
388/***** cregs.c *****/ 396/***** cregs.c *****/
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index dd5b2fed97e9..bf4b9d282c04 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -50,110 +50,118 @@
50#include "common.h" 50#include "common.h"
51 51
52/* 52/*
53 * These are rather arbitrary. They are fairly large because adjacent requests 53 * Maximum number of unused free pages to keep in the internal buffer.
54 * pulled from a communication ring are quite likely to end up being part of 54 * Setting this to a value too low will reduce memory used in each backend,
55 * the same scatter/gather request at the disc. 55 * but can have a performance penalty.
56 * 56 *
57 * ** TRY INCREASING 'xen_blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW ** 57 * A sane value is xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST, but can
58 * 58 * be set to a lower value that might degrade performance on some intensive
59 * This will increase the chances of being able to write whole tracks. 59 * IO workloads.
60 * 64 should be enough to keep us competitive with Linux.
61 */ 60 */
62static int xen_blkif_reqs = 64;
63module_param_named(reqs, xen_blkif_reqs, int, 0);
64MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");
65 61
66/* Run-time switchable: /sys/module/blkback/parameters/ */ 62static int xen_blkif_max_buffer_pages = 1024;
67static unsigned int log_stats; 63module_param_named(max_buffer_pages, xen_blkif_max_buffer_pages, int, 0644);
68module_param(log_stats, int, 0644); 64MODULE_PARM_DESC(max_buffer_pages,
65"Maximum number of free pages to keep in each block backend buffer");
69 66
70/* 67/*
71 * Each outstanding request that we've passed to the lower device layers has a 68 * Maximum number of grants to map persistently in blkback. For maximum
72 * 'pending_req' allocated to it. Each buffer_head that completes decrements 69 * performance this should be the total numbers of grants that can be used
73 * the pendcnt towards zero. When it hits zero, the specified domain has a 70 * to fill the ring, but since this might become too high, specially with
74 * response queued for it, with the saved 'id' passed back. 71 * the use of indirect descriptors, we set it to a value that provides good
72 * performance without using too much memory.
73 *
74 * When the list of persistent grants is full we clean it up using a LRU
75 * algorithm.
75 */ 76 */
76struct pending_req {
77 struct xen_blkif *blkif;
78 u64 id;
79 int nr_pages;
80 atomic_t pendcnt;
81 unsigned short operation;
82 int status;
83 struct list_head free_list;
84 DECLARE_BITMAP(unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
85};
86 77
87#define BLKBACK_INVALID_HANDLE (~0) 78static int xen_blkif_max_pgrants = 1056;
79module_param_named(max_persistent_grants, xen_blkif_max_pgrants, int, 0644);
80MODULE_PARM_DESC(max_persistent_grants,
81 "Maximum number of grants to map persistently");
88 82
89struct xen_blkbk { 83/*
90 struct pending_req *pending_reqs; 84 * The LRU mechanism to clean the lists of persistent grants needs to
91 /* List of all 'pending_req' available */ 85 * be executed periodically. The time interval between consecutive executions
92 struct list_head pending_free; 86 * of the purge mechanism is set in ms.
93 /* And its spinlock. */ 87 */
94 spinlock_t pending_free_lock; 88#define LRU_INTERVAL 100
95 wait_queue_head_t pending_free_wq;
96 /* The list of all pages that are available. */
97 struct page **pending_pages;
98 /* And the grant handles that are available. */
99 grant_handle_t *pending_grant_handles;
100};
101
102static struct xen_blkbk *blkbk;
103 89
104/* 90/*
105 * Maximum number of grant pages that can be mapped in blkback. 91 * When the persistent grants list is full we will remove unused grants
106 * BLKIF_MAX_SEGMENTS_PER_REQUEST * RING_SIZE is the maximum number of 92 * from the list. The percent number of grants to be removed at each LRU
107 * pages that blkback will persistently map. 93 * execution.
108 * Currently, this is:
109 * RING_SIZE = 32 (for all known ring types)
110 * BLKIF_MAX_SEGMENTS_PER_REQUEST = 11
111 * sizeof(struct persistent_gnt) = 48
112 * So the maximum memory used to store the grants is:
113 * 32 * 11 * 48 = 16896 bytes
114 */ 94 */
115static inline unsigned int max_mapped_grant_pages(enum blkif_protocol protocol) 95#define LRU_PERCENT_CLEAN 5
96
97/* Run-time switchable: /sys/module/blkback/parameters/ */
98static unsigned int log_stats;
99module_param(log_stats, int, 0644);
100
101#define BLKBACK_INVALID_HANDLE (~0)
102
103/* Number of free pages to remove on each call to free_xenballooned_pages */
104#define NUM_BATCH_FREE_PAGES 10
105
106static inline int get_free_page(struct xen_blkif *blkif, struct page **page)
116{ 107{
117 switch (protocol) { 108 unsigned long flags;
118 case BLKIF_PROTOCOL_NATIVE: 109
119 return __CONST_RING_SIZE(blkif, PAGE_SIZE) * 110 spin_lock_irqsave(&blkif->free_pages_lock, flags);
120 BLKIF_MAX_SEGMENTS_PER_REQUEST; 111 if (list_empty(&blkif->free_pages)) {
121 case BLKIF_PROTOCOL_X86_32: 112 BUG_ON(blkif->free_pages_num != 0);
122 return __CONST_RING_SIZE(blkif_x86_32, PAGE_SIZE) * 113 spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
123 BLKIF_MAX_SEGMENTS_PER_REQUEST; 114 return alloc_xenballooned_pages(1, page, false);
124 case BLKIF_PROTOCOL_X86_64:
125 return __CONST_RING_SIZE(blkif_x86_64, PAGE_SIZE) *
126 BLKIF_MAX_SEGMENTS_PER_REQUEST;
127 default:
128 BUG();
129 } 115 }
116 BUG_ON(blkif->free_pages_num == 0);
117 page[0] = list_first_entry(&blkif->free_pages, struct page, lru);
118 list_del(&page[0]->lru);
119 blkif->free_pages_num--;
120 spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
121
130 return 0; 122 return 0;
131} 123}
132 124
133 125static inline void put_free_pages(struct xen_blkif *blkif, struct page **page,
134/* 126 int num)
135 * Little helpful macro to figure out the index and virtual address of the
136 * pending_pages[..]. For each 'pending_req' we have have up to
137 * BLKIF_MAX_SEGMENTS_PER_REQUEST (11) pages. The seg would be from 0 through
138 * 10 and would index in the pending_pages[..].
139 */
140static inline int vaddr_pagenr(struct pending_req *req, int seg)
141{ 127{
142 return (req - blkbk->pending_reqs) * 128 unsigned long flags;
143 BLKIF_MAX_SEGMENTS_PER_REQUEST + seg; 129 int i;
144}
145 130
146#define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)] 131 spin_lock_irqsave(&blkif->free_pages_lock, flags);
132 for (i = 0; i < num; i++)
133 list_add(&page[i]->lru, &blkif->free_pages);
134 blkif->free_pages_num += num;
135 spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
136}
147 137
148static inline unsigned long vaddr(struct pending_req *req, int seg) 138static inline void shrink_free_pagepool(struct xen_blkif *blkif, int num)
149{ 139{
150 unsigned long pfn = page_to_pfn(blkbk->pending_page(req, seg)); 140 /* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */
151 return (unsigned long)pfn_to_kaddr(pfn); 141 struct page *page[NUM_BATCH_FREE_PAGES];
152} 142 unsigned int num_pages = 0;
143 unsigned long flags;
153 144
154#define pending_handle(_req, _seg) \ 145 spin_lock_irqsave(&blkif->free_pages_lock, flags);
155 (blkbk->pending_grant_handles[vaddr_pagenr(_req, _seg)]) 146 while (blkif->free_pages_num > num) {
147 BUG_ON(list_empty(&blkif->free_pages));
148 page[num_pages] = list_first_entry(&blkif->free_pages,
149 struct page, lru);
150 list_del(&page[num_pages]->lru);
151 blkif->free_pages_num--;
152 if (++num_pages == NUM_BATCH_FREE_PAGES) {
153 spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
154 free_xenballooned_pages(num_pages, page);
155 spin_lock_irqsave(&blkif->free_pages_lock, flags);
156 num_pages = 0;
157 }
158 }
159 spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
160 if (num_pages != 0)
161 free_xenballooned_pages(num_pages, page);
162}
156 163
164#define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page)))
157 165
158static int do_block_io_op(struct xen_blkif *blkif); 166static int do_block_io_op(struct xen_blkif *blkif);
159static int dispatch_rw_block_io(struct xen_blkif *blkif, 167static int dispatch_rw_block_io(struct xen_blkif *blkif,
@@ -170,13 +178,29 @@ static void make_response(struct xen_blkif *blkif, u64 id,
170 (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL) 178 (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL)
171 179
172 180
173static void add_persistent_gnt(struct rb_root *root, 181/*
182 * We don't need locking around the persistent grant helpers
183 * because blkback uses a single-thread for each backed, so we
184 * can be sure that this functions will never be called recursively.
185 *
186 * The only exception to that is put_persistent_grant, that can be called
187 * from interrupt context (by xen_blkbk_unmap), so we have to use atomic
188 * bit operations to modify the flags of a persistent grant and to count
189 * the number of used grants.
190 */
191static int add_persistent_gnt(struct xen_blkif *blkif,
174 struct persistent_gnt *persistent_gnt) 192 struct persistent_gnt *persistent_gnt)
175{ 193{
176 struct rb_node **new = &(root->rb_node), *parent = NULL; 194 struct rb_node **new = NULL, *parent = NULL;
177 struct persistent_gnt *this; 195 struct persistent_gnt *this;
178 196
197 if (blkif->persistent_gnt_c >= xen_blkif_max_pgrants) {
198 if (!blkif->vbd.overflow_max_grants)
199 blkif->vbd.overflow_max_grants = 1;
200 return -EBUSY;
201 }
179 /* Figure out where to put new node */ 202 /* Figure out where to put new node */
203 new = &blkif->persistent_gnts.rb_node;
180 while (*new) { 204 while (*new) {
181 this = container_of(*new, struct persistent_gnt, node); 205 this = container_of(*new, struct persistent_gnt, node);
182 206
@@ -186,22 +210,28 @@ static void add_persistent_gnt(struct rb_root *root,
186 else if (persistent_gnt->gnt > this->gnt) 210 else if (persistent_gnt->gnt > this->gnt)
187 new = &((*new)->rb_right); 211 new = &((*new)->rb_right);
188 else { 212 else {
189 pr_alert(DRV_PFX " trying to add a gref that's already in the tree\n"); 213 pr_alert_ratelimited(DRV_PFX " trying to add a gref that's already in the tree\n");
190 BUG(); 214 return -EINVAL;
191 } 215 }
192 } 216 }
193 217
218 bitmap_zero(persistent_gnt->flags, PERSISTENT_GNT_FLAGS_SIZE);
219 set_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
194 /* Add new node and rebalance tree. */ 220 /* Add new node and rebalance tree. */
195 rb_link_node(&(persistent_gnt->node), parent, new); 221 rb_link_node(&(persistent_gnt->node), parent, new);
196 rb_insert_color(&(persistent_gnt->node), root); 222 rb_insert_color(&(persistent_gnt->node), &blkif->persistent_gnts);
223 blkif->persistent_gnt_c++;
224 atomic_inc(&blkif->persistent_gnt_in_use);
225 return 0;
197} 226}
198 227
199static struct persistent_gnt *get_persistent_gnt(struct rb_root *root, 228static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif,
200 grant_ref_t gref) 229 grant_ref_t gref)
201{ 230{
202 struct persistent_gnt *data; 231 struct persistent_gnt *data;
203 struct rb_node *node = root->rb_node; 232 struct rb_node *node = NULL;
204 233
234 node = blkif->persistent_gnts.rb_node;
205 while (node) { 235 while (node) {
206 data = container_of(node, struct persistent_gnt, node); 236 data = container_of(node, struct persistent_gnt, node);
207 237
@@ -209,13 +239,31 @@ static struct persistent_gnt *get_persistent_gnt(struct rb_root *root,
209 node = node->rb_left; 239 node = node->rb_left;
210 else if (gref > data->gnt) 240 else if (gref > data->gnt)
211 node = node->rb_right; 241 node = node->rb_right;
212 else 242 else {
243 if(test_bit(PERSISTENT_GNT_ACTIVE, data->flags)) {
244 pr_alert_ratelimited(DRV_PFX " requesting a grant already in use\n");
245 return NULL;
246 }
247 set_bit(PERSISTENT_GNT_ACTIVE, data->flags);
248 atomic_inc(&blkif->persistent_gnt_in_use);
213 return data; 249 return data;
250 }
214 } 251 }
215 return NULL; 252 return NULL;
216} 253}
217 254
218static void free_persistent_gnts(struct rb_root *root, unsigned int num) 255static void put_persistent_gnt(struct xen_blkif *blkif,
256 struct persistent_gnt *persistent_gnt)
257{
258 if(!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags))
259 pr_alert_ratelimited(DRV_PFX " freeing a grant already unused");
260 set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags);
261 clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
262 atomic_dec(&blkif->persistent_gnt_in_use);
263}
264
265static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root,
266 unsigned int num)
219{ 267{
220 struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 268 struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
221 struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 269 struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
@@ -240,7 +288,7 @@ static void free_persistent_gnts(struct rb_root *root, unsigned int num)
240 ret = gnttab_unmap_refs(unmap, NULL, pages, 288 ret = gnttab_unmap_refs(unmap, NULL, pages,
241 segs_to_unmap); 289 segs_to_unmap);
242 BUG_ON(ret); 290 BUG_ON(ret);
243 free_xenballooned_pages(segs_to_unmap, pages); 291 put_free_pages(blkif, pages, segs_to_unmap);
244 segs_to_unmap = 0; 292 segs_to_unmap = 0;
245 } 293 }
246 294
@@ -251,21 +299,148 @@ static void free_persistent_gnts(struct rb_root *root, unsigned int num)
251 BUG_ON(num != 0); 299 BUG_ON(num != 0);
252} 300}
253 301
302static void unmap_purged_grants(struct work_struct *work)
303{
304 struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
305 struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
306 struct persistent_gnt *persistent_gnt;
307 int ret, segs_to_unmap = 0;
308 struct xen_blkif *blkif = container_of(work, typeof(*blkif), persistent_purge_work);
309
310 while(!list_empty(&blkif->persistent_purge_list)) {
311 persistent_gnt = list_first_entry(&blkif->persistent_purge_list,
312 struct persistent_gnt,
313 remove_node);
314 list_del(&persistent_gnt->remove_node);
315
316 gnttab_set_unmap_op(&unmap[segs_to_unmap],
317 vaddr(persistent_gnt->page),
318 GNTMAP_host_map,
319 persistent_gnt->handle);
320
321 pages[segs_to_unmap] = persistent_gnt->page;
322
323 if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
324 ret = gnttab_unmap_refs(unmap, NULL, pages,
325 segs_to_unmap);
326 BUG_ON(ret);
327 put_free_pages(blkif, pages, segs_to_unmap);
328 segs_to_unmap = 0;
329 }
330 kfree(persistent_gnt);
331 }
332 if (segs_to_unmap > 0) {
333 ret = gnttab_unmap_refs(unmap, NULL, pages, segs_to_unmap);
334 BUG_ON(ret);
335 put_free_pages(blkif, pages, segs_to_unmap);
336 }
337}
338
339static void purge_persistent_gnt(struct xen_blkif *blkif)
340{
341 struct persistent_gnt *persistent_gnt;
342 struct rb_node *n;
343 unsigned int num_clean, total;
344 bool scan_used = false, clean_used = false;
345 struct rb_root *root;
346
347 if (blkif->persistent_gnt_c < xen_blkif_max_pgrants ||
348 (blkif->persistent_gnt_c == xen_blkif_max_pgrants &&
349 !blkif->vbd.overflow_max_grants)) {
350 return;
351 }
352
353 if (work_pending(&blkif->persistent_purge_work)) {
354 pr_alert_ratelimited(DRV_PFX "Scheduled work from previous purge is still pending, cannot purge list\n");
355 return;
356 }
357
358 num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN;
359 num_clean = blkif->persistent_gnt_c - xen_blkif_max_pgrants + num_clean;
360 num_clean = min(blkif->persistent_gnt_c, num_clean);
361 if ((num_clean == 0) ||
362 (num_clean > (blkif->persistent_gnt_c - atomic_read(&blkif->persistent_gnt_in_use))))
363 return;
364
365 /*
366 * At this point, we can assure that there will be no calls
367 * to get_persistent_grant (because we are executing this code from
368 * xen_blkif_schedule), there can only be calls to put_persistent_gnt,
369 * which means that the number of currently used grants will go down,
370 * but never up, so we will always be able to remove the requested
371 * number of grants.
372 */
373
374 total = num_clean;
375
376 pr_debug(DRV_PFX "Going to purge %u persistent grants\n", num_clean);
377
378 INIT_LIST_HEAD(&blkif->persistent_purge_list);
379 root = &blkif->persistent_gnts;
380purge_list:
381 foreach_grant_safe(persistent_gnt, n, root, node) {
382 BUG_ON(persistent_gnt->handle ==
383 BLKBACK_INVALID_HANDLE);
384
385 if (clean_used) {
386 clear_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags);
387 continue;
388 }
389
390 if (test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags))
391 continue;
392 if (!scan_used &&
393 (test_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags)))
394 continue;
395
396 rb_erase(&persistent_gnt->node, root);
397 list_add(&persistent_gnt->remove_node,
398 &blkif->persistent_purge_list);
399 if (--num_clean == 0)
400 goto finished;
401 }
402 /*
403 * If we get here it means we also need to start cleaning
404 * grants that were used since last purge in order to cope
405 * with the requested num
406 */
407 if (!scan_used && !clean_used) {
408 pr_debug(DRV_PFX "Still missing %u purged frames\n", num_clean);
409 scan_used = true;
410 goto purge_list;
411 }
412finished:
413 if (!clean_used) {
414 pr_debug(DRV_PFX "Finished scanning for grants to clean, removing used flag\n");
415 clean_used = true;
416 goto purge_list;
417 }
418
419 blkif->persistent_gnt_c -= (total - num_clean);
420 blkif->vbd.overflow_max_grants = 0;
421
422 /* We can defer this work */
423 INIT_WORK(&blkif->persistent_purge_work, unmap_purged_grants);
424 schedule_work(&blkif->persistent_purge_work);
425 pr_debug(DRV_PFX "Purged %u/%u\n", (total - num_clean), total);
426 return;
427}
428
254/* 429/*
255 * Retrieve from the 'pending_reqs' a free pending_req structure to be used. 430 * Retrieve from the 'pending_reqs' a free pending_req structure to be used.
256 */ 431 */
257static struct pending_req *alloc_req(void) 432static struct pending_req *alloc_req(struct xen_blkif *blkif)
258{ 433{
259 struct pending_req *req = NULL; 434 struct pending_req *req = NULL;
260 unsigned long flags; 435 unsigned long flags;
261 436
262 spin_lock_irqsave(&blkbk->pending_free_lock, flags); 437 spin_lock_irqsave(&blkif->pending_free_lock, flags);
263 if (!list_empty(&blkbk->pending_free)) { 438 if (!list_empty(&blkif->pending_free)) {
264 req = list_entry(blkbk->pending_free.next, struct pending_req, 439 req = list_entry(blkif->pending_free.next, struct pending_req,
265 free_list); 440 free_list);
266 list_del(&req->free_list); 441 list_del(&req->free_list);
267 } 442 }
268 spin_unlock_irqrestore(&blkbk->pending_free_lock, flags); 443 spin_unlock_irqrestore(&blkif->pending_free_lock, flags);
269 return req; 444 return req;
270} 445}
271 446
@@ -273,17 +448,17 @@ static struct pending_req *alloc_req(void)
273 * Return the 'pending_req' structure back to the freepool. We also 448 * Return the 'pending_req' structure back to the freepool. We also
274 * wake up the thread if it was waiting for a free page. 449 * wake up the thread if it was waiting for a free page.
275 */ 450 */
276static void free_req(struct pending_req *req) 451static void free_req(struct xen_blkif *blkif, struct pending_req *req)
277{ 452{
278 unsigned long flags; 453 unsigned long flags;
279 int was_empty; 454 int was_empty;
280 455
281 spin_lock_irqsave(&blkbk->pending_free_lock, flags); 456 spin_lock_irqsave(&blkif->pending_free_lock, flags);
282 was_empty = list_empty(&blkbk->pending_free); 457 was_empty = list_empty(&blkif->pending_free);
283 list_add(&req->free_list, &blkbk->pending_free); 458 list_add(&req->free_list, &blkif->pending_free);
284 spin_unlock_irqrestore(&blkbk->pending_free_lock, flags); 459 spin_unlock_irqrestore(&blkif->pending_free_lock, flags);
285 if (was_empty) 460 if (was_empty)
286 wake_up(&blkbk->pending_free_wq); 461 wake_up(&blkif->pending_free_wq);
287} 462}
288 463
289/* 464/*
@@ -382,10 +557,12 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
382static void print_stats(struct xen_blkif *blkif) 557static void print_stats(struct xen_blkif *blkif)
383{ 558{
384 pr_info("xen-blkback (%s): oo %3llu | rd %4llu | wr %4llu | f %4llu" 559 pr_info("xen-blkback (%s): oo %3llu | rd %4llu | wr %4llu | f %4llu"
385 " | ds %4llu\n", 560 " | ds %4llu | pg: %4u/%4d\n",
386 current->comm, blkif->st_oo_req, 561 current->comm, blkif->st_oo_req,
387 blkif->st_rd_req, blkif->st_wr_req, 562 blkif->st_rd_req, blkif->st_wr_req,
388 blkif->st_f_req, blkif->st_ds_req); 563 blkif->st_f_req, blkif->st_ds_req,
564 blkif->persistent_gnt_c,
565 xen_blkif_max_pgrants);
389 blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); 566 blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
390 blkif->st_rd_req = 0; 567 blkif->st_rd_req = 0;
391 blkif->st_wr_req = 0; 568 blkif->st_wr_req = 0;
@@ -397,6 +574,8 @@ int xen_blkif_schedule(void *arg)
397{ 574{
398 struct xen_blkif *blkif = arg; 575 struct xen_blkif *blkif = arg;
399 struct xen_vbd *vbd = &blkif->vbd; 576 struct xen_vbd *vbd = &blkif->vbd;
577 unsigned long timeout;
578 int ret;
400 579
401 xen_blkif_get(blkif); 580 xen_blkif_get(blkif);
402 581
@@ -406,27 +585,52 @@ int xen_blkif_schedule(void *arg)
406 if (unlikely(vbd->size != vbd_sz(vbd))) 585 if (unlikely(vbd->size != vbd_sz(vbd)))
407 xen_vbd_resize(blkif); 586 xen_vbd_resize(blkif);
408 587
409 wait_event_interruptible( 588 timeout = msecs_to_jiffies(LRU_INTERVAL);
589
590 timeout = wait_event_interruptible_timeout(
410 blkif->wq, 591 blkif->wq,
411 blkif->waiting_reqs || kthread_should_stop()); 592 blkif->waiting_reqs || kthread_should_stop(),
412 wait_event_interruptible( 593 timeout);
413 blkbk->pending_free_wq, 594 if (timeout == 0)
414 !list_empty(&blkbk->pending_free) || 595 goto purge_gnt_list;
415 kthread_should_stop()); 596 timeout = wait_event_interruptible_timeout(
597 blkif->pending_free_wq,
598 !list_empty(&blkif->pending_free) ||
599 kthread_should_stop(),
600 timeout);
601 if (timeout == 0)
602 goto purge_gnt_list;
416 603
417 blkif->waiting_reqs = 0; 604 blkif->waiting_reqs = 0;
418 smp_mb(); /* clear flag *before* checking for work */ 605 smp_mb(); /* clear flag *before* checking for work */
419 606
420 if (do_block_io_op(blkif)) 607 ret = do_block_io_op(blkif);
608 if (ret > 0)
421 blkif->waiting_reqs = 1; 609 blkif->waiting_reqs = 1;
610 if (ret == -EACCES)
611 wait_event_interruptible(blkif->shutdown_wq,
612 kthread_should_stop());
613
614purge_gnt_list:
615 if (blkif->vbd.feature_gnt_persistent &&
616 time_after(jiffies, blkif->next_lru)) {
617 purge_persistent_gnt(blkif);
618 blkif->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL);
619 }
620
621 /* Shrink if we have more than xen_blkif_max_buffer_pages */
622 shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages);
422 623
423 if (log_stats && time_after(jiffies, blkif->st_print)) 624 if (log_stats && time_after(jiffies, blkif->st_print))
424 print_stats(blkif); 625 print_stats(blkif);
425 } 626 }
426 627
628 /* Since we are shutting down remove all pages from the buffer */
629 shrink_free_pagepool(blkif, 0 /* All */);
630
427 /* Free all persistent grant pages */ 631 /* Free all persistent grant pages */
428 if (!RB_EMPTY_ROOT(&blkif->persistent_gnts)) 632 if (!RB_EMPTY_ROOT(&blkif->persistent_gnts))
429 free_persistent_gnts(&blkif->persistent_gnts, 633 free_persistent_gnts(blkif, &blkif->persistent_gnts,
430 blkif->persistent_gnt_c); 634 blkif->persistent_gnt_c);
431 635
432 BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); 636 BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts));
@@ -441,148 +645,98 @@ int xen_blkif_schedule(void *arg)
441 return 0; 645 return 0;
442} 646}
443 647
444struct seg_buf {
445 unsigned int offset;
446 unsigned int nsec;
447};
448/* 648/*
449 * Unmap the grant references, and also remove the M2P over-rides 649 * Unmap the grant references, and also remove the M2P over-rides
450 * used in the 'pending_req'. 650 * used in the 'pending_req'.
451 */ 651 */
452static void xen_blkbk_unmap(struct pending_req *req) 652static void xen_blkbk_unmap(struct xen_blkif *blkif,
653 struct grant_page *pages[],
654 int num)
453{ 655{
454 struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 656 struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
455 struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 657 struct page *unmap_pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
456 unsigned int i, invcount = 0; 658 unsigned int i, invcount = 0;
457 grant_handle_t handle;
458 int ret; 659 int ret;
459 660
460 for (i = 0; i < req->nr_pages; i++) { 661 for (i = 0; i < num; i++) {
461 if (!test_bit(i, req->unmap_seg)) 662 if (pages[i]->persistent_gnt != NULL) {
663 put_persistent_gnt(blkif, pages[i]->persistent_gnt);
462 continue; 664 continue;
463 handle = pending_handle(req, i); 665 }
464 if (handle == BLKBACK_INVALID_HANDLE) 666 if (pages[i]->handle == BLKBACK_INVALID_HANDLE)
465 continue; 667 continue;
466 gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i), 668 unmap_pages[invcount] = pages[i]->page;
467 GNTMAP_host_map, handle); 669 gnttab_set_unmap_op(&unmap[invcount], vaddr(pages[i]->page),
468 pending_handle(req, i) = BLKBACK_INVALID_HANDLE; 670 GNTMAP_host_map, pages[i]->handle);
469 pages[invcount] = virt_to_page(vaddr(req, i)); 671 pages[i]->handle = BLKBACK_INVALID_HANDLE;
470 invcount++; 672 if (++invcount == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
673 ret = gnttab_unmap_refs(unmap, NULL, unmap_pages,
674 invcount);
675 BUG_ON(ret);
676 put_free_pages(blkif, unmap_pages, invcount);
677 invcount = 0;
678 }
679 }
680 if (invcount) {
681 ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount);
682 BUG_ON(ret);
683 put_free_pages(blkif, unmap_pages, invcount);
471 } 684 }
472
473 ret = gnttab_unmap_refs(unmap, NULL, pages, invcount);
474 BUG_ON(ret);
475} 685}
476 686
477static int xen_blkbk_map(struct blkif_request *req, 687static int xen_blkbk_map(struct xen_blkif *blkif,
478 struct pending_req *pending_req, 688 struct grant_page *pages[],
479 struct seg_buf seg[], 689 int num, bool ro)
480 struct page *pages[])
481{ 690{
482 struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 691 struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
483 struct persistent_gnt *persistent_gnts[BLKIF_MAX_SEGMENTS_PER_REQUEST];
484 struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 692 struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST];
485 struct persistent_gnt *persistent_gnt = NULL; 693 struct persistent_gnt *persistent_gnt = NULL;
486 struct xen_blkif *blkif = pending_req->blkif;
487 phys_addr_t addr = 0; 694 phys_addr_t addr = 0;
488 int i, j; 695 int i, seg_idx, new_map_idx;
489 bool new_map;
490 int nseg = req->u.rw.nr_segments;
491 int segs_to_map = 0; 696 int segs_to_map = 0;
492 int ret = 0; 697 int ret = 0;
698 int last_map = 0, map_until = 0;
493 int use_persistent_gnts; 699 int use_persistent_gnts;
494 700
495 use_persistent_gnts = (blkif->vbd.feature_gnt_persistent); 701 use_persistent_gnts = (blkif->vbd.feature_gnt_persistent);
496 702
497 BUG_ON(blkif->persistent_gnt_c >
498 max_mapped_grant_pages(pending_req->blkif->blk_protocol));
499
500 /* 703 /*
501 * Fill out preq.nr_sects with proper amount of sectors, and setup 704 * Fill out preq.nr_sects with proper amount of sectors, and setup
502 * assign map[..] with the PFN of the page in our domain with the 705 * assign map[..] with the PFN of the page in our domain with the
503 * corresponding grant reference for each page. 706 * corresponding grant reference for each page.
504 */ 707 */
505 for (i = 0; i < nseg; i++) { 708again:
709 for (i = map_until; i < num; i++) {
506 uint32_t flags; 710 uint32_t flags;
507 711
508 if (use_persistent_gnts) 712 if (use_persistent_gnts)
509 persistent_gnt = get_persistent_gnt( 713 persistent_gnt = get_persistent_gnt(
510 &blkif->persistent_gnts, 714 blkif,
511 req->u.rw.seg[i].gref); 715 pages[i]->gref);
512 716
513 if (persistent_gnt) { 717 if (persistent_gnt) {
514 /* 718 /*
515 * We are using persistent grants and 719 * We are using persistent grants and
516 * the grant is already mapped 720 * the grant is already mapped
517 */ 721 */
518 new_map = false; 722 pages[i]->page = persistent_gnt->page;
519 } else if (use_persistent_gnts && 723 pages[i]->persistent_gnt = persistent_gnt;
520 blkif->persistent_gnt_c <
521 max_mapped_grant_pages(blkif->blk_protocol)) {
522 /*
523 * We are using persistent grants, the grant is
524 * not mapped but we have room for it
525 */
526 new_map = true;
527 persistent_gnt = kmalloc(
528 sizeof(struct persistent_gnt),
529 GFP_KERNEL);
530 if (!persistent_gnt)
531 return -ENOMEM;
532 if (alloc_xenballooned_pages(1, &persistent_gnt->page,
533 false)) {
534 kfree(persistent_gnt);
535 return -ENOMEM;
536 }
537 persistent_gnt->gnt = req->u.rw.seg[i].gref;
538 persistent_gnt->handle = BLKBACK_INVALID_HANDLE;
539
540 pages_to_gnt[segs_to_map] =
541 persistent_gnt->page;
542 addr = (unsigned long) pfn_to_kaddr(
543 page_to_pfn(persistent_gnt->page));
544
545 add_persistent_gnt(&blkif->persistent_gnts,
546 persistent_gnt);
547 blkif->persistent_gnt_c++;
548 pr_debug(DRV_PFX " grant %u added to the tree of persistent grants, using %u/%u\n",
549 persistent_gnt->gnt, blkif->persistent_gnt_c,
550 max_mapped_grant_pages(blkif->blk_protocol));
551 } else { 724 } else {
552 /* 725 if (get_free_page(blkif, &pages[i]->page))
553 * We are either using persistent grants and 726 goto out_of_memory;
554 * hit the maximum limit of grants mapped, 727 addr = vaddr(pages[i]->page);
555 * or we are not using persistent grants. 728 pages_to_gnt[segs_to_map] = pages[i]->page;
556 */ 729 pages[i]->persistent_gnt = NULL;
557 if (use_persistent_gnts &&
558 !blkif->vbd.overflow_max_grants) {
559 blkif->vbd.overflow_max_grants = 1;
560 pr_alert(DRV_PFX " domain %u, device %#x is using maximum number of persistent grants\n",
561 blkif->domid, blkif->vbd.handle);
562 }
563 new_map = true;
564 pages[i] = blkbk->pending_page(pending_req, i);
565 addr = vaddr(pending_req, i);
566 pages_to_gnt[segs_to_map] =
567 blkbk->pending_page(pending_req, i);
568 }
569
570 if (persistent_gnt) {
571 pages[i] = persistent_gnt->page;
572 persistent_gnts[i] = persistent_gnt;
573 } else {
574 persistent_gnts[i] = NULL;
575 }
576
577 if (new_map) {
578 flags = GNTMAP_host_map; 730 flags = GNTMAP_host_map;
579 if (!persistent_gnt && 731 if (!use_persistent_gnts && ro)
580 (pending_req->operation != BLKIF_OP_READ))
581 flags |= GNTMAP_readonly; 732 flags |= GNTMAP_readonly;
582 gnttab_set_map_op(&map[segs_to_map++], addr, 733 gnttab_set_map_op(&map[segs_to_map++], addr,
583 flags, req->u.rw.seg[i].gref, 734 flags, pages[i]->gref,
584 blkif->domid); 735 blkif->domid);
585 } 736 }
737 map_until = i + 1;
738 if (segs_to_map == BLKIF_MAX_SEGMENTS_PER_REQUEST)
739 break;
586 } 740 }
587 741
588 if (segs_to_map) { 742 if (segs_to_map) {
@@ -595,49 +749,133 @@ static int xen_blkbk_map(struct blkif_request *req,
595 * so that when we access vaddr(pending_req,i) it has the contents of 749 * so that when we access vaddr(pending_req,i) it has the contents of
596 * the page from the other domain. 750 * the page from the other domain.
597 */ 751 */
598 bitmap_zero(pending_req->unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST); 752 for (seg_idx = last_map, new_map_idx = 0; seg_idx < map_until; seg_idx++) {
599 for (i = 0, j = 0; i < nseg; i++) { 753 if (!pages[seg_idx]->persistent_gnt) {
600 if (!persistent_gnts[i] ||
601 persistent_gnts[i]->handle == BLKBACK_INVALID_HANDLE) {
602 /* This is a newly mapped grant */ 754 /* This is a newly mapped grant */
603 BUG_ON(j >= segs_to_map); 755 BUG_ON(new_map_idx >= segs_to_map);
604 if (unlikely(map[j].status != 0)) { 756 if (unlikely(map[new_map_idx].status != 0)) {
605 pr_debug(DRV_PFX "invalid buffer -- could not remap it\n"); 757 pr_debug(DRV_PFX "invalid buffer -- could not remap it\n");
606 map[j].handle = BLKBACK_INVALID_HANDLE; 758 pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE;
607 ret |= 1; 759 ret |= 1;
608 if (persistent_gnts[i]) { 760 goto next;
609 rb_erase(&persistent_gnts[i]->node,
610 &blkif->persistent_gnts);
611 blkif->persistent_gnt_c--;
612 kfree(persistent_gnts[i]);
613 persistent_gnts[i] = NULL;
614 }
615 } 761 }
762 pages[seg_idx]->handle = map[new_map_idx].handle;
763 } else {
764 continue;
616 } 765 }
617 if (persistent_gnts[i]) { 766 if (use_persistent_gnts &&
618 if (persistent_gnts[i]->handle == 767 blkif->persistent_gnt_c < xen_blkif_max_pgrants) {
619 BLKBACK_INVALID_HANDLE) { 768 /*
769 * We are using persistent grants, the grant is
770 * not mapped but we might have room for it.
771 */
772 persistent_gnt = kmalloc(sizeof(struct persistent_gnt),
773 GFP_KERNEL);
774 if (!persistent_gnt) {
620 /* 775 /*
621 * If this is a new persistent grant 776 * If we don't have enough memory to
622 * save the handler 777 * allocate the persistent_gnt struct
778 * map this grant non-persistenly
623 */ 779 */
624 persistent_gnts[i]->handle = map[j++].handle; 780 goto next;
625 } 781 }
626 pending_handle(pending_req, i) = 782 persistent_gnt->gnt = map[new_map_idx].ref;
627 persistent_gnts[i]->handle; 783 persistent_gnt->handle = map[new_map_idx].handle;
784 persistent_gnt->page = pages[seg_idx]->page;
785 if (add_persistent_gnt(blkif,
786 persistent_gnt)) {
787 kfree(persistent_gnt);
788 persistent_gnt = NULL;
789 goto next;
790 }
791 pages[seg_idx]->persistent_gnt = persistent_gnt;
792 pr_debug(DRV_PFX " grant %u added to the tree of persistent grants, using %u/%u\n",
793 persistent_gnt->gnt, blkif->persistent_gnt_c,
794 xen_blkif_max_pgrants);
795 goto next;
796 }
797 if (use_persistent_gnts && !blkif->vbd.overflow_max_grants) {
798 blkif->vbd.overflow_max_grants = 1;
799 pr_debug(DRV_PFX " domain %u, device %#x is using maximum number of persistent grants\n",
800 blkif->domid, blkif->vbd.handle);
801 }
802 /*
803 * We could not map this grant persistently, so use it as
804 * a non-persistent grant.
805 */
806next:
807 new_map_idx++;
808 }
809 segs_to_map = 0;
810 last_map = map_until;
811 if (map_until != num)
812 goto again;
628 813
629 if (ret) 814 return ret;
630 continue; 815
631 } else { 816out_of_memory:
632 pending_handle(pending_req, i) = map[j++].handle; 817 pr_alert(DRV_PFX "%s: out of memory\n", __func__);
633 bitmap_set(pending_req->unmap_seg, i, 1); 818 put_free_pages(blkif, pages_to_gnt, segs_to_map);
819 return -ENOMEM;
820}
821
822static int xen_blkbk_map_seg(struct pending_req *pending_req)
823{
824 int rc;
825
826 rc = xen_blkbk_map(pending_req->blkif, pending_req->segments,
827 pending_req->nr_pages,
828 (pending_req->operation != BLKIF_OP_READ));
829
830 return rc;
831}
634 832
635 if (ret) 833static int xen_blkbk_parse_indirect(struct blkif_request *req,
636 continue; 834 struct pending_req *pending_req,
835 struct seg_buf seg[],
836 struct phys_req *preq)
837{
838 struct grant_page **pages = pending_req->indirect_pages;
839 struct xen_blkif *blkif = pending_req->blkif;
840 int indirect_grefs, rc, n, nseg, i;
841 struct blkif_request_segment_aligned *segments = NULL;
842
843 nseg = pending_req->nr_pages;
844 indirect_grefs = INDIRECT_PAGES(nseg);
845 BUG_ON(indirect_grefs > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);
846
847 for (i = 0; i < indirect_grefs; i++)
848 pages[i]->gref = req->u.indirect.indirect_grefs[i];
849
850 rc = xen_blkbk_map(blkif, pages, indirect_grefs, true);
851 if (rc)
852 goto unmap;
853
854 for (n = 0, i = 0; n < nseg; n++) {
855 if ((n % SEGS_PER_INDIRECT_FRAME) == 0) {
856 /* Map indirect segments */
857 if (segments)
858 kunmap_atomic(segments);
859 segments = kmap_atomic(pages[n/SEGS_PER_INDIRECT_FRAME]->page);
860 }
861 i = n % SEGS_PER_INDIRECT_FRAME;
862 pending_req->segments[n]->gref = segments[i].gref;
863 seg[n].nsec = segments[i].last_sect -
864 segments[i].first_sect + 1;
865 seg[n].offset = (segments[i].first_sect << 9);
866 if ((segments[i].last_sect >= (PAGE_SIZE >> 9)) ||
867 (segments[i].last_sect < segments[i].first_sect)) {
868 rc = -EINVAL;
869 goto unmap;
637 } 870 }
638 seg[i].offset = (req->u.rw.seg[i].first_sect << 9); 871 preq->nr_sects += seg[n].nsec;
639 } 872 }
640 return ret; 873
874unmap:
875 if (segments)
876 kunmap_atomic(segments);
877 xen_blkbk_unmap(blkif, pages, indirect_grefs);
878 return rc;
641} 879}
642 880
643static int dispatch_discard_io(struct xen_blkif *blkif, 881static int dispatch_discard_io(struct xen_blkif *blkif,
@@ -647,7 +885,18 @@ static int dispatch_discard_io(struct xen_blkif *blkif,
647 int status = BLKIF_RSP_OKAY; 885 int status = BLKIF_RSP_OKAY;
648 struct block_device *bdev = blkif->vbd.bdev; 886 struct block_device *bdev = blkif->vbd.bdev;
649 unsigned long secure; 887 unsigned long secure;
888 struct phys_req preq;
889
890 preq.sector_number = req->u.discard.sector_number;
891 preq.nr_sects = req->u.discard.nr_sectors;
650 892
893 err = xen_vbd_translate(&preq, blkif, WRITE);
894 if (err) {
895 pr_warn(DRV_PFX "access denied: DISCARD [%llu->%llu] on dev=%04x\n",
896 preq.sector_number,
897 preq.sector_number + preq.nr_sects, blkif->vbd.pdevice);
898 goto fail_response;
899 }
651 blkif->st_ds_req++; 900 blkif->st_ds_req++;
652 901
653 xen_blkif_get(blkif); 902 xen_blkif_get(blkif);
@@ -658,7 +907,7 @@ static int dispatch_discard_io(struct xen_blkif *blkif,
658 err = blkdev_issue_discard(bdev, req->u.discard.sector_number, 907 err = blkdev_issue_discard(bdev, req->u.discard.sector_number,
659 req->u.discard.nr_sectors, 908 req->u.discard.nr_sectors,
660 GFP_KERNEL, secure); 909 GFP_KERNEL, secure);
661 910fail_response:
662 if (err == -EOPNOTSUPP) { 911 if (err == -EOPNOTSUPP) {
663 pr_debug(DRV_PFX "discard op failed, not supported\n"); 912 pr_debug(DRV_PFX "discard op failed, not supported\n");
664 status = BLKIF_RSP_EOPNOTSUPP; 913 status = BLKIF_RSP_EOPNOTSUPP;
@@ -674,7 +923,7 @@ static int dispatch_other_io(struct xen_blkif *blkif,
674 struct blkif_request *req, 923 struct blkif_request *req,
675 struct pending_req *pending_req) 924 struct pending_req *pending_req)
676{ 925{
677 free_req(pending_req); 926 free_req(blkif, pending_req);
678 make_response(blkif, req->u.other.id, req->operation, 927 make_response(blkif, req->u.other.id, req->operation,
679 BLKIF_RSP_EOPNOTSUPP); 928 BLKIF_RSP_EOPNOTSUPP);
680 return -EIO; 929 return -EIO;
@@ -726,7 +975,9 @@ static void __end_block_io_op(struct pending_req *pending_req, int error)
726 * the proper response on the ring. 975 * the proper response on the ring.
727 */ 976 */
728 if (atomic_dec_and_test(&pending_req->pendcnt)) { 977 if (atomic_dec_and_test(&pending_req->pendcnt)) {
729 xen_blkbk_unmap(pending_req); 978 xen_blkbk_unmap(pending_req->blkif,
979 pending_req->segments,
980 pending_req->nr_pages);
730 make_response(pending_req->blkif, pending_req->id, 981 make_response(pending_req->blkif, pending_req->id,
731 pending_req->operation, pending_req->status); 982 pending_req->operation, pending_req->status);
732 xen_blkif_put(pending_req->blkif); 983 xen_blkif_put(pending_req->blkif);
@@ -734,7 +985,7 @@ static void __end_block_io_op(struct pending_req *pending_req, int error)
734 if (atomic_read(&pending_req->blkif->drain)) 985 if (atomic_read(&pending_req->blkif->drain))
735 complete(&pending_req->blkif->drain_complete); 986 complete(&pending_req->blkif->drain_complete);
736 } 987 }
737 free_req(pending_req); 988 free_req(pending_req->blkif, pending_req);
738 } 989 }
739} 990}
740 991
@@ -767,6 +1018,12 @@ __do_block_io_op(struct xen_blkif *blkif)
767 rp = blk_rings->common.sring->req_prod; 1018 rp = blk_rings->common.sring->req_prod;
768 rmb(); /* Ensure we see queued requests up to 'rp'. */ 1019 rmb(); /* Ensure we see queued requests up to 'rp'. */
769 1020
1021 if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) {
1022 rc = blk_rings->common.rsp_prod_pvt;
1023 pr_warn(DRV_PFX "Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n",
1024 rp, rc, rp - rc, blkif->vbd.pdevice);
1025 return -EACCES;
1026 }
770 while (rc != rp) { 1027 while (rc != rp) {
771 1028
772 if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc)) 1029 if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
@@ -777,7 +1034,7 @@ __do_block_io_op(struct xen_blkif *blkif)
777 break; 1034 break;
778 } 1035 }
779 1036
780 pending_req = alloc_req(); 1037 pending_req = alloc_req(blkif);
781 if (NULL == pending_req) { 1038 if (NULL == pending_req) {
782 blkif->st_oo_req++; 1039 blkif->st_oo_req++;
783 more_to_do = 1; 1040 more_to_do = 1;
@@ -807,11 +1064,12 @@ __do_block_io_op(struct xen_blkif *blkif)
807 case BLKIF_OP_WRITE: 1064 case BLKIF_OP_WRITE:
808 case BLKIF_OP_WRITE_BARRIER: 1065 case BLKIF_OP_WRITE_BARRIER:
809 case BLKIF_OP_FLUSH_DISKCACHE: 1066 case BLKIF_OP_FLUSH_DISKCACHE:
1067 case BLKIF_OP_INDIRECT:
810 if (dispatch_rw_block_io(blkif, &req, pending_req)) 1068 if (dispatch_rw_block_io(blkif, &req, pending_req))
811 goto done; 1069 goto done;
812 break; 1070 break;
813 case BLKIF_OP_DISCARD: 1071 case BLKIF_OP_DISCARD:
814 free_req(pending_req); 1072 free_req(blkif, pending_req);
815 if (dispatch_discard_io(blkif, &req)) 1073 if (dispatch_discard_io(blkif, &req))
816 goto done; 1074 goto done;
817 break; 1075 break;
@@ -853,17 +1111,28 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
853 struct pending_req *pending_req) 1111 struct pending_req *pending_req)
854{ 1112{
855 struct phys_req preq; 1113 struct phys_req preq;
856 struct seg_buf seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 1114 struct seg_buf *seg = pending_req->seg;
857 unsigned int nseg; 1115 unsigned int nseg;
858 struct bio *bio = NULL; 1116 struct bio *bio = NULL;
859 struct bio *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 1117 struct bio **biolist = pending_req->biolist;
860 int i, nbio = 0; 1118 int i, nbio = 0;
861 int operation; 1119 int operation;
862 struct blk_plug plug; 1120 struct blk_plug plug;
863 bool drain = false; 1121 bool drain = false;
864 struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 1122 struct grant_page **pages = pending_req->segments;
1123 unsigned short req_operation;
1124
1125 req_operation = req->operation == BLKIF_OP_INDIRECT ?
1126 req->u.indirect.indirect_op : req->operation;
1127 if ((req->operation == BLKIF_OP_INDIRECT) &&
1128 (req_operation != BLKIF_OP_READ) &&
1129 (req_operation != BLKIF_OP_WRITE)) {
1130 pr_debug(DRV_PFX "Invalid indirect operation (%u)\n",
1131 req_operation);
1132 goto fail_response;
1133 }
865 1134
866 switch (req->operation) { 1135 switch (req_operation) {
867 case BLKIF_OP_READ: 1136 case BLKIF_OP_READ:
868 blkif->st_rd_req++; 1137 blkif->st_rd_req++;
869 operation = READ; 1138 operation = READ;
@@ -885,33 +1154,47 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
885 } 1154 }
886 1155
887 /* Check that the number of segments is sane. */ 1156 /* Check that the number of segments is sane. */
888 nseg = req->u.rw.nr_segments; 1157 nseg = req->operation == BLKIF_OP_INDIRECT ?
1158 req->u.indirect.nr_segments : req->u.rw.nr_segments;
889 1159
890 if (unlikely(nseg == 0 && operation != WRITE_FLUSH) || 1160 if (unlikely(nseg == 0 && operation != WRITE_FLUSH) ||
891 unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) { 1161 unlikely((req->operation != BLKIF_OP_INDIRECT) &&
1162 (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) ||
1163 unlikely((req->operation == BLKIF_OP_INDIRECT) &&
1164 (nseg > MAX_INDIRECT_SEGMENTS))) {
892 pr_debug(DRV_PFX "Bad number of segments in request (%d)\n", 1165 pr_debug(DRV_PFX "Bad number of segments in request (%d)\n",
893 nseg); 1166 nseg);
894 /* Haven't submitted any bio's yet. */ 1167 /* Haven't submitted any bio's yet. */
895 goto fail_response; 1168 goto fail_response;
896 } 1169 }
897 1170
898 preq.sector_number = req->u.rw.sector_number;
899 preq.nr_sects = 0; 1171 preq.nr_sects = 0;
900 1172
901 pending_req->blkif = blkif; 1173 pending_req->blkif = blkif;
902 pending_req->id = req->u.rw.id; 1174 pending_req->id = req->u.rw.id;
903 pending_req->operation = req->operation; 1175 pending_req->operation = req_operation;
904 pending_req->status = BLKIF_RSP_OKAY; 1176 pending_req->status = BLKIF_RSP_OKAY;
905 pending_req->nr_pages = nseg; 1177 pending_req->nr_pages = nseg;
906 1178
907 for (i = 0; i < nseg; i++) { 1179 if (req->operation != BLKIF_OP_INDIRECT) {
908 seg[i].nsec = req->u.rw.seg[i].last_sect - 1180 preq.dev = req->u.rw.handle;
909 req->u.rw.seg[i].first_sect + 1; 1181 preq.sector_number = req->u.rw.sector_number;
910 if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) || 1182 for (i = 0; i < nseg; i++) {
911 (req->u.rw.seg[i].last_sect < req->u.rw.seg[i].first_sect)) 1183 pages[i]->gref = req->u.rw.seg[i].gref;
1184 seg[i].nsec = req->u.rw.seg[i].last_sect -
1185 req->u.rw.seg[i].first_sect + 1;
1186 seg[i].offset = (req->u.rw.seg[i].first_sect << 9);
1187 if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
1188 (req->u.rw.seg[i].last_sect <
1189 req->u.rw.seg[i].first_sect))
1190 goto fail_response;
1191 preq.nr_sects += seg[i].nsec;
1192 }
1193 } else {
1194 preq.dev = req->u.indirect.handle;
1195 preq.sector_number = req->u.indirect.sector_number;
1196 if (xen_blkbk_parse_indirect(req, pending_req, seg, &preq))
912 goto fail_response; 1197 goto fail_response;
913 preq.nr_sects += seg[i].nsec;
914
915 } 1198 }
916 1199
917 if (xen_vbd_translate(&preq, blkif, operation) != 0) { 1200 if (xen_vbd_translate(&preq, blkif, operation) != 0) {
@@ -948,7 +1231,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
948 * the hypercall to unmap the grants - that is all done in 1231 * the hypercall to unmap the grants - that is all done in
949 * xen_blkbk_unmap. 1232 * xen_blkbk_unmap.
950 */ 1233 */
951 if (xen_blkbk_map(req, pending_req, seg, pages)) 1234 if (xen_blkbk_map_seg(pending_req))
952 goto fail_flush; 1235 goto fail_flush;
953 1236
954 /* 1237 /*
@@ -960,11 +1243,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
960 for (i = 0; i < nseg; i++) { 1243 for (i = 0; i < nseg; i++) {
961 while ((bio == NULL) || 1244 while ((bio == NULL) ||
962 (bio_add_page(bio, 1245 (bio_add_page(bio,
963 pages[i], 1246 pages[i]->page,
964 seg[i].nsec << 9, 1247 seg[i].nsec << 9,
965 seg[i].offset) == 0)) { 1248 seg[i].offset) == 0)) {
966 1249
967 bio = bio_alloc(GFP_KERNEL, nseg-i); 1250 int nr_iovecs = min_t(int, (nseg-i), BIO_MAX_PAGES);
1251 bio = bio_alloc(GFP_KERNEL, nr_iovecs);
968 if (unlikely(bio == NULL)) 1252 if (unlikely(bio == NULL))
969 goto fail_put_bio; 1253 goto fail_put_bio;
970 1254
@@ -1009,11 +1293,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
1009 return 0; 1293 return 0;
1010 1294
1011 fail_flush: 1295 fail_flush:
1012 xen_blkbk_unmap(pending_req); 1296 xen_blkbk_unmap(blkif, pending_req->segments,
1297 pending_req->nr_pages);
1013 fail_response: 1298 fail_response:
1014 /* Haven't submitted any bio's yet. */ 1299 /* Haven't submitted any bio's yet. */
1015 make_response(blkif, req->u.rw.id, req->operation, BLKIF_RSP_ERROR); 1300 make_response(blkif, req->u.rw.id, req_operation, BLKIF_RSP_ERROR);
1016 free_req(pending_req); 1301 free_req(blkif, pending_req);
1017 msleep(1); /* back off a bit */ 1302 msleep(1); /* back off a bit */
1018 return -EIO; 1303 return -EIO;
1019 1304
@@ -1070,73 +1355,20 @@ static void make_response(struct xen_blkif *blkif, u64 id,
1070 1355
1071static int __init xen_blkif_init(void) 1356static int __init xen_blkif_init(void)
1072{ 1357{
1073 int i, mmap_pages;
1074 int rc = 0; 1358 int rc = 0;
1075 1359
1076 if (!xen_domain()) 1360 if (!xen_domain())
1077 return -ENODEV; 1361 return -ENODEV;
1078 1362
1079 blkbk = kzalloc(sizeof(struct xen_blkbk), GFP_KERNEL);
1080 if (!blkbk) {
1081 pr_alert(DRV_PFX "%s: out of memory!\n", __func__);
1082 return -ENOMEM;
1083 }
1084
1085 mmap_pages = xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
1086
1087 blkbk->pending_reqs = kzalloc(sizeof(blkbk->pending_reqs[0]) *
1088 xen_blkif_reqs, GFP_KERNEL);
1089 blkbk->pending_grant_handles = kmalloc(sizeof(blkbk->pending_grant_handles[0]) *
1090 mmap_pages, GFP_KERNEL);
1091 blkbk->pending_pages = kzalloc(sizeof(blkbk->pending_pages[0]) *
1092 mmap_pages, GFP_KERNEL);
1093
1094 if (!blkbk->pending_reqs || !blkbk->pending_grant_handles ||
1095 !blkbk->pending_pages) {
1096 rc = -ENOMEM;
1097 goto out_of_memory;
1098 }
1099
1100 for (i = 0; i < mmap_pages; i++) {
1101 blkbk->pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
1102 blkbk->pending_pages[i] = alloc_page(GFP_KERNEL);
1103 if (blkbk->pending_pages[i] == NULL) {
1104 rc = -ENOMEM;
1105 goto out_of_memory;
1106 }
1107 }
1108 rc = xen_blkif_interface_init(); 1363 rc = xen_blkif_interface_init();
1109 if (rc) 1364 if (rc)
1110 goto failed_init; 1365 goto failed_init;
1111 1366
1112 INIT_LIST_HEAD(&blkbk->pending_free);
1113 spin_lock_init(&blkbk->pending_free_lock);
1114 init_waitqueue_head(&blkbk->pending_free_wq);
1115
1116 for (i = 0; i < xen_blkif_reqs; i++)
1117 list_add_tail(&blkbk->pending_reqs[i].free_list,
1118 &blkbk->pending_free);
1119
1120 rc = xen_blkif_xenbus_init(); 1367 rc = xen_blkif_xenbus_init();
1121 if (rc) 1368 if (rc)
1122 goto failed_init; 1369 goto failed_init;
1123 1370
1124 return 0;
1125
1126 out_of_memory:
1127 pr_alert(DRV_PFX "%s: out of memory\n", __func__);
1128 failed_init: 1371 failed_init:
1129 kfree(blkbk->pending_reqs);
1130 kfree(blkbk->pending_grant_handles);
1131 if (blkbk->pending_pages) {
1132 for (i = 0; i < mmap_pages; i++) {
1133 if (blkbk->pending_pages[i])
1134 __free_page(blkbk->pending_pages[i]);
1135 }
1136 kfree(blkbk->pending_pages);
1137 }
1138 kfree(blkbk);
1139 blkbk = NULL;
1140 return rc; 1372 return rc;
1141} 1373}
1142 1374
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index 60103e2517ba..8d8807563d99 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -50,6 +50,19 @@
50 __func__, __LINE__, ##args) 50 __func__, __LINE__, ##args)
51 51
52 52
53/*
54 * This is the maximum number of segments that would be allowed in indirect
55 * requests. This value will also be passed to the frontend.
56 */
57#define MAX_INDIRECT_SEGMENTS 256
58
59#define SEGS_PER_INDIRECT_FRAME \
60 (PAGE_SIZE/sizeof(struct blkif_request_segment_aligned))
61#define MAX_INDIRECT_PAGES \
62 ((MAX_INDIRECT_SEGMENTS + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
63#define INDIRECT_PAGES(_segs) \
64 ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
65
53/* Not a real protocol. Used to generate ring structs which contain 66/* Not a real protocol. Used to generate ring structs which contain
54 * the elements common to all protocols only. This way we get a 67 * the elements common to all protocols only. This way we get a
55 * compiler-checkable way to use common struct elements, so we can 68 * compiler-checkable way to use common struct elements, so we can
@@ -83,12 +96,31 @@ struct blkif_x86_32_request_other {
83 uint64_t id; /* private guest value, echoed in resp */ 96 uint64_t id; /* private guest value, echoed in resp */
84} __attribute__((__packed__)); 97} __attribute__((__packed__));
85 98
99struct blkif_x86_32_request_indirect {
100 uint8_t indirect_op;
101 uint16_t nr_segments;
102 uint64_t id;
103 blkif_sector_t sector_number;
104 blkif_vdev_t handle;
105 uint16_t _pad1;
106 grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
107 /*
108 * The maximum number of indirect segments (and pages) that will
109 * be used is determined by MAX_INDIRECT_SEGMENTS, this value
110 * is also exported to the guest (via xenstore
111 * feature-max-indirect-segments entry), so the frontend knows how
112 * many indirect segments the backend supports.
113 */
114 uint64_t _pad2; /* make it 64 byte aligned */
115} __attribute__((__packed__));
116
86struct blkif_x86_32_request { 117struct blkif_x86_32_request {
87 uint8_t operation; /* BLKIF_OP_??? */ 118 uint8_t operation; /* BLKIF_OP_??? */
88 union { 119 union {
89 struct blkif_x86_32_request_rw rw; 120 struct blkif_x86_32_request_rw rw;
90 struct blkif_x86_32_request_discard discard; 121 struct blkif_x86_32_request_discard discard;
91 struct blkif_x86_32_request_other other; 122 struct blkif_x86_32_request_other other;
123 struct blkif_x86_32_request_indirect indirect;
92 } u; 124 } u;
93} __attribute__((__packed__)); 125} __attribute__((__packed__));
94 126
@@ -127,12 +159,32 @@ struct blkif_x86_64_request_other {
127 uint64_t id; /* private guest value, echoed in resp */ 159 uint64_t id; /* private guest value, echoed in resp */
128} __attribute__((__packed__)); 160} __attribute__((__packed__));
129 161
162struct blkif_x86_64_request_indirect {
163 uint8_t indirect_op;
164 uint16_t nr_segments;
165 uint32_t _pad1; /* offsetof(blkif_..,u.indirect.id)==8 */
166 uint64_t id;
167 blkif_sector_t sector_number;
168 blkif_vdev_t handle;
169 uint16_t _pad2;
170 grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
171 /*
172 * The maximum number of indirect segments (and pages) that will
173 * be used is determined by MAX_INDIRECT_SEGMENTS, this value
174 * is also exported to the guest (via xenstore
175 * feature-max-indirect-segments entry), so the frontend knows how
176 * many indirect segments the backend supports.
177 */
178 uint32_t _pad3; /* make it 64 byte aligned */
179} __attribute__((__packed__));
180
130struct blkif_x86_64_request { 181struct blkif_x86_64_request {
131 uint8_t operation; /* BLKIF_OP_??? */ 182 uint8_t operation; /* BLKIF_OP_??? */
132 union { 183 union {
133 struct blkif_x86_64_request_rw rw; 184 struct blkif_x86_64_request_rw rw;
134 struct blkif_x86_64_request_discard discard; 185 struct blkif_x86_64_request_discard discard;
135 struct blkif_x86_64_request_other other; 186 struct blkif_x86_64_request_other other;
187 struct blkif_x86_64_request_indirect indirect;
136 } u; 188 } u;
137} __attribute__((__packed__)); 189} __attribute__((__packed__));
138 190
@@ -182,12 +234,26 @@ struct xen_vbd {
182 234
183struct backend_info; 235struct backend_info;
184 236
237/* Number of available flags */
238#define PERSISTENT_GNT_FLAGS_SIZE 2
239/* This persistent grant is currently in use */
240#define PERSISTENT_GNT_ACTIVE 0
241/*
242 * This persistent grant has been used, this flag is set when we remove the
243 * PERSISTENT_GNT_ACTIVE, to know that this grant has been used recently.
244 */
245#define PERSISTENT_GNT_WAS_ACTIVE 1
246
247/* Number of requests that we can fit in a ring */
248#define XEN_BLKIF_REQS 32
185 249
186struct persistent_gnt { 250struct persistent_gnt {
187 struct page *page; 251 struct page *page;
188 grant_ref_t gnt; 252 grant_ref_t gnt;
189 grant_handle_t handle; 253 grant_handle_t handle;
254 DECLARE_BITMAP(flags, PERSISTENT_GNT_FLAGS_SIZE);
190 struct rb_node node; 255 struct rb_node node;
256 struct list_head remove_node;
191}; 257};
192 258
193struct xen_blkif { 259struct xen_blkif {
@@ -219,6 +285,23 @@ struct xen_blkif {
219 /* tree to store persistent grants */ 285 /* tree to store persistent grants */
220 struct rb_root persistent_gnts; 286 struct rb_root persistent_gnts;
221 unsigned int persistent_gnt_c; 287 unsigned int persistent_gnt_c;
288 atomic_t persistent_gnt_in_use;
289 unsigned long next_lru;
290
291 /* used by the kworker that offload work from the persistent purge */
292 struct list_head persistent_purge_list;
293 struct work_struct persistent_purge_work;
294
295 /* buffer of free pages to map grant refs */
296 spinlock_t free_pages_lock;
297 int free_pages_num;
298 struct list_head free_pages;
299
300 /* List of all 'pending_req' available */
301 struct list_head pending_free;
302 /* And its spinlock. */
303 spinlock_t pending_free_lock;
304 wait_queue_head_t pending_free_wq;
222 305
223 /* statistics */ 306 /* statistics */
224 unsigned long st_print; 307 unsigned long st_print;
@@ -231,6 +314,41 @@ struct xen_blkif {
231 unsigned long long st_wr_sect; 314 unsigned long long st_wr_sect;
232 315
233 wait_queue_head_t waiting_to_free; 316 wait_queue_head_t waiting_to_free;
317 /* Thread shutdown wait queue. */
318 wait_queue_head_t shutdown_wq;
319};
320
321struct seg_buf {
322 unsigned long offset;
323 unsigned int nsec;
324};
325
326struct grant_page {
327 struct page *page;
328 struct persistent_gnt *persistent_gnt;
329 grant_handle_t handle;
330 grant_ref_t gref;
331};
332
333/*
334 * Each outstanding request that we've passed to the lower device layers has a
335 * 'pending_req' allocated to it. Each buffer_head that completes decrements
336 * the pendcnt towards zero. When it hits zero, the specified domain has a
337 * response queued for it, with the saved 'id' passed back.
338 */
339struct pending_req {
340 struct xen_blkif *blkif;
341 u64 id;
342 int nr_pages;
343 atomic_t pendcnt;
344 unsigned short operation;
345 int status;
346 struct list_head free_list;
347 struct grant_page *segments[MAX_INDIRECT_SEGMENTS];
348 /* Indirect descriptors */
349 struct grant_page *indirect_pages[MAX_INDIRECT_PAGES];
350 struct seg_buf seg[MAX_INDIRECT_SEGMENTS];
351 struct bio *biolist[MAX_INDIRECT_SEGMENTS];
234}; 352};
235 353
236 354
@@ -257,6 +375,7 @@ int xen_blkif_xenbus_init(void);
257 375
258irqreturn_t xen_blkif_be_int(int irq, void *dev_id); 376irqreturn_t xen_blkif_be_int(int irq, void *dev_id);
259int xen_blkif_schedule(void *arg); 377int xen_blkif_schedule(void *arg);
378int xen_blkif_purge_persistent(void *arg);
260 379
261int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, 380int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
262 struct backend_info *be, int state); 381 struct backend_info *be, int state);
@@ -268,7 +387,7 @@ struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be);
268static inline void blkif_get_x86_32_req(struct blkif_request *dst, 387static inline void blkif_get_x86_32_req(struct blkif_request *dst,
269 struct blkif_x86_32_request *src) 388 struct blkif_x86_32_request *src)
270{ 389{
271 int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; 390 int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j;
272 dst->operation = src->operation; 391 dst->operation = src->operation;
273 switch (src->operation) { 392 switch (src->operation) {
274 case BLKIF_OP_READ: 393 case BLKIF_OP_READ:
@@ -291,6 +410,18 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst,
291 dst->u.discard.sector_number = src->u.discard.sector_number; 410 dst->u.discard.sector_number = src->u.discard.sector_number;
292 dst->u.discard.nr_sectors = src->u.discard.nr_sectors; 411 dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
293 break; 412 break;
413 case BLKIF_OP_INDIRECT:
414 dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
415 dst->u.indirect.nr_segments = src->u.indirect.nr_segments;
416 dst->u.indirect.handle = src->u.indirect.handle;
417 dst->u.indirect.id = src->u.indirect.id;
418 dst->u.indirect.sector_number = src->u.indirect.sector_number;
419 barrier();
420 j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments));
421 for (i = 0; i < j; i++)
422 dst->u.indirect.indirect_grefs[i] =
423 src->u.indirect.indirect_grefs[i];
424 break;
294 default: 425 default:
295 /* 426 /*
296 * Don't know how to translate this op. Only get the 427 * Don't know how to translate this op. Only get the
@@ -304,7 +435,7 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst,
304static inline void blkif_get_x86_64_req(struct blkif_request *dst, 435static inline void blkif_get_x86_64_req(struct blkif_request *dst,
305 struct blkif_x86_64_request *src) 436 struct blkif_x86_64_request *src)
306{ 437{
307 int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; 438 int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j;
308 dst->operation = src->operation; 439 dst->operation = src->operation;
309 switch (src->operation) { 440 switch (src->operation) {
310 case BLKIF_OP_READ: 441 case BLKIF_OP_READ:
@@ -327,6 +458,18 @@ static inline void blkif_get_x86_64_req(struct blkif_request *dst,
327 dst->u.discard.sector_number = src->u.discard.sector_number; 458 dst->u.discard.sector_number = src->u.discard.sector_number;
328 dst->u.discard.nr_sectors = src->u.discard.nr_sectors; 459 dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
329 break; 460 break;
461 case BLKIF_OP_INDIRECT:
462 dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
463 dst->u.indirect.nr_segments = src->u.indirect.nr_segments;
464 dst->u.indirect.handle = src->u.indirect.handle;
465 dst->u.indirect.id = src->u.indirect.id;
466 dst->u.indirect.sector_number = src->u.indirect.sector_number;
467 barrier();
468 j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments));
469 for (i = 0; i < j; i++)
470 dst->u.indirect.indirect_grefs[i] =
471 src->u.indirect.indirect_grefs[i];
472 break;
330 default: 473 default:
331 /* 474 /*
332 * Don't know how to translate this op. Only get the 475 * Don't know how to translate this op. Only get the
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 8bfd1bcf95ec..2e5b69d612ac 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -98,12 +98,17 @@ static void xen_update_blkif_status(struct xen_blkif *blkif)
98 err = PTR_ERR(blkif->xenblkd); 98 err = PTR_ERR(blkif->xenblkd);
99 blkif->xenblkd = NULL; 99 blkif->xenblkd = NULL;
100 xenbus_dev_error(blkif->be->dev, err, "start xenblkd"); 100 xenbus_dev_error(blkif->be->dev, err, "start xenblkd");
101 return;
101 } 102 }
102} 103}
103 104
104static struct xen_blkif *xen_blkif_alloc(domid_t domid) 105static struct xen_blkif *xen_blkif_alloc(domid_t domid)
105{ 106{
106 struct xen_blkif *blkif; 107 struct xen_blkif *blkif;
108 struct pending_req *req, *n;
109 int i, j;
110
111 BUILD_BUG_ON(MAX_INDIRECT_PAGES > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);
107 112
108 blkif = kmem_cache_zalloc(xen_blkif_cachep, GFP_KERNEL); 113 blkif = kmem_cache_zalloc(xen_blkif_cachep, GFP_KERNEL);
109 if (!blkif) 114 if (!blkif)
@@ -118,8 +123,57 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
118 blkif->st_print = jiffies; 123 blkif->st_print = jiffies;
119 init_waitqueue_head(&blkif->waiting_to_free); 124 init_waitqueue_head(&blkif->waiting_to_free);
120 blkif->persistent_gnts.rb_node = NULL; 125 blkif->persistent_gnts.rb_node = NULL;
126 spin_lock_init(&blkif->free_pages_lock);
127 INIT_LIST_HEAD(&blkif->free_pages);
128 blkif->free_pages_num = 0;
129 atomic_set(&blkif->persistent_gnt_in_use, 0);
130
131 INIT_LIST_HEAD(&blkif->pending_free);
132
133 for (i = 0; i < XEN_BLKIF_REQS; i++) {
134 req = kzalloc(sizeof(*req), GFP_KERNEL);
135 if (!req)
136 goto fail;
137 list_add_tail(&req->free_list,
138 &blkif->pending_free);
139 for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
140 req->segments[j] = kzalloc(sizeof(*req->segments[0]),
141 GFP_KERNEL);
142 if (!req->segments[j])
143 goto fail;
144 }
145 for (j = 0; j < MAX_INDIRECT_PAGES; j++) {
146 req->indirect_pages[j] = kzalloc(sizeof(*req->indirect_pages[0]),
147 GFP_KERNEL);
148 if (!req->indirect_pages[j])
149 goto fail;
150 }
151 }
152 spin_lock_init(&blkif->pending_free_lock);
153 init_waitqueue_head(&blkif->pending_free_wq);
154 init_waitqueue_head(&blkif->shutdown_wq);
121 155
122 return blkif; 156 return blkif;
157
158fail:
159 list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) {
160 list_del(&req->free_list);
161 for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
162 if (!req->segments[j])
163 break;
164 kfree(req->segments[j]);
165 }
166 for (j = 0; j < MAX_INDIRECT_PAGES; j++) {
167 if (!req->indirect_pages[j])
168 break;
169 kfree(req->indirect_pages[j]);
170 }
171 kfree(req);
172 }
173
174 kmem_cache_free(xen_blkif_cachep, blkif);
175
176 return ERR_PTR(-ENOMEM);
123} 177}
124 178
125static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page, 179static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page,
@@ -178,6 +232,7 @@ static void xen_blkif_disconnect(struct xen_blkif *blkif)
178{ 232{
179 if (blkif->xenblkd) { 233 if (blkif->xenblkd) {
180 kthread_stop(blkif->xenblkd); 234 kthread_stop(blkif->xenblkd);
235 wake_up(&blkif->shutdown_wq);
181 blkif->xenblkd = NULL; 236 blkif->xenblkd = NULL;
182 } 237 }
183 238
@@ -198,8 +253,28 @@ static void xen_blkif_disconnect(struct xen_blkif *blkif)
198 253
199static void xen_blkif_free(struct xen_blkif *blkif) 254static void xen_blkif_free(struct xen_blkif *blkif)
200{ 255{
256 struct pending_req *req, *n;
257 int i = 0, j;
258
201 if (!atomic_dec_and_test(&blkif->refcnt)) 259 if (!atomic_dec_and_test(&blkif->refcnt))
202 BUG(); 260 BUG();
261
262 /* Check that there is no request in use */
263 list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) {
264 list_del(&req->free_list);
265
266 for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++)
267 kfree(req->segments[j]);
268
269 for (j = 0; j < MAX_INDIRECT_PAGES; j++)
270 kfree(req->indirect_pages[j]);
271
272 kfree(req);
273 i++;
274 }
275
276 WARN_ON(i != XEN_BLKIF_REQS);
277
203 kmem_cache_free(xen_blkif_cachep, blkif); 278 kmem_cache_free(xen_blkif_cachep, blkif);
204} 279}
205 280
@@ -678,6 +753,11 @@ again:
678 dev->nodename); 753 dev->nodename);
679 goto abort; 754 goto abort;
680 } 755 }
756 err = xenbus_printf(xbt, dev->nodename, "feature-max-indirect-segments", "%u",
757 MAX_INDIRECT_SEGMENTS);
758 if (err)
759 dev_warn(&dev->dev, "writing %s/feature-max-indirect-segments (%d)",
760 dev->nodename, err);
681 761
682 err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", 762 err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
683 (unsigned long long)vbd_sz(&be->blkif->vbd)); 763 (unsigned long long)vbd_sz(&be->blkif->vbd));
@@ -704,6 +784,11 @@ again:
704 dev->nodename); 784 dev->nodename);
705 goto abort; 785 goto abort;
706 } 786 }
787 err = xenbus_printf(xbt, dev->nodename, "physical-sector-size", "%u",
788 bdev_physical_block_size(be->blkif->vbd.bdev));
789 if (err)
790 xenbus_dev_error(dev, err, "writing %s/physical-sector-size",
791 dev->nodename);
707 792
708 err = xenbus_transaction_end(xbt, 0); 793 err = xenbus_transaction_end(xbt, 0);
709 if (err == -EAGAIN) 794 if (err == -EAGAIN)
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index d89ef86220f4..a4660bbee8a6 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -74,12 +74,30 @@ struct grant {
74struct blk_shadow { 74struct blk_shadow {
75 struct blkif_request req; 75 struct blkif_request req;
76 struct request *request; 76 struct request *request;
77 struct grant *grants_used[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 77 struct grant **grants_used;
78 struct grant **indirect_grants;
79 struct scatterlist *sg;
80};
81
82struct split_bio {
83 struct bio *bio;
84 atomic_t pending;
85 int err;
78}; 86};
79 87
80static DEFINE_MUTEX(blkfront_mutex); 88static DEFINE_MUTEX(blkfront_mutex);
81static const struct block_device_operations xlvbd_block_fops; 89static const struct block_device_operations xlvbd_block_fops;
82 90
91/*
92 * Maximum number of segments in indirect requests, the actual value used by
93 * the frontend driver is the minimum of this value and the value provided
94 * by the backend driver.
95 */
96
97static unsigned int xen_blkif_max_segments = 32;
98module_param_named(max, xen_blkif_max_segments, int, S_IRUGO);
99MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)");
100
83#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE) 101#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
84 102
85/* 103/*
@@ -98,7 +116,6 @@ struct blkfront_info
98 enum blkif_state connected; 116 enum blkif_state connected;
99 int ring_ref; 117 int ring_ref;
100 struct blkif_front_ring ring; 118 struct blkif_front_ring ring;
101 struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
102 unsigned int evtchn, irq; 119 unsigned int evtchn, irq;
103 struct request_queue *rq; 120 struct request_queue *rq;
104 struct work_struct work; 121 struct work_struct work;
@@ -114,6 +131,7 @@ struct blkfront_info
114 unsigned int discard_granularity; 131 unsigned int discard_granularity;
115 unsigned int discard_alignment; 132 unsigned int discard_alignment;
116 unsigned int feature_persistent:1; 133 unsigned int feature_persistent:1;
134 unsigned int max_indirect_segments;
117 int is_ready; 135 int is_ready;
118}; 136};
119 137
@@ -142,6 +160,13 @@ static DEFINE_SPINLOCK(minor_lock);
142 160
143#define DEV_NAME "xvd" /* name in /dev */ 161#define DEV_NAME "xvd" /* name in /dev */
144 162
163#define SEGS_PER_INDIRECT_FRAME \
164 (PAGE_SIZE/sizeof(struct blkif_request_segment_aligned))
165#define INDIRECT_GREFS(_segs) \
166 ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
167
168static int blkfront_setup_indirect(struct blkfront_info *info);
169
145static int get_id_from_freelist(struct blkfront_info *info) 170static int get_id_from_freelist(struct blkfront_info *info)
146{ 171{
147 unsigned long free = info->shadow_free; 172 unsigned long free = info->shadow_free;
@@ -358,7 +383,8 @@ static int blkif_queue_request(struct request *req)
358 struct blkif_request *ring_req; 383 struct blkif_request *ring_req;
359 unsigned long id; 384 unsigned long id;
360 unsigned int fsect, lsect; 385 unsigned int fsect, lsect;
361 int i, ref; 386 int i, ref, n;
387 struct blkif_request_segment_aligned *segments = NULL;
362 388
363 /* 389 /*
364 * Used to store if we are able to queue the request by just using 390 * Used to store if we are able to queue the request by just using
@@ -369,21 +395,27 @@ static int blkif_queue_request(struct request *req)
369 grant_ref_t gref_head; 395 grant_ref_t gref_head;
370 struct grant *gnt_list_entry = NULL; 396 struct grant *gnt_list_entry = NULL;
371 struct scatterlist *sg; 397 struct scatterlist *sg;
398 int nseg, max_grefs;
372 399
373 if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) 400 if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
374 return 1; 401 return 1;
375 402
376 /* Check if we have enought grants to allocate a requests */ 403 max_grefs = info->max_indirect_segments ?
377 if (info->persistent_gnts_c < BLKIF_MAX_SEGMENTS_PER_REQUEST) { 404 info->max_indirect_segments +
405 INDIRECT_GREFS(info->max_indirect_segments) :
406 BLKIF_MAX_SEGMENTS_PER_REQUEST;
407
408 /* Check if we have enough grants to allocate a requests */
409 if (info->persistent_gnts_c < max_grefs) {
378 new_persistent_gnts = 1; 410 new_persistent_gnts = 1;
379 if (gnttab_alloc_grant_references( 411 if (gnttab_alloc_grant_references(
380 BLKIF_MAX_SEGMENTS_PER_REQUEST - info->persistent_gnts_c, 412 max_grefs - info->persistent_gnts_c,
381 &gref_head) < 0) { 413 &gref_head) < 0) {
382 gnttab_request_free_callback( 414 gnttab_request_free_callback(
383 &info->callback, 415 &info->callback,
384 blkif_restart_queue_callback, 416 blkif_restart_queue_callback,
385 info, 417 info,
386 BLKIF_MAX_SEGMENTS_PER_REQUEST); 418 max_grefs);
387 return 1; 419 return 1;
388 } 420 }
389 } else 421 } else
@@ -394,42 +426,67 @@ static int blkif_queue_request(struct request *req)
394 id = get_id_from_freelist(info); 426 id = get_id_from_freelist(info);
395 info->shadow[id].request = req; 427 info->shadow[id].request = req;
396 428
397 ring_req->u.rw.id = id;
398 ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
399 ring_req->u.rw.handle = info->handle;
400
401 ring_req->operation = rq_data_dir(req) ?
402 BLKIF_OP_WRITE : BLKIF_OP_READ;
403
404 if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
405 /*
406 * Ideally we can do an unordered flush-to-disk. In case the
407 * backend onlysupports barriers, use that. A barrier request
408 * a superset of FUA, so we can implement it the same
409 * way. (It's also a FLUSH+FUA, since it is
410 * guaranteed ordered WRT previous writes.)
411 */
412 ring_req->operation = info->flush_op;
413 }
414
415 if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) { 429 if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) {
416 /* id, sector_number and handle are set above. */
417 ring_req->operation = BLKIF_OP_DISCARD; 430 ring_req->operation = BLKIF_OP_DISCARD;
418 ring_req->u.discard.nr_sectors = blk_rq_sectors(req); 431 ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
432 ring_req->u.discard.id = id;
433 ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req);
419 if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard) 434 if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard)
420 ring_req->u.discard.flag = BLKIF_DISCARD_SECURE; 435 ring_req->u.discard.flag = BLKIF_DISCARD_SECURE;
421 else 436 else
422 ring_req->u.discard.flag = 0; 437 ring_req->u.discard.flag = 0;
423 } else { 438 } else {
424 ring_req->u.rw.nr_segments = blk_rq_map_sg(req->q, req, 439 BUG_ON(info->max_indirect_segments == 0 &&
425 info->sg); 440 req->nr_phys_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
426 BUG_ON(ring_req->u.rw.nr_segments > 441 BUG_ON(info->max_indirect_segments &&
427 BLKIF_MAX_SEGMENTS_PER_REQUEST); 442 req->nr_phys_segments > info->max_indirect_segments);
428 443 nseg = blk_rq_map_sg(req->q, req, info->shadow[id].sg);
429 for_each_sg(info->sg, sg, ring_req->u.rw.nr_segments, i) { 444 ring_req->u.rw.id = id;
445 if (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
446 /*
447 * The indirect operation can only be a BLKIF_OP_READ or
448 * BLKIF_OP_WRITE
449 */
450 BUG_ON(req->cmd_flags & (REQ_FLUSH | REQ_FUA));
451 ring_req->operation = BLKIF_OP_INDIRECT;
452 ring_req->u.indirect.indirect_op = rq_data_dir(req) ?
453 BLKIF_OP_WRITE : BLKIF_OP_READ;
454 ring_req->u.indirect.sector_number = (blkif_sector_t)blk_rq_pos(req);
455 ring_req->u.indirect.handle = info->handle;
456 ring_req->u.indirect.nr_segments = nseg;
457 } else {
458 ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
459 ring_req->u.rw.handle = info->handle;
460 ring_req->operation = rq_data_dir(req) ?
461 BLKIF_OP_WRITE : BLKIF_OP_READ;
462 if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
463 /*
464 * Ideally we can do an unordered flush-to-disk. In case the
465 * backend onlysupports barriers, use that. A barrier request
466 * a superset of FUA, so we can implement it the same
467 * way. (It's also a FLUSH+FUA, since it is
468 * guaranteed ordered WRT previous writes.)
469 */
470 ring_req->operation = info->flush_op;
471 }
472 ring_req->u.rw.nr_segments = nseg;
473 }
474 for_each_sg(info->shadow[id].sg, sg, nseg, i) {
430 fsect = sg->offset >> 9; 475 fsect = sg->offset >> 9;
431 lsect = fsect + (sg->length >> 9) - 1; 476 lsect = fsect + (sg->length >> 9) - 1;
432 477
478 if ((ring_req->operation == BLKIF_OP_INDIRECT) &&
479 (i % SEGS_PER_INDIRECT_FRAME == 0)) {
480 if (segments)
481 kunmap_atomic(segments);
482
483 n = i / SEGS_PER_INDIRECT_FRAME;
484 gnt_list_entry = get_grant(&gref_head, info);
485 info->shadow[id].indirect_grants[n] = gnt_list_entry;
486 segments = kmap_atomic(pfn_to_page(gnt_list_entry->pfn));
487 ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref;
488 }
489
433 gnt_list_entry = get_grant(&gref_head, info); 490 gnt_list_entry = get_grant(&gref_head, info);
434 ref = gnt_list_entry->gref; 491 ref = gnt_list_entry->gref;
435 492
@@ -441,8 +498,7 @@ static int blkif_queue_request(struct request *req)
441 498
442 BUG_ON(sg->offset + sg->length > PAGE_SIZE); 499 BUG_ON(sg->offset + sg->length > PAGE_SIZE);
443 500
444 shared_data = kmap_atomic( 501 shared_data = kmap_atomic(pfn_to_page(gnt_list_entry->pfn));
445 pfn_to_page(gnt_list_entry->pfn));
446 bvec_data = kmap_atomic(sg_page(sg)); 502 bvec_data = kmap_atomic(sg_page(sg));
447 503
448 /* 504 /*
@@ -461,13 +517,23 @@ static int blkif_queue_request(struct request *req)
461 kunmap_atomic(bvec_data); 517 kunmap_atomic(bvec_data);
462 kunmap_atomic(shared_data); 518 kunmap_atomic(shared_data);
463 } 519 }
464 520 if (ring_req->operation != BLKIF_OP_INDIRECT) {
465 ring_req->u.rw.seg[i] = 521 ring_req->u.rw.seg[i] =
466 (struct blkif_request_segment) { 522 (struct blkif_request_segment) {
467 .gref = ref, 523 .gref = ref,
468 .first_sect = fsect, 524 .first_sect = fsect,
469 .last_sect = lsect }; 525 .last_sect = lsect };
526 } else {
527 n = i % SEGS_PER_INDIRECT_FRAME;
528 segments[n] =
529 (struct blkif_request_segment_aligned) {
530 .gref = ref,
531 .first_sect = fsect,
532 .last_sect = lsect };
533 }
470 } 534 }
535 if (segments)
536 kunmap_atomic(segments);
471 } 537 }
472 538
473 info->ring.req_prod_pvt++; 539 info->ring.req_prod_pvt++;
@@ -542,7 +608,9 @@ wait:
542 flush_requests(info); 608 flush_requests(info);
543} 609}
544 610
545static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) 611static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
612 unsigned int physical_sector_size,
613 unsigned int segments)
546{ 614{
547 struct request_queue *rq; 615 struct request_queue *rq;
548 struct blkfront_info *info = gd->private_data; 616 struct blkfront_info *info = gd->private_data;
@@ -564,14 +632,15 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
564 632
565 /* Hard sector size and max sectors impersonate the equiv. hardware. */ 633 /* Hard sector size and max sectors impersonate the equiv. hardware. */
566 blk_queue_logical_block_size(rq, sector_size); 634 blk_queue_logical_block_size(rq, sector_size);
567 blk_queue_max_hw_sectors(rq, 512); 635 blk_queue_physical_block_size(rq, physical_sector_size);
636 blk_queue_max_hw_sectors(rq, (segments * PAGE_SIZE) / 512);
568 637
569 /* Each segment in a request is up to an aligned page in size. */ 638 /* Each segment in a request is up to an aligned page in size. */
570 blk_queue_segment_boundary(rq, PAGE_SIZE - 1); 639 blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
571 blk_queue_max_segment_size(rq, PAGE_SIZE); 640 blk_queue_max_segment_size(rq, PAGE_SIZE);
572 641
573 /* Ensure a merged request will fit in a single I/O ring slot. */ 642 /* Ensure a merged request will fit in a single I/O ring slot. */
574 blk_queue_max_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); 643 blk_queue_max_segments(rq, segments);
575 644
576 /* Make sure buffer addresses are sector-aligned. */ 645 /* Make sure buffer addresses are sector-aligned. */
577 blk_queue_dma_alignment(rq, 511); 646 blk_queue_dma_alignment(rq, 511);
@@ -588,13 +657,16 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
588static void xlvbd_flush(struct blkfront_info *info) 657static void xlvbd_flush(struct blkfront_info *info)
589{ 658{
590 blk_queue_flush(info->rq, info->feature_flush); 659 blk_queue_flush(info->rq, info->feature_flush);
591 printk(KERN_INFO "blkfront: %s: %s: %s %s\n", 660 printk(KERN_INFO "blkfront: %s: %s: %s %s %s %s %s\n",
592 info->gd->disk_name, 661 info->gd->disk_name,
593 info->flush_op == BLKIF_OP_WRITE_BARRIER ? 662 info->flush_op == BLKIF_OP_WRITE_BARRIER ?
594 "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ? 663 "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ?
595 "flush diskcache" : "barrier or flush"), 664 "flush diskcache" : "barrier or flush"),
596 info->feature_flush ? "enabled" : "disabled", 665 info->feature_flush ? "enabled;" : "disabled;",
597 info->feature_persistent ? "using persistent grants" : ""); 666 "persistent grants:",
667 info->feature_persistent ? "enabled;" : "disabled;",
668 "indirect descriptors:",
669 info->max_indirect_segments ? "enabled;" : "disabled;");
598} 670}
599 671
600static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset) 672static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
@@ -667,7 +739,8 @@ static char *encode_disk_name(char *ptr, unsigned int n)
667 739
668static int xlvbd_alloc_gendisk(blkif_sector_t capacity, 740static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
669 struct blkfront_info *info, 741 struct blkfront_info *info,
670 u16 vdisk_info, u16 sector_size) 742 u16 vdisk_info, u16 sector_size,
743 unsigned int physical_sector_size)
671{ 744{
672 struct gendisk *gd; 745 struct gendisk *gd;
673 int nr_minors = 1; 746 int nr_minors = 1;
@@ -734,7 +807,9 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
734 gd->driverfs_dev = &(info->xbdev->dev); 807 gd->driverfs_dev = &(info->xbdev->dev);
735 set_capacity(gd, capacity); 808 set_capacity(gd, capacity);
736 809
737 if (xlvbd_init_blk_queue(gd, sector_size)) { 810 if (xlvbd_init_blk_queue(gd, sector_size, physical_sector_size,
811 info->max_indirect_segments ? :
812 BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
738 del_gendisk(gd); 813 del_gendisk(gd);
739 goto release; 814 goto release;
740 } 815 }
@@ -818,6 +893,7 @@ static void blkif_free(struct blkfront_info *info, int suspend)
818{ 893{
819 struct grant *persistent_gnt; 894 struct grant *persistent_gnt;
820 struct grant *n; 895 struct grant *n;
896 int i, j, segs;
821 897
822 /* Prevent new requests being issued until we fix things up. */ 898 /* Prevent new requests being issued until we fix things up. */
823 spin_lock_irq(&info->io_lock); 899 spin_lock_irq(&info->io_lock);
@@ -843,6 +919,47 @@ static void blkif_free(struct blkfront_info *info, int suspend)
843 } 919 }
844 BUG_ON(info->persistent_gnts_c != 0); 920 BUG_ON(info->persistent_gnts_c != 0);
845 921
922 for (i = 0; i < BLK_RING_SIZE; i++) {
923 /*
924 * Clear persistent grants present in requests already
925 * on the shared ring
926 */
927 if (!info->shadow[i].request)
928 goto free_shadow;
929
930 segs = info->shadow[i].req.operation == BLKIF_OP_INDIRECT ?
931 info->shadow[i].req.u.indirect.nr_segments :
932 info->shadow[i].req.u.rw.nr_segments;
933 for (j = 0; j < segs; j++) {
934 persistent_gnt = info->shadow[i].grants_used[j];
935 gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
936 __free_page(pfn_to_page(persistent_gnt->pfn));
937 kfree(persistent_gnt);
938 }
939
940 if (info->shadow[i].req.operation != BLKIF_OP_INDIRECT)
941 /*
942 * If this is not an indirect operation don't try to
943 * free indirect segments
944 */
945 goto free_shadow;
946
947 for (j = 0; j < INDIRECT_GREFS(segs); j++) {
948 persistent_gnt = info->shadow[i].indirect_grants[j];
949 gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
950 __free_page(pfn_to_page(persistent_gnt->pfn));
951 kfree(persistent_gnt);
952 }
953
954free_shadow:
955 kfree(info->shadow[i].grants_used);
956 info->shadow[i].grants_used = NULL;
957 kfree(info->shadow[i].indirect_grants);
958 info->shadow[i].indirect_grants = NULL;
959 kfree(info->shadow[i].sg);
960 info->shadow[i].sg = NULL;
961 }
962
846 /* No more gnttab callback work. */ 963 /* No more gnttab callback work. */
847 gnttab_cancel_free_callback(&info->callback); 964 gnttab_cancel_free_callback(&info->callback);
848 spin_unlock_irq(&info->io_lock); 965 spin_unlock_irq(&info->io_lock);
@@ -867,12 +984,13 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
867 struct blkif_response *bret) 984 struct blkif_response *bret)
868{ 985{
869 int i = 0; 986 int i = 0;
870 struct bio_vec *bvec; 987 struct scatterlist *sg;
871 struct req_iterator iter;
872 unsigned long flags;
873 char *bvec_data; 988 char *bvec_data;
874 void *shared_data; 989 void *shared_data;
875 unsigned int offset = 0; 990 int nseg;
991
992 nseg = s->req.operation == BLKIF_OP_INDIRECT ?
993 s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments;
876 994
877 if (bret->operation == BLKIF_OP_READ) { 995 if (bret->operation == BLKIF_OP_READ) {
878 /* 996 /*
@@ -881,26 +999,29 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
881 * than PAGE_SIZE, we have to keep track of the current offset, 999 * than PAGE_SIZE, we have to keep track of the current offset,
882 * to be sure we are copying the data from the right shared page. 1000 * to be sure we are copying the data from the right shared page.
883 */ 1001 */
884 rq_for_each_segment(bvec, s->request, iter) { 1002 for_each_sg(s->sg, sg, nseg, i) {
885 BUG_ON((bvec->bv_offset + bvec->bv_len) > PAGE_SIZE); 1003 BUG_ON(sg->offset + sg->length > PAGE_SIZE);
886 if (bvec->bv_offset < offset)
887 i++;
888 BUG_ON(i >= s->req.u.rw.nr_segments);
889 shared_data = kmap_atomic( 1004 shared_data = kmap_atomic(
890 pfn_to_page(s->grants_used[i]->pfn)); 1005 pfn_to_page(s->grants_used[i]->pfn));
891 bvec_data = bvec_kmap_irq(bvec, &flags); 1006 bvec_data = kmap_atomic(sg_page(sg));
892 memcpy(bvec_data, shared_data + bvec->bv_offset, 1007 memcpy(bvec_data + sg->offset,
893 bvec->bv_len); 1008 shared_data + sg->offset,
894 bvec_kunmap_irq(bvec_data, &flags); 1009 sg->length);
1010 kunmap_atomic(bvec_data);
895 kunmap_atomic(shared_data); 1011 kunmap_atomic(shared_data);
896 offset = bvec->bv_offset + bvec->bv_len;
897 } 1012 }
898 } 1013 }
899 /* Add the persistent grant into the list of free grants */ 1014 /* Add the persistent grant into the list of free grants */
900 for (i = 0; i < s->req.u.rw.nr_segments; i++) { 1015 for (i = 0; i < nseg; i++) {
901 list_add(&s->grants_used[i]->node, &info->persistent_gnts); 1016 list_add(&s->grants_used[i]->node, &info->persistent_gnts);
902 info->persistent_gnts_c++; 1017 info->persistent_gnts_c++;
903 } 1018 }
1019 if (s->req.operation == BLKIF_OP_INDIRECT) {
1020 for (i = 0; i < INDIRECT_GREFS(nseg); i++) {
1021 list_add(&s->indirect_grants[i]->node, &info->persistent_gnts);
1022 info->persistent_gnts_c++;
1023 }
1024 }
904} 1025}
905 1026
906static irqreturn_t blkif_interrupt(int irq, void *dev_id) 1027static irqreturn_t blkif_interrupt(int irq, void *dev_id)
@@ -1034,14 +1155,6 @@ static int setup_blkring(struct xenbus_device *dev,
1034 SHARED_RING_INIT(sring); 1155 SHARED_RING_INIT(sring);
1035 FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE); 1156 FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
1036 1157
1037 sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
1038
1039 /* Allocate memory for grants */
1040 err = fill_grant_buffer(info, BLK_RING_SIZE *
1041 BLKIF_MAX_SEGMENTS_PER_REQUEST);
1042 if (err)
1043 goto fail;
1044
1045 err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring)); 1158 err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
1046 if (err < 0) { 1159 if (err < 0) {
1047 free_page((unsigned long)sring); 1160 free_page((unsigned long)sring);
@@ -1223,13 +1336,84 @@ static int blkfront_probe(struct xenbus_device *dev,
1223 return 0; 1336 return 0;
1224} 1337}
1225 1338
1339/*
1340 * This is a clone of md_trim_bio, used to split a bio into smaller ones
1341 */
1342static void trim_bio(struct bio *bio, int offset, int size)
1343{
1344 /* 'bio' is a cloned bio which we need to trim to match
1345 * the given offset and size.
1346 * This requires adjusting bi_sector, bi_size, and bi_io_vec
1347 */
1348 int i;
1349 struct bio_vec *bvec;
1350 int sofar = 0;
1351
1352 size <<= 9;
1353 if (offset == 0 && size == bio->bi_size)
1354 return;
1355
1356 bio->bi_sector += offset;
1357 bio->bi_size = size;
1358 offset <<= 9;
1359 clear_bit(BIO_SEG_VALID, &bio->bi_flags);
1360
1361 while (bio->bi_idx < bio->bi_vcnt &&
1362 bio->bi_io_vec[bio->bi_idx].bv_len <= offset) {
1363 /* remove this whole bio_vec */
1364 offset -= bio->bi_io_vec[bio->bi_idx].bv_len;
1365 bio->bi_idx++;
1366 }
1367 if (bio->bi_idx < bio->bi_vcnt) {
1368 bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
1369 bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
1370 }
1371 /* avoid any complications with bi_idx being non-zero*/
1372 if (bio->bi_idx) {
1373 memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
1374 (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec));
1375 bio->bi_vcnt -= bio->bi_idx;
1376 bio->bi_idx = 0;
1377 }
1378 /* Make sure vcnt and last bv are not too big */
1379 bio_for_each_segment(bvec, bio, i) {
1380 if (sofar + bvec->bv_len > size)
1381 bvec->bv_len = size - sofar;
1382 if (bvec->bv_len == 0) {
1383 bio->bi_vcnt = i;
1384 break;
1385 }
1386 sofar += bvec->bv_len;
1387 }
1388}
1389
1390static void split_bio_end(struct bio *bio, int error)
1391{
1392 struct split_bio *split_bio = bio->bi_private;
1393
1394 if (error)
1395 split_bio->err = error;
1396
1397 if (atomic_dec_and_test(&split_bio->pending)) {
1398 split_bio->bio->bi_phys_segments = 0;
1399 bio_endio(split_bio->bio, split_bio->err);
1400 kfree(split_bio);
1401 }
1402 bio_put(bio);
1403}
1226 1404
1227static int blkif_recover(struct blkfront_info *info) 1405static int blkif_recover(struct blkfront_info *info)
1228{ 1406{
1229 int i; 1407 int i;
1230 struct blkif_request *req; 1408 struct request *req, *n;
1231 struct blk_shadow *copy; 1409 struct blk_shadow *copy;
1232 int j; 1410 int rc;
1411 struct bio *bio, *cloned_bio;
1412 struct bio_list bio_list, merge_bio;
1413 unsigned int segs, offset;
1414 int pending, size;
1415 struct split_bio *split_bio;
1416 struct list_head requests;
1233 1417
1234 /* Stage 1: Make a safe copy of the shadow state. */ 1418 /* Stage 1: Make a safe copy of the shadow state. */
1235 copy = kmemdup(info->shadow, sizeof(info->shadow), 1419 copy = kmemdup(info->shadow, sizeof(info->shadow),
@@ -1244,36 +1428,64 @@ static int blkif_recover(struct blkfront_info *info)
1244 info->shadow_free = info->ring.req_prod_pvt; 1428 info->shadow_free = info->ring.req_prod_pvt;
1245 info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff; 1429 info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
1246 1430
1247 /* Stage 3: Find pending requests and requeue them. */ 1431 rc = blkfront_setup_indirect(info);
1432 if (rc) {
1433 kfree(copy);
1434 return rc;
1435 }
1436
1437 segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST;
1438 blk_queue_max_segments(info->rq, segs);
1439 bio_list_init(&bio_list);
1440 INIT_LIST_HEAD(&requests);
1248 for (i = 0; i < BLK_RING_SIZE; i++) { 1441 for (i = 0; i < BLK_RING_SIZE; i++) {
1249 /* Not in use? */ 1442 /* Not in use? */
1250 if (!copy[i].request) 1443 if (!copy[i].request)
1251 continue; 1444 continue;
1252 1445
1253 /* Grab a request slot and copy shadow state into it. */ 1446 /*
1254 req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); 1447 * Get the bios in the request so we can re-queue them.
1255 *req = copy[i].req; 1448 */
1256 1449 if (copy[i].request->cmd_flags &
1257 /* We get a new request id, and must reset the shadow state. */ 1450 (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
1258 req->u.rw.id = get_id_from_freelist(info); 1451 /*
1259 memcpy(&info->shadow[req->u.rw.id], &copy[i], sizeof(copy[i])); 1452 * Flush operations don't contain bios, so
1260 1453 * we need to requeue the whole request
1261 if (req->operation != BLKIF_OP_DISCARD) { 1454 */
1262 /* Rewrite any grant references invalidated by susp/resume. */ 1455 list_add(&copy[i].request->queuelist, &requests);
1263 for (j = 0; j < req->u.rw.nr_segments; j++) 1456 continue;
1264 gnttab_grant_foreign_access_ref(
1265 req->u.rw.seg[j].gref,
1266 info->xbdev->otherend_id,
1267 pfn_to_mfn(copy[i].grants_used[j]->pfn),
1268 0);
1269 } 1457 }
1270 info->shadow[req->u.rw.id].req = *req; 1458 merge_bio.head = copy[i].request->bio;
1271 1459 merge_bio.tail = copy[i].request->biotail;
1272 info->ring.req_prod_pvt++; 1460 bio_list_merge(&bio_list, &merge_bio);
1461 copy[i].request->bio = NULL;
1462 blk_put_request(copy[i].request);
1273 } 1463 }
1274 1464
1275 kfree(copy); 1465 kfree(copy);
1276 1466
1467 /*
1468 * Empty the queue, this is important because we might have
1469 * requests in the queue with more segments than what we
1470 * can handle now.
1471 */
1472 spin_lock_irq(&info->io_lock);
1473 while ((req = blk_fetch_request(info->rq)) != NULL) {
1474 if (req->cmd_flags &
1475 (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
1476 list_add(&req->queuelist, &requests);
1477 continue;
1478 }
1479 merge_bio.head = req->bio;
1480 merge_bio.tail = req->biotail;
1481 bio_list_merge(&bio_list, &merge_bio);
1482 req->bio = NULL;
1483 if (req->cmd_flags & (REQ_FLUSH | REQ_FUA))
1484 pr_alert("diskcache flush request found!\n");
1485 __blk_put_request(info->rq, req);
1486 }
1487 spin_unlock_irq(&info->io_lock);
1488
1277 xenbus_switch_state(info->xbdev, XenbusStateConnected); 1489 xenbus_switch_state(info->xbdev, XenbusStateConnected);
1278 1490
1279 spin_lock_irq(&info->io_lock); 1491 spin_lock_irq(&info->io_lock);
@@ -1281,14 +1493,50 @@ static int blkif_recover(struct blkfront_info *info)
1281 /* Now safe for us to use the shared ring */ 1493 /* Now safe for us to use the shared ring */
1282 info->connected = BLKIF_STATE_CONNECTED; 1494 info->connected = BLKIF_STATE_CONNECTED;
1283 1495
1284 /* Send off requeued requests */
1285 flush_requests(info);
1286
1287 /* Kick any other new requests queued since we resumed */ 1496 /* Kick any other new requests queued since we resumed */
1288 kick_pending_request_queues(info); 1497 kick_pending_request_queues(info);
1289 1498
1499 list_for_each_entry_safe(req, n, &requests, queuelist) {
1500 /* Requeue pending requests (flush or discard) */
1501 list_del_init(&req->queuelist);
1502 BUG_ON(req->nr_phys_segments > segs);
1503 blk_requeue_request(info->rq, req);
1504 }
1290 spin_unlock_irq(&info->io_lock); 1505 spin_unlock_irq(&info->io_lock);
1291 1506
1507 while ((bio = bio_list_pop(&bio_list)) != NULL) {
1508 /* Traverse the list of pending bios and re-queue them */
1509 if (bio_segments(bio) > segs) {
1510 /*
1511 * This bio has more segments than what we can
1512 * handle, we have to split it.
1513 */
1514 pending = (bio_segments(bio) + segs - 1) / segs;
1515 split_bio = kzalloc(sizeof(*split_bio), GFP_NOIO);
1516 BUG_ON(split_bio == NULL);
1517 atomic_set(&split_bio->pending, pending);
1518 split_bio->bio = bio;
1519 for (i = 0; i < pending; i++) {
1520 offset = (i * segs * PAGE_SIZE) >> 9;
1521 size = min((unsigned int)(segs * PAGE_SIZE) >> 9,
1522 (unsigned int)(bio->bi_size >> 9) - offset);
1523 cloned_bio = bio_clone(bio, GFP_NOIO);
1524 BUG_ON(cloned_bio == NULL);
1525 trim_bio(cloned_bio, offset, size);
1526 cloned_bio->bi_private = split_bio;
1527 cloned_bio->bi_end_io = split_bio_end;
1528 submit_bio(cloned_bio->bi_rw, cloned_bio);
1529 }
1530 /*
1531 * Now we have to wait for all those smaller bios to
1532 * end, so we can also end the "parent" bio.
1533 */
1534 continue;
1535 }
1536 /* We don't need to split this bio */
1537 submit_bio(bio->bi_rw, bio);
1538 }
1539
1292 return 0; 1540 return 0;
1293} 1541}
1294 1542
@@ -1308,8 +1556,12 @@ static int blkfront_resume(struct xenbus_device *dev)
1308 blkif_free(info, info->connected == BLKIF_STATE_CONNECTED); 1556 blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
1309 1557
1310 err = talk_to_blkback(dev, info); 1558 err = talk_to_blkback(dev, info);
1311 if (info->connected == BLKIF_STATE_SUSPENDED && !err) 1559
1312 err = blkif_recover(info); 1560 /*
1561 * We have to wait for the backend to switch to
1562 * connected state, since we want to read which
1563 * features it supports.
1564 */
1313 1565
1314 return err; 1566 return err;
1315} 1567}
@@ -1387,6 +1639,60 @@ static void blkfront_setup_discard(struct blkfront_info *info)
1387 kfree(type); 1639 kfree(type);
1388} 1640}
1389 1641
1642static int blkfront_setup_indirect(struct blkfront_info *info)
1643{
1644 unsigned int indirect_segments, segs;
1645 int err, i;
1646
1647 err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
1648 "feature-max-indirect-segments", "%u", &indirect_segments,
1649 NULL);
1650 if (err) {
1651 info->max_indirect_segments = 0;
1652 segs = BLKIF_MAX_SEGMENTS_PER_REQUEST;
1653 } else {
1654 info->max_indirect_segments = min(indirect_segments,
1655 xen_blkif_max_segments);
1656 segs = info->max_indirect_segments;
1657 }
1658
1659 err = fill_grant_buffer(info, (segs + INDIRECT_GREFS(segs)) * BLK_RING_SIZE);
1660 if (err)
1661 goto out_of_memory;
1662
1663 for (i = 0; i < BLK_RING_SIZE; i++) {
1664 info->shadow[i].grants_used = kzalloc(
1665 sizeof(info->shadow[i].grants_used[0]) * segs,
1666 GFP_NOIO);
1667 info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * segs, GFP_NOIO);
1668 if (info->max_indirect_segments)
1669 info->shadow[i].indirect_grants = kzalloc(
1670 sizeof(info->shadow[i].indirect_grants[0]) *
1671 INDIRECT_GREFS(segs),
1672 GFP_NOIO);
1673 if ((info->shadow[i].grants_used == NULL) ||
1674 (info->shadow[i].sg == NULL) ||
1675 (info->max_indirect_segments &&
1676 (info->shadow[i].indirect_grants == NULL)))
1677 goto out_of_memory;
1678 sg_init_table(info->shadow[i].sg, segs);
1679 }
1680
1681
1682 return 0;
1683
1684out_of_memory:
1685 for (i = 0; i < BLK_RING_SIZE; i++) {
1686 kfree(info->shadow[i].grants_used);
1687 info->shadow[i].grants_used = NULL;
1688 kfree(info->shadow[i].sg);
1689 info->shadow[i].sg = NULL;
1690 kfree(info->shadow[i].indirect_grants);
1691 info->shadow[i].indirect_grants = NULL;
1692 }
1693 return -ENOMEM;
1694}
1695
1390/* 1696/*
1391 * Invoked when the backend is finally 'ready' (and has produced 1697 * Invoked when the backend is finally 'ready' (and has produced
1392 * the details about the physical device - #sectors, size, etc). 1698 * the details about the physical device - #sectors, size, etc).
@@ -1395,6 +1701,7 @@ static void blkfront_connect(struct blkfront_info *info)
1395{ 1701{
1396 unsigned long long sectors; 1702 unsigned long long sectors;
1397 unsigned long sector_size; 1703 unsigned long sector_size;
1704 unsigned int physical_sector_size;
1398 unsigned int binfo; 1705 unsigned int binfo;
1399 int err; 1706 int err;
1400 int barrier, flush, discard, persistent; 1707 int barrier, flush, discard, persistent;
@@ -1414,8 +1721,15 @@ static void blkfront_connect(struct blkfront_info *info)
1414 set_capacity(info->gd, sectors); 1721 set_capacity(info->gd, sectors);
1415 revalidate_disk(info->gd); 1722 revalidate_disk(info->gd);
1416 1723
1417 /* fall through */ 1724 return;
1418 case BLKIF_STATE_SUSPENDED: 1725 case BLKIF_STATE_SUSPENDED:
1726 /*
1727 * If we are recovering from suspension, we need to wait
1728 * for the backend to announce it's features before
1729 * reconnecting, at least we need to know if the backend
1730 * supports indirect descriptors, and how many.
1731 */
1732 blkif_recover(info);
1419 return; 1733 return;
1420 1734
1421 default: 1735 default:
@@ -1437,6 +1751,16 @@ static void blkfront_connect(struct blkfront_info *info)
1437 return; 1751 return;
1438 } 1752 }
1439 1753
1754 /*
1755 * physical-sector-size is a newer field, so old backends may not
1756 * provide this. Assume physical sector size to be the same as
1757 * sector_size in that case.
1758 */
1759 err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
1760 "physical-sector-size", "%u", &physical_sector_size);
1761 if (err != 1)
1762 physical_sector_size = sector_size;
1763
1440 info->feature_flush = 0; 1764 info->feature_flush = 0;
1441 info->flush_op = 0; 1765 info->flush_op = 0;
1442 1766
@@ -1483,7 +1807,15 @@ static void blkfront_connect(struct blkfront_info *info)
1483 else 1807 else
1484 info->feature_persistent = persistent; 1808 info->feature_persistent = persistent;
1485 1809
1486 err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size); 1810 err = blkfront_setup_indirect(info);
1811 if (err) {
1812 xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s",
1813 info->xbdev->otherend);
1814 return;
1815 }
1816
1817 err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size,
1818 physical_sector_size);
1487 if (err) { 1819 if (err) {
1488 xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", 1820 xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
1489 info->xbdev->otherend); 1821 info->xbdev->otherend);
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 8bda1294c035..dac7738df7ff 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -272,6 +272,8 @@ enum {
272 * - memcg: use_hierarchy is on by default and the cgroup file for 272 * - memcg: use_hierarchy is on by default and the cgroup file for
273 * the flag is not created. 273 * the flag is not created.
274 * 274 *
275 * - blkcg: blk-throttle becomes properly hierarchical.
276 *
275 * The following are planned changes. 277 * The following are planned changes.
276 * 278 *
277 * - release_agent will be disallowed once replacement notification 279 * - release_agent will be disallowed once replacement notification
diff --git a/include/linux/drbd.h b/include/linux/drbd.h
index 1b4d4ee1168f..de7d74ab3de6 100644
--- a/include/linux/drbd.h
+++ b/include/linux/drbd.h
@@ -177,7 +177,11 @@ enum drbd_ret_code {
177 ERR_NEED_APV_100 = 163, 177 ERR_NEED_APV_100 = 163,
178 ERR_NEED_ALLOW_TWO_PRI = 164, 178 ERR_NEED_ALLOW_TWO_PRI = 164,
179 ERR_MD_UNCLEAN = 165, 179 ERR_MD_UNCLEAN = 165,
180 180 ERR_MD_LAYOUT_CONNECTED = 166,
181 ERR_MD_LAYOUT_TOO_BIG = 167,
182 ERR_MD_LAYOUT_TOO_SMALL = 168,
183 ERR_MD_LAYOUT_NO_FIT = 169,
184 ERR_IMPLICIT_SHRINK = 170,
181 /* insert new ones above this line */ 185 /* insert new ones above this line */
182 AFTER_LAST_ERR_CODE 186 AFTER_LAST_ERR_CODE
183}; 187};
diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h
index d0d8fac8a6e4..e8c44572b8cb 100644
--- a/include/linux/drbd_genl.h
+++ b/include/linux/drbd_genl.h
@@ -181,6 +181,8 @@ GENL_struct(DRBD_NLA_RESIZE_PARMS, 7, resize_parms,
181 __u64_field(1, DRBD_GENLA_F_MANDATORY, resize_size) 181 __u64_field(1, DRBD_GENLA_F_MANDATORY, resize_size)
182 __flg_field(2, DRBD_GENLA_F_MANDATORY, resize_force) 182 __flg_field(2, DRBD_GENLA_F_MANDATORY, resize_force)
183 __flg_field(3, DRBD_GENLA_F_MANDATORY, no_resync) 183 __flg_field(3, DRBD_GENLA_F_MANDATORY, no_resync)
184 __u32_field_def(4, 0 /* OPTIONAL */, al_stripes, DRBD_AL_STRIPES_DEF)
185 __u32_field_def(5, 0 /* OPTIONAL */, al_stripe_size, DRBD_AL_STRIPE_SIZE_DEF)
184) 186)
185 187
186GENL_struct(DRBD_NLA_STATE_INFO, 8, state_info, 188GENL_struct(DRBD_NLA_STATE_INFO, 8, state_info,
diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h
index 1fedf2b17cc8..17e50bb00521 100644
--- a/include/linux/drbd_limits.h
+++ b/include/linux/drbd_limits.h
@@ -215,4 +215,13 @@
215#define DRBD_ALWAYS_ASBP_DEF 0 215#define DRBD_ALWAYS_ASBP_DEF 0
216#define DRBD_USE_RLE_DEF 1 216#define DRBD_USE_RLE_DEF 1
217 217
218#define DRBD_AL_STRIPES_MIN 1
219#define DRBD_AL_STRIPES_MAX 1024
220#define DRBD_AL_STRIPES_DEF 1
221#define DRBD_AL_STRIPES_SCALE '1'
222
223#define DRBD_AL_STRIPE_SIZE_MIN 4
224#define DRBD_AL_STRIPE_SIZE_MAX 16777216
225#define DRBD_AL_STRIPE_SIZE_DEF 32
226#define DRBD_AL_STRIPE_SIZE_SCALE 'k' /* kilobytes */
218#endif 227#endif
diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h
index ffd4652de91c..65e12099ef89 100644
--- a/include/xen/interface/io/blkif.h
+++ b/include/xen/interface/io/blkif.h
@@ -103,12 +103,46 @@ typedef uint64_t blkif_sector_t;
103#define BLKIF_OP_DISCARD 5 103#define BLKIF_OP_DISCARD 5
104 104
105/* 105/*
106 * Recognized if "feature-max-indirect-segments" is present in the backend
107 * xenbus info. The "feature-max-indirect-segments" node contains the maximum
108 * number of segments allowed by the backend per request. If the node is
109 * present, the frontend might use blkif_request_indirect structs in order to
110 * issue requests with more than BLKIF_MAX_SEGMENTS_PER_REQUEST (11). The
111 * maximum number of indirect segments is fixed by the backend, but the
112 * frontend can issue requests with any number of indirect segments as long as
113 * it's less than the number provided by the backend. The indirect_grefs field
114 * in blkif_request_indirect should be filled by the frontend with the
115 * grant references of the pages that are holding the indirect segments.
116 * These pages are filled with an array of blkif_request_segment_aligned
117 * that hold the information about the segments. The number of indirect
118 * pages to use is determined by the maximum number of segments
119 * an indirect request contains. Every indirect page can contain a maximum
120 * of 512 segments (PAGE_SIZE/sizeof(blkif_request_segment_aligned)),
121 * so to calculate the number of indirect pages to use we have to do
122 * ceil(indirect_segments/512).
123 *
124 * If a backend does not recognize BLKIF_OP_INDIRECT, it should *not*
125 * create the "feature-max-indirect-segments" node!
126 */
127#define BLKIF_OP_INDIRECT 6
128
129/*
106 * Maximum scatter/gather segments per request. 130 * Maximum scatter/gather segments per request.
107 * This is carefully chosen so that sizeof(struct blkif_ring) <= PAGE_SIZE. 131 * This is carefully chosen so that sizeof(struct blkif_ring) <= PAGE_SIZE.
108 * NB. This could be 12 if the ring indexes weren't stored in the same page. 132 * NB. This could be 12 if the ring indexes weren't stored in the same page.
109 */ 133 */
110#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11 134#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
111 135
136#define BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST 8
137
138struct blkif_request_segment_aligned {
139 grant_ref_t gref; /* reference to I/O buffer frame */
140 /* @first_sect: first sector in frame to transfer (inclusive). */
141 /* @last_sect: last sector in frame to transfer (inclusive). */
142 uint8_t first_sect, last_sect;
143 uint16_t _pad; /* padding to make it 8 bytes, so it's cache-aligned */
144} __attribute__((__packed__));
145
112struct blkif_request_rw { 146struct blkif_request_rw {
113 uint8_t nr_segments; /* number of segments */ 147 uint8_t nr_segments; /* number of segments */
114 blkif_vdev_t handle; /* only for read/write requests */ 148 blkif_vdev_t handle; /* only for read/write requests */
@@ -147,12 +181,31 @@ struct blkif_request_other {
147 uint64_t id; /* private guest value, echoed in resp */ 181 uint64_t id; /* private guest value, echoed in resp */
148} __attribute__((__packed__)); 182} __attribute__((__packed__));
149 183
184struct blkif_request_indirect {
185 uint8_t indirect_op;
186 uint16_t nr_segments;
187#ifdef CONFIG_X86_64
188 uint32_t _pad1; /* offsetof(blkif_...,u.indirect.id) == 8 */
189#endif
190 uint64_t id;
191 blkif_sector_t sector_number;
192 blkif_vdev_t handle;
193 uint16_t _pad2;
194 grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
195#ifdef CONFIG_X86_64
196 uint32_t _pad3; /* make it 64 byte aligned */
197#else
198 uint64_t _pad3; /* make it 64 byte aligned */
199#endif
200} __attribute__((__packed__));
201
150struct blkif_request { 202struct blkif_request {
151 uint8_t operation; /* BLKIF_OP_??? */ 203 uint8_t operation; /* BLKIF_OP_??? */
152 union { 204 union {
153 struct blkif_request_rw rw; 205 struct blkif_request_rw rw;
154 struct blkif_request_discard discard; 206 struct blkif_request_discard discard;
155 struct blkif_request_other other; 207 struct blkif_request_other other;
208 struct blkif_request_indirect indirect;
156 } u; 209 } u;
157} __attribute__((__packed__)); 210} __attribute__((__packed__));
158 211
diff --git a/include/xen/interface/io/ring.h b/include/xen/interface/io/ring.h
index 75271b9a8f61..7d28aff605c7 100644
--- a/include/xen/interface/io/ring.h
+++ b/include/xen/interface/io/ring.h
@@ -188,6 +188,11 @@ struct __name##_back_ring { \
188#define RING_REQUEST_CONS_OVERFLOW(_r, _cons) \ 188#define RING_REQUEST_CONS_OVERFLOW(_r, _cons) \
189 (((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r)) 189 (((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r))
190 190
191/* Ill-behaved frontend determination: Can there be this many requests? */
192#define RING_REQUEST_PROD_OVERFLOW(_r, _prod) \
193 (((_prod) - (_r)->rsp_prod_pvt) > RING_SIZE(_r))
194
195
191#define RING_PUSH_REQUESTS(_r) do { \ 196#define RING_PUSH_REQUESTS(_r) do { \
192 wmb(); /* back sees requests /before/ updated producer index */ \ 197 wmb(); /* back sees requests /before/ updated producer index */ \
193 (_r)->sring->req_prod = (_r)->req_prod_pvt; \ 198 (_r)->sring->req_prod = (_r)->req_prod_pvt; \