29 files changed, 2854 insertions, 1027 deletions
diff --git a/Documentation/ABI/testing/sysfs-driver-xen-blkback b/Documentation/ABI/testing/sysfs-driver-xen-blkback
new file mode 100644
index 000000000000..8bb43b66eb55
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-driver-xen-blkback
@@ -0,0 +1,17 @@ | |||
1 | What: /sys/module/xen_blkback/parameters/max_buffer_pages | ||
2 | Date: March 2013 | ||
3 | KernelVersion: 3.11 | ||
4 | Contact: Roger Pau Monné <roger.pau@citrix.com> | ||
5 | Description: | ||
6 | Maximum number of free pages to keep in each block | ||
7 | backend buffer. | ||
8 | |||
9 | What: /sys/module/xen_blkback/parameters/max_persistent_grants | ||
10 | Date: March 2013 | ||
11 | KernelVersion: 3.11 | ||
12 | Contact: Roger Pau Monné <roger.pau@citrix.com> | ||
13 | Description: | ||
14 | Maximum number of grants to map persistently in | ||
15 | blkback. If the frontend tries to use more than | ||
16 | max_persistent_grants, the LRU kicks in and starts | ||
17 | removing 5% of max_persistent_grants every 100ms. | ||
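The 5%-per-100ms figure above determines how quickly blkback sheds persistent grants once the frontend exceeds the limit. A minimal user-space sketch of that arithmetic follows; the constants and helper are illustrative and are not taken from the xen-blkback driver itself.

/* Sketch only: models the documented LRU sweep rate. */
#include <stdio.h>

#define LRU_INTERVAL_MS   100   /* one sweep every 100ms, per the description */
#define LRU_PERCENT_CLEAN 5     /* 5% of max_persistent_grants per sweep */

static unsigned int grants_removed_per_sweep(unsigned int max_persistent_grants)
{
        return max_persistent_grants * LRU_PERCENT_CLEAN / 100;
}

int main(void)
{
        unsigned int max = 352;         /* example value of the module parameter */

        printf("LRU removes up to %u grants every %u ms\n",
               grants_removed_per_sweep(max), LRU_INTERVAL_MS);
        return 0;
}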
diff --git a/Documentation/ABI/testing/sysfs-driver-xen-blkfront b/Documentation/ABI/testing/sysfs-driver-xen-blkfront
new file mode 100644
index 000000000000..c0a6cb7eb314
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-driver-xen-blkfront
@@ -0,0 +1,10 @@ | |||
1 | What: /sys/module/xen_blkfront/parameters/max | ||
2 | Date: June 2013 | ||
3 | KernelVersion: 3.11 | ||
4 | Contact: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> | ||
5 | Description: | ||
6 | Maximum number of segments that the frontend will negotiate | ||
7 | with the backend for indirect descriptors. The default value | ||
8 | is 32 - higher value means more potential throughput but more | ||
9 | memory usage. The backend picks the minimum of the frontend | ||
10 | and its default backend value. | ||
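The negotiation rule in the last sentence is simply a minimum of the two sides' values. A hedged sketch, with illustrative names rather than the driver's own:

#include <stdio.h>

/* Sketch: the indirect-segment count actually used is the smaller of
 * what the frontend requests and what the backend supports. */
static unsigned int negotiate_segments(unsigned int frontend_max,
                                       unsigned int backend_max)
{
        return frontend_max < backend_max ? frontend_max : backend_max;
}

int main(void)
{
        /* frontend default of 32 against a backend supporting 256 */
        printf("negotiated: %u segments\n", negotiate_segments(32, 256));
        return 0;
}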
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index da272c8f44e7..cd556b914786 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -94,11 +94,13 @@ Throttling/Upper Limit policy | |||
94 | 94 | ||
95 | Hierarchical Cgroups | 95 | Hierarchical Cgroups |
96 | ==================== | 96 | ==================== |
97 | - Currently only CFQ supports hierarchical groups. For throttling, | ||
98 | cgroup interface does allow creation of hierarchical cgroups and | ||
99 | internally it treats them as flat hierarchy. | ||
100 | 97 | ||
101 | If somebody created a hierarchy like as follows. | 98 | Both CFQ and throttling implement hierarchy support; however, |
99 | throttling's hierarchy support is enabled iff "sane_behavior" is | ||
100 | enabled from cgroup side, which currently is a development option and | ||
101 | not publicly available. | ||
102 | |||
103 | If somebody created a hierarchy like as follows. | ||
102 | 104 | ||
103 | root | 105 | root |
104 | / \ | 106 | / \ |
@@ -106,21 +108,20 @@ Hierarchical Cgroups | |||
106 | | | 108 | | |
107 | test3 | 109 | test3 |
108 | 110 | ||
109 | CFQ will handle the hierarchy correctly but and throttling will | 111 | CFQ by default and throttling with "sane_behavior" will handle the |
110 | practically treat all groups at same level. For details on CFQ | 112 | hierarchy correctly. For details on CFQ hierarchy support, refer to |
111 | hierarchy support, refer to Documentation/block/cfq-iosched.txt. | 113 | Documentation/block/cfq-iosched.txt. For throttling, all limits apply |
112 | Throttling will treat the hierarchy as if it looks like the | 114 | to the whole subtree while all statistics are local to the IOs |
113 | following. | 115 | directly generated by tasks in that cgroup. |
116 | |||
117 | Throttling without "sane_behavior" enabled from cgroup side will | ||
118 | practically treat all groups at same level as if it looks like the | ||
119 | following. | ||
114 | 120 | ||
115 | pivot | 121 | pivot |
116 | / / \ \ | 122 | / / \ \ |
117 | root test1 test2 test3 | 123 | root test1 test2 test3 |
118 | 124 | ||
119 | Nesting cgroups, while allowed, isn't officially supported and blkio | ||
120 | genereates warning when cgroups nest. Once throttling implements | ||
121 | hierarchy support, hierarchy will be supported and the warning will | ||
122 | be removed. | ||
123 | |||
124 | Various user visible config options | 125 | Various user visible config options |
125 | =================================== | 126 | =================================== |
126 | CONFIG_BLK_CGROUP | 127 | CONFIG_BLK_CGROUP |
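Under the hierarchical throttling behavior described earlier in this hunk, IO issued in a leaf cgroup must fit within the limit of every ancestor, so the effective ceiling is the tightest limit on the path to the root. A hedged user-space model of that rule; the types and names are illustrative, not kernel API:

#include <stdint.h>
#include <stdio.h>

struct grp {
        struct grp *parent;
        uint64_t read_bps;              /* UINT64_MAX == unlimited */
};

/* Effective read bandwidth for a group: the minimum limit along the
 * path from the group up to the root. */
static uint64_t effective_read_bps(const struct grp *g)
{
        uint64_t limit = UINT64_MAX;

        for (; g; g = g->parent)
                if (g->read_bps < limit)
                        limit = g->read_bps;
        return limit;
}

int main(void)
{
        struct grp root  = { NULL,  16 << 20 };         /* 16M limit on root */
        struct grp test1 = { &root, UINT64_MAX };       /* no limit of its own */

        printf("test1 effective limit: %llu bytes/s\n",
               (unsigned long long)effective_read_bps(&test1));
        return 0;
}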
diff --git a/MAINTAINERS b/MAINTAINERS
index 58814f0f06bd..a0a76fb7323f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3297,7 +3297,7 @@ F: Documentation/firmware_class/ | |||
3297 | F: drivers/base/firmware*.c | 3297 | F: drivers/base/firmware*.c |
3298 | F: include/linux/firmware.h | 3298 | F: include/linux/firmware.h |
3299 | 3299 | ||
3300 | FLASHSYSTEM DRIVER (IBM FlashSystem 70/80 PCI SSD Flash Card) | 3300 | FLASH ADAPTER DRIVER (IBM Flash Adapter 900GB Full Height PCI Flash Card) |
3301 | M: Joshua Morris <josh.h.morris@us.ibm.com> | 3301 | M: Joshua Morris <josh.h.morris@us.ibm.com> |
3302 | M: Philip Kelleher <pjk1939@linux.vnet.ibm.com> | 3302 | M: Philip Kelleher <pjk1939@linux.vnet.ibm.com> |
3303 | S: Maintained | 3303 | S: Maintained |
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index e8918ffaf96d..290792a13e3c 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -32,26 +32,6 @@ EXPORT_SYMBOL_GPL(blkcg_root); | |||
32 | 32 | ||
33 | static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; | 33 | static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; |
34 | 34 | ||
35 | static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, | ||
36 | struct request_queue *q, bool update_hint); | ||
37 | |||
38 | /** | ||
39 | * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants | ||
40 | * @d_blkg: loop cursor pointing to the current descendant | ||
41 | * @pos_cgrp: used for iteration | ||
42 | * @p_blkg: target blkg to walk descendants of | ||
43 | * | ||
44 | * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU | ||
45 | * read locked. If called under either blkcg or queue lock, the iteration | ||
46 | * is guaranteed to include all and only online blkgs. The caller may | ||
47 | * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip | ||
48 | * subtree. | ||
49 | */ | ||
50 | #define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg) \ | ||
51 | cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \ | ||
52 | if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \ | ||
53 | (p_blkg)->q, false))) | ||
54 | |||
55 | static bool blkcg_policy_enabled(struct request_queue *q, | 35 | static bool blkcg_policy_enabled(struct request_queue *q, |
56 | const struct blkcg_policy *pol) | 36 | const struct blkcg_policy *pol) |
57 | { | 37 | { |
@@ -71,18 +51,8 @@ static void blkg_free(struct blkcg_gq *blkg) | |||
71 | if (!blkg) | 51 | if (!blkg) |
72 | return; | 52 | return; |
73 | 53 | ||
74 | for (i = 0; i < BLKCG_MAX_POLS; i++) { | 54 | for (i = 0; i < BLKCG_MAX_POLS; i++) |
75 | struct blkcg_policy *pol = blkcg_policy[i]; | 55 | kfree(blkg->pd[i]); |
76 | struct blkg_policy_data *pd = blkg->pd[i]; | ||
77 | |||
78 | if (!pd) | ||
79 | continue; | ||
80 | |||
81 | if (pol && pol->pd_exit_fn) | ||
82 | pol->pd_exit_fn(blkg); | ||
83 | |||
84 | kfree(pd); | ||
85 | } | ||
86 | 56 | ||
87 | blk_exit_rl(&blkg->rl); | 57 | blk_exit_rl(&blkg->rl); |
88 | kfree(blkg); | 58 | kfree(blkg); |
@@ -134,10 +104,6 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, | |||
134 | blkg->pd[i] = pd; | 104 | blkg->pd[i] = pd; |
135 | pd->blkg = blkg; | 105 | pd->blkg = blkg; |
136 | pd->plid = i; | 106 | pd->plid = i; |
137 | |||
138 | /* invoke per-policy init */ | ||
139 | if (pol->pd_init_fn) | ||
140 | pol->pd_init_fn(blkg); | ||
141 | } | 107 | } |
142 | 108 | ||
143 | return blkg; | 109 | return blkg; |
@@ -158,8 +124,8 @@ err_free: | |||
158 | * @q's bypass state. If @update_hint is %true, the caller should be | 124 | * @q's bypass state. If @update_hint is %true, the caller should be |
159 | * holding @q->queue_lock and lookup hint is updated on success. | 125 | * holding @q->queue_lock and lookup hint is updated on success. |
160 | */ | 126 | */ |
161 | static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, | 127 | struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, |
162 | struct request_queue *q, bool update_hint) | 128 | bool update_hint) |
163 | { | 129 | { |
164 | struct blkcg_gq *blkg; | 130 | struct blkcg_gq *blkg; |
165 | 131 | ||
@@ -234,16 +200,25 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, | |||
234 | } | 200 | } |
235 | blkg = new_blkg; | 201 | blkg = new_blkg; |
236 | 202 | ||
237 | /* link parent and insert */ | 203 | /* link parent */ |
238 | if (blkcg_parent(blkcg)) { | 204 | if (blkcg_parent(blkcg)) { |
239 | blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); | 205 | blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); |
240 | if (WARN_ON_ONCE(!blkg->parent)) { | 206 | if (WARN_ON_ONCE(!blkg->parent)) { |
241 | blkg = ERR_PTR(-EINVAL); | 207 | ret = -EINVAL; |
242 | goto err_put_css; | 208 | goto err_put_css; |
243 | } | 209 | } |
244 | blkg_get(blkg->parent); | 210 | blkg_get(blkg->parent); |
245 | } | 211 | } |
246 | 212 | ||
213 | /* invoke per-policy init */ | ||
214 | for (i = 0; i < BLKCG_MAX_POLS; i++) { | ||
215 | struct blkcg_policy *pol = blkcg_policy[i]; | ||
216 | |||
217 | if (blkg->pd[i] && pol->pd_init_fn) | ||
218 | pol->pd_init_fn(blkg); | ||
219 | } | ||
220 | |||
221 | /* insert */ | ||
247 | spin_lock(&blkcg->lock); | 222 | spin_lock(&blkcg->lock); |
248 | ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); | 223 | ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); |
249 | if (likely(!ret)) { | 224 | if (likely(!ret)) { |
@@ -394,30 +369,38 @@ static void blkg_destroy_all(struct request_queue *q) | |||
394 | q->root_rl.blkg = NULL; | 369 | q->root_rl.blkg = NULL; |
395 | } | 370 | } |
396 | 371 | ||
397 | static void blkg_rcu_free(struct rcu_head *rcu_head) | 372 | /* |
373 | * A group is RCU protected, but having an rcu lock does not mean that one | ||
374 | * can access all the fields of blkg and assume these are valid. For | ||
375 | * example, don't try to follow throtl_data and request queue links. | ||
376 | * | ||
377 | * Having a reference to blkg under an rcu allows accesses to only values | ||
378 | * local to groups like group stats and group rate limits. | ||
379 | */ | ||
380 | void __blkg_release_rcu(struct rcu_head *rcu_head) | ||
398 | { | 381 | { |
399 | blkg_free(container_of(rcu_head, struct blkcg_gq, rcu_head)); | 382 | struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head); |
400 | } | 383 | int i; |
384 | |||
385 | /* tell policies that this one is being freed */ | ||
386 | for (i = 0; i < BLKCG_MAX_POLS; i++) { | ||
387 | struct blkcg_policy *pol = blkcg_policy[i]; | ||
388 | |||
389 | if (blkg->pd[i] && pol->pd_exit_fn) | ||
390 | pol->pd_exit_fn(blkg); | ||
391 | } | ||
401 | 392 | ||
402 | void __blkg_release(struct blkcg_gq *blkg) | ||
403 | { | ||
404 | /* release the blkcg and parent blkg refs this blkg has been holding */ | 393 | /* release the blkcg and parent blkg refs this blkg has been holding */ |
405 | css_put(&blkg->blkcg->css); | 394 | css_put(&blkg->blkcg->css); |
406 | if (blkg->parent) | 395 | if (blkg->parent) { |
396 | spin_lock_irq(blkg->q->queue_lock); | ||
407 | blkg_put(blkg->parent); | 397 | blkg_put(blkg->parent); |
398 | spin_unlock_irq(blkg->q->queue_lock); | ||
399 | } | ||
408 | 400 | ||
409 | /* | 401 | blkg_free(blkg); |
410 | * A group is freed in rcu manner. But having an rcu lock does not | ||
411 | * mean that one can access all the fields of blkg and assume these | ||
412 | * are valid. For example, don't try to follow throtl_data and | ||
413 | * request queue links. | ||
414 | * | ||
415 | * Having a reference to blkg under an rcu allows acess to only | ||
416 | * values local to groups like group stats and group rate limits | ||
417 | */ | ||
418 | call_rcu(&blkg->rcu_head, blkg_rcu_free); | ||
419 | } | 402 | } |
420 | EXPORT_SYMBOL_GPL(__blkg_release); | 403 | EXPORT_SYMBOL_GPL(__blkg_release_rcu); |
421 | 404 | ||
422 | /* | 405 | /* |
423 | * The next function used by blk_queue_for_each_rl(). It's a bit tricky | 406 | * The next function used by blk_queue_for_each_rl(). It's a bit tricky |
@@ -928,14 +911,6 @@ struct cgroup_subsys blkio_subsys = { | |||
928 | .subsys_id = blkio_subsys_id, | 911 | .subsys_id = blkio_subsys_id, |
929 | .base_cftypes = blkcg_files, | 912 | .base_cftypes = blkcg_files, |
930 | .module = THIS_MODULE, | 913 | .module = THIS_MODULE, |
931 | |||
932 | /* | ||
933 | * blkio subsystem is utterly broken in terms of hierarchy support. | ||
934 | * It treats all cgroups equally regardless of where they're | ||
935 | * located in the hierarchy - all cgroups are treated as if they're | ||
936 | * right below the root. Fix it and remove the following. | ||
937 | */ | ||
938 | .broken_hierarchy = true, | ||
939 | }; | 914 | }; |
940 | EXPORT_SYMBOL_GPL(blkio_subsys); | 915 | EXPORT_SYMBOL_GPL(blkio_subsys); |
941 | 916 | ||
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 4e595ee8c915..8056c03a3382 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -266,7 +266,7 @@ static inline void blkg_get(struct blkcg_gq *blkg) | |||
266 | blkg->refcnt++; | 266 | blkg->refcnt++; |
267 | } | 267 | } |
268 | 268 | ||
269 | void __blkg_release(struct blkcg_gq *blkg); | 269 | void __blkg_release_rcu(struct rcu_head *rcu); |
270 | 270 | ||
271 | /** | 271 | /** |
272 | * blkg_put - put a blkg reference | 272 | * blkg_put - put a blkg reference |
@@ -279,9 +279,43 @@ static inline void blkg_put(struct blkcg_gq *blkg) | |||
279 | lockdep_assert_held(blkg->q->queue_lock); | 279 | lockdep_assert_held(blkg->q->queue_lock); |
280 | WARN_ON_ONCE(blkg->refcnt <= 0); | 280 | WARN_ON_ONCE(blkg->refcnt <= 0); |
281 | if (!--blkg->refcnt) | 281 | if (!--blkg->refcnt) |
282 | __blkg_release(blkg); | 282 | call_rcu(&blkg->rcu_head, __blkg_release_rcu); |
283 | } | 283 | } |
284 | 284 | ||
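blkg_put() above now defers the actual free to an RCU callback instead of calling the release function directly, so lockless readers that found the blkg under rcu_read_lock() never see freed memory. A hedged, generic sketch of that put-then-call_rcu pattern; my_obj and its helpers are illustrative, not blk-cgroup code:

struct my_obj {
        int refcnt;                     /* protected by the owner's lock */
        struct rcu_head rcu_head;
};

static void my_obj_release_rcu(struct rcu_head *rcu)
{
        struct my_obj *obj = container_of(rcu, struct my_obj, rcu_head);

        kfree(obj);                     /* safe: all RCU readers have finished */
}

static void my_obj_put(struct my_obj *obj)
{
        if (!--obj->refcnt)
                call_rcu(&obj->rcu_head, my_obj_release_rcu);
}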
285 | struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, | ||
286 | bool update_hint); | ||
287 | |||
288 | /** | ||
289 | * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants | ||
290 | * @d_blkg: loop cursor pointing to the current descendant | ||
291 | * @pos_cgrp: used for iteration | ||
292 | * @p_blkg: target blkg to walk descendants of | ||
293 | * | ||
294 | * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU | ||
295 | * read locked. If called under either blkcg or queue lock, the iteration | ||
296 | * is guaranteed to include all and only online blkgs. The caller may | ||
297 | * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip | ||
298 | * subtree. | ||
299 | */ | ||
300 | #define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg) \ | ||
301 | cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \ | ||
302 | if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \ | ||
303 | (p_blkg)->q, false))) | ||
304 | |||
305 | /** | ||
306 | * blkg_for_each_descendant_post - post-order walk of a blkg's descendants | ||
307 | * @d_blkg: loop cursor pointing to the current descendant | ||
308 | * @pos_cgrp: used for iteration | ||
309 | * @p_blkg: target blkg to walk descendants of | ||
310 | * | ||
311 | * Similar to blkg_for_each_descendant_pre() but performs post-order | ||
312 | * traversal instead. Synchronization rules are the same. | ||
313 | */ | ||
314 | #define blkg_for_each_descendant_post(d_blkg, pos_cgrp, p_blkg) \ | ||
315 | cgroup_for_each_descendant_post((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \ | ||
316 | if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \ | ||
317 | (p_blkg)->q, false))) | ||
318 | |||
285 | /** | 319 | /** |
286 | * blk_get_rl - get request_list to use | 320 | * blk_get_rl - get request_list to use |
287 | * @q: request_queue of interest | 321 | * @q: request_queue of interest |
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 31146225f3d0..08a32dfd3844 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -25,18 +25,61 @@ static struct blkcg_policy blkcg_policy_throtl; | |||
25 | 25 | ||
26 | /* A workqueue to queue throttle related work */ | 26 | /* A workqueue to queue throttle related work */ |
27 | static struct workqueue_struct *kthrotld_workqueue; | 27 | static struct workqueue_struct *kthrotld_workqueue; |
28 | static void throtl_schedule_delayed_work(struct throtl_data *td, | 28 | |
29 | unsigned long delay); | 29 | /* |
30 | 30 | * To implement hierarchical throttling, throtl_grps form a tree and bios | |
31 | struct throtl_rb_root { | 31 | * are dispatched upwards level by level until they reach the top and get |
32 | struct rb_root rb; | 32 | * issued. When dispatching bios from the children and local group at each |
33 | struct rb_node *left; | 33 | * level, if the bios are dispatched into a single bio_list, there's a risk |
34 | unsigned int count; | 34 | * of a local or child group which can queue many bios at once filling up |
35 | unsigned long min_disptime; | 35 | * the list starving others. |
36 | * | ||
37 | * To avoid such starvation, dispatched bios are queued separately | ||
38 | * according to where they came from. When they are again dispatched to | ||
39 | * the parent, they're popped in round-robin order so that no single source | ||
40 | * hogs the dispatch window. | ||
41 | * | ||
42 | * throtl_qnode is used to keep the queued bios separated by their sources. | ||
43 | * Bios are queued to throtl_qnode which in turn is queued to | ||
44 | * throtl_service_queue and then dispatched in round-robin order. | ||
45 | * | ||
46 | * It's also used to track the reference counts on blkg's. A qnode always | ||
47 | * belongs to a throtl_grp and gets queued on itself or the parent, so | ||
48 | * incrementing the reference of the associated throtl_grp when a qnode is | ||
49 | * queued and decrementing when dequeued is enough to keep the whole blkg | ||
50 | * tree pinned while bios are in flight. | ||
51 | */ | ||
52 | struct throtl_qnode { | ||
53 | struct list_head node; /* service_queue->queued[] */ | ||
54 | struct bio_list bios; /* queued bios */ | ||
55 | struct throtl_grp *tg; /* tg this qnode belongs to */ | ||
36 | }; | 56 | }; |
37 | 57 | ||
38 | #define THROTL_RB_ROOT (struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \ | 58 | struct throtl_service_queue { |
39 | .count = 0, .min_disptime = 0} | 59 | struct throtl_service_queue *parent_sq; /* the parent service_queue */ |
60 | |||
61 | /* | ||
62 | * Bios queued directly to this service_queue or dispatched from | ||
63 | * children throtl_grp's. | ||
64 | */ | ||
65 | struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */ | ||
66 | unsigned int nr_queued[2]; /* number of queued bios */ | ||
67 | |||
68 | /* | ||
69 | * RB tree of active children throtl_grp's, which are sorted by | ||
70 | * their ->disptime. | ||
71 | */ | ||
72 | struct rb_root pending_tree; /* RB tree of active tgs */ | ||
73 | struct rb_node *first_pending; /* first node in the tree */ | ||
74 | unsigned int nr_pending; /* # queued in the tree */ | ||
75 | unsigned long first_pending_disptime; /* disptime of the first tg */ | ||
76 | struct timer_list pending_timer; /* fires on first_pending_disptime */ | ||
77 | }; | ||
78 | |||
79 | enum tg_state_flags { | ||
80 | THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */ | ||
81 | THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */ | ||
82 | }; | ||
40 | 83 | ||
41 | #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) | 84 | #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) |
42 | 85 | ||
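A hedged user-space model of the round-robin dispatch described in the throtl_qnode comment above: each source keeps its own queue, the dispatcher pops from the head source and then rotates it to the tail, so a source with many queued bios cannot starve the others. (The kernel additionally removes empty qnodes from the list; this sketch simply skips them.)

#include <stdio.h>

#define NSRC 3

int main(void)
{
        int pending[NSRC] = { 5, 1, 2 };        /* bios queued per source */
        int order[NSRC] = { 0, 1, 2 };          /* current round-robin order */
        int left = 5 + 1 + 2;

        while (left) {
                int src = order[0];
                int i;

                if (pending[src]) {
                        printf("dispatch one bio from source %d\n", src);
                        pending[src]--;
                        left--;
                }
                /* rotate the head source to the tail */
                for (i = 0; i < NSRC - 1; i++)
                        order[i] = order[i + 1];
                order[NSRC - 1] = src;
        }
        return 0;
}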
@@ -52,9 +95,26 @@ struct throtl_grp { | |||
52 | /* must be the first member */ | 95 | /* must be the first member */ |
53 | struct blkg_policy_data pd; | 96 | struct blkg_policy_data pd; |
54 | 97 | ||
55 | /* active throtl group service_tree member */ | 98 | /* active throtl group service_queue member */ |
56 | struct rb_node rb_node; | 99 | struct rb_node rb_node; |
57 | 100 | ||
101 | /* throtl_data this group belongs to */ | ||
102 | struct throtl_data *td; | ||
103 | |||
104 | /* this group's service queue */ | ||
105 | struct throtl_service_queue service_queue; | ||
106 | |||
107 | /* | ||
108 | * qnode_on_self is used when bios are directly queued to this | ||
109 | * throtl_grp so that local bios compete fairly with bios | ||
110 | * dispatched from children. qnode_on_parent is used when bios are | ||
111 | * dispatched from this throtl_grp into its parent and will compete | ||
112 | * with the sibling qnode_on_parents and the parent's | ||
113 | * qnode_on_self. | ||
114 | */ | ||
115 | struct throtl_qnode qnode_on_self[2]; | ||
116 | struct throtl_qnode qnode_on_parent[2]; | ||
117 | |||
58 | /* | 118 | /* |
59 | * Dispatch time in jiffies. This is the estimated time when group | 119 | * Dispatch time in jiffies. This is the estimated time when group |
60 | * will unthrottle and is ready to dispatch more bio. It is used as | 120 | * will unthrottle and is ready to dispatch more bio. It is used as |
@@ -64,11 +124,8 @@ struct throtl_grp { | |||
64 | 124 | ||
65 | unsigned int flags; | 125 | unsigned int flags; |
66 | 126 | ||
67 | /* Two lists for READ and WRITE */ | 127 | /* are there any throtl rules between this group and td? */ |
68 | struct bio_list bio_lists[2]; | 128 | bool has_rules[2]; |
69 | |||
70 | /* Number of queued bios on READ and WRITE lists */ | ||
71 | unsigned int nr_queued[2]; | ||
72 | 129 | ||
73 | /* bytes per second rate limits */ | 130 | /* bytes per second rate limits */ |
74 | uint64_t bps[2]; | 131 | uint64_t bps[2]; |
@@ -85,9 +142,6 @@ struct throtl_grp { | |||
85 | unsigned long slice_start[2]; | 142 | unsigned long slice_start[2]; |
86 | unsigned long slice_end[2]; | 143 | unsigned long slice_end[2]; |
87 | 144 | ||
88 | /* Some throttle limits got updated for the group */ | ||
89 | int limits_changed; | ||
90 | |||
91 | /* Per cpu stats pointer */ | 145 | /* Per cpu stats pointer */ |
92 | struct tg_stats_cpu __percpu *stats_cpu; | 146 | struct tg_stats_cpu __percpu *stats_cpu; |
93 | 147 | ||
@@ -98,7 +152,7 @@ struct throtl_grp { | |||
98 | struct throtl_data | 152 | struct throtl_data |
99 | { | 153 | { |
100 | /* service tree for active throtl groups */ | 154 | /* service tree for active throtl groups */ |
101 | struct throtl_rb_root tg_service_tree; | 155 | struct throtl_service_queue service_queue; |
102 | 156 | ||
103 | struct request_queue *queue; | 157 | struct request_queue *queue; |
104 | 158 | ||
@@ -111,9 +165,7 @@ struct throtl_data | |||
111 | unsigned int nr_undestroyed_grps; | 165 | unsigned int nr_undestroyed_grps; |
112 | 166 | ||
113 | /* Work for dispatching throttled bios */ | 167 | /* Work for dispatching throttled bios */ |
114 | struct delayed_work throtl_work; | 168 | struct work_struct dispatch_work; |
115 | |||
116 | int limits_changed; | ||
117 | }; | 169 | }; |
118 | 170 | ||
119 | /* list and work item to allocate percpu group stats */ | 171 | /* list and work item to allocate percpu group stats */ |
@@ -123,6 +175,8 @@ static LIST_HEAD(tg_stats_alloc_list); | |||
123 | static void tg_stats_alloc_fn(struct work_struct *); | 175 | static void tg_stats_alloc_fn(struct work_struct *); |
124 | static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn); | 176 | static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn); |
125 | 177 | ||
178 | static void throtl_pending_timer_fn(unsigned long arg); | ||
179 | |||
126 | static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) | 180 | static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) |
127 | { | 181 | { |
128 | return pd ? container_of(pd, struct throtl_grp, pd) : NULL; | 182 | return pd ? container_of(pd, struct throtl_grp, pd) : NULL; |
@@ -143,41 +197,65 @@ static inline struct throtl_grp *td_root_tg(struct throtl_data *td) | |||
143 | return blkg_to_tg(td->queue->root_blkg); | 197 | return blkg_to_tg(td->queue->root_blkg); |
144 | } | 198 | } |
145 | 199 | ||
146 | enum tg_state_flags { | 200 | /** |
147 | THROTL_TG_FNS(name) \ | 201 | * sq_to_tg - return the throtl_grp the specified service queue belongs to |
148 | }; | 202 | * @sq: the throtl_service_queue of interest |
149 | 203 | * | |
150 | #define THROTL_TG_FNS(name) \ | 204 | * Return the throtl_grp @sq belongs to. If @sq is the top-level one |
151 | static inline void throtl_mark_tg_##name(struct throtl_grp *tg) \ | 205 | * embedded in throtl_data, %NULL is returned. |
152 | { \ | 206 | */ |
153 | (tg)->flags |= (1 << THROTL_TG_FLAG_##name); \ | 207 | static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq) |
154 | } \ | 208 | { |
155 | static inline void throtl_clear_tg_##name(struct throtl_grp *tg) \ | 209 | if (sq && sq->parent_sq) |
156 | { \ | 210 | return container_of(sq, struct throtl_grp, service_queue); |
157 | (tg)->flags &= ~(1 << THROTL_TG_FLAG_##name); \ | 211 | else |
158 | } \ | 212 | return NULL; |
159 | static inline int throtl_tg_##name(const struct throtl_grp *tg) \ | ||
160 | { \ | ||
161 | return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0; \ | ||
162 | } | 213 | } |
163 | 214 | ||
164 | THROTL_TG_FNS(on_rr); | 215 | /** |
216 | * sq_to_td - return throtl_data the specified service queue belongs to | ||
217 | * @sq: the throtl_service_queue of interest | ||
218 | * | ||
219 | * A service_queue can be embedded in either a throtl_grp or throtl_data. | ||
220 | * Determine the associated throtl_data accordingly and return it. | ||
221 | */ | ||
222 | static struct throtl_data *sq_to_td(struct throtl_service_queue *sq) | ||
223 | { | ||
224 | struct throtl_grp *tg = sq_to_tg(sq); | ||
165 | 225 | ||
166 | #define throtl_log_tg(td, tg, fmt, args...) do { \ | 226 | if (tg) |
167 | char __pbuf[128]; \ | 227 | return tg->td; |
228 | else | ||
229 | return container_of(sq, struct throtl_data, service_queue); | ||
230 | } | ||
231 | |||
232 | /** | ||
233 | * throtl_log - log debug message via blktrace | ||
234 | * @sq: the service_queue being reported | ||
235 | * @fmt: printf format string | ||
236 | * @args: printf args | ||
237 | * | ||
238 | * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a | ||
239 | * throtl_grp; otherwise, just "throtl". | ||
240 | * | ||
241 | * TODO: this should be made a function and name formatting should happen | ||
242 | * after testing whether blktrace is enabled. | ||
243 | */ | ||
244 | #define throtl_log(sq, fmt, args...) do { \ | ||
245 | struct throtl_grp *__tg = sq_to_tg((sq)); \ | ||
246 | struct throtl_data *__td = sq_to_td((sq)); \ | ||
247 | \ | ||
248 | (void)__td; \ | ||
249 | if ((__tg)) { \ | ||
250 | char __pbuf[128]; \ | ||
168 | \ | 251 | \ |
169 | blkg_path(tg_to_blkg(tg), __pbuf, sizeof(__pbuf)); \ | 252 | blkg_path(tg_to_blkg(__tg), __pbuf, sizeof(__pbuf)); \ |
170 | blk_add_trace_msg((td)->queue, "throtl %s " fmt, __pbuf, ##args); \ | 253 | blk_add_trace_msg(__td->queue, "throtl %s " fmt, __pbuf, ##args); \ |
254 | } else { \ | ||
255 | blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \ | ||
256 | } \ | ||
171 | } while (0) | 257 | } while (0) |
172 | 258 | ||
173 | #define throtl_log(td, fmt, args...) \ | ||
174 | blk_add_trace_msg((td)->queue, "throtl " fmt, ##args) | ||
175 | |||
176 | static inline unsigned int total_nr_queued(struct throtl_data *td) | ||
177 | { | ||
178 | return td->nr_queued[0] + td->nr_queued[1]; | ||
179 | } | ||
180 | |||
181 | /* | 259 | /* |
182 | * Worker for allocating per cpu stat for tgs. This is scheduled on the | 260 | * Worker for allocating per cpu stat for tgs. This is scheduled on the |
183 | * system_wq once there are some groups on the alloc_list waiting for | 261 | * system_wq once there are some groups on the alloc_list waiting for |
@@ -215,15 +293,141 @@ alloc_stats: | |||
215 | goto alloc_stats; | 293 | goto alloc_stats; |
216 | } | 294 | } |
217 | 295 | ||
296 | static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg) | ||
297 | { | ||
298 | INIT_LIST_HEAD(&qn->node); | ||
299 | bio_list_init(&qn->bios); | ||
300 | qn->tg = tg; | ||
301 | } | ||
302 | |||
303 | /** | ||
304 | * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it | ||
305 | * @bio: bio being added | ||
306 | * @qn: qnode to add bio to | ||
307 | * @queued: the service_queue->queued[] list @qn belongs to | ||
308 | * | ||
309 | * Add @bio to @qn and put @qn on @queued if it's not already on. | ||
310 | * @qn->tg's reference count is bumped when @qn is activated. See the | ||
311 | * comment on top of throtl_qnode definition for details. | ||
312 | */ | ||
313 | static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn, | ||
314 | struct list_head *queued) | ||
315 | { | ||
316 | bio_list_add(&qn->bios, bio); | ||
317 | if (list_empty(&qn->node)) { | ||
318 | list_add_tail(&qn->node, queued); | ||
319 | blkg_get(tg_to_blkg(qn->tg)); | ||
320 | } | ||
321 | } | ||
322 | |||
323 | /** | ||
324 | * throtl_peek_queued - peek the first bio on a qnode list | ||
325 | * @queued: the qnode list to peek | ||
326 | */ | ||
327 | static struct bio *throtl_peek_queued(struct list_head *queued) | ||
328 | { | ||
329 | struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node); | ||
330 | struct bio *bio; | ||
331 | |||
332 | if (list_empty(queued)) | ||
333 | return NULL; | ||
334 | |||
335 | bio = bio_list_peek(&qn->bios); | ||
336 | WARN_ON_ONCE(!bio); | ||
337 | return bio; | ||
338 | } | ||
339 | |||
340 | /** | ||
341 | * throtl_pop_queued - pop the first bio from a qnode list | ||
342 | * @queued: the qnode list to pop a bio from | ||
343 | * @tg_to_put: optional out argument for throtl_grp to put | ||
344 | * | ||
345 | * Pop the first bio from the qnode list @queued. After popping, the first | ||
346 | * qnode is removed from @queued if empty or moved to the end of @queued so | ||
347 | * that the popping order is round-robin. | ||
348 | * | ||
349 | * When the first qnode is removed, its associated throtl_grp should be put | ||
350 | * too. If @tg_to_put is NULL, this function automatically puts it; | ||
351 | * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is | ||
352 | * responsible for putting it. | ||
353 | */ | ||
354 | static struct bio *throtl_pop_queued(struct list_head *queued, | ||
355 | struct throtl_grp **tg_to_put) | ||
356 | { | ||
357 | struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node); | ||
358 | struct bio *bio; | ||
359 | |||
360 | if (list_empty(queued)) | ||
361 | return NULL; | ||
362 | |||
363 | bio = bio_list_pop(&qn->bios); | ||
364 | WARN_ON_ONCE(!bio); | ||
365 | |||
366 | if (bio_list_empty(&qn->bios)) { | ||
367 | list_del_init(&qn->node); | ||
368 | if (tg_to_put) | ||
369 | *tg_to_put = qn->tg; | ||
370 | else | ||
371 | blkg_put(tg_to_blkg(qn->tg)); | ||
372 | } else { | ||
373 | list_move_tail(&qn->node, queued); | ||
374 | } | ||
375 | |||
376 | return bio; | ||
377 | } | ||
378 | |||
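A hedged caller sketch for throtl_pop_queued() above: draining one direction of a service_queue while letting the helper drop each emptied qnode's blkg reference by passing NULL for @tg_to_put. issue_bio() is a hypothetical stand-in for whatever the caller does with each popped bio.

static void drain_queued(struct throtl_service_queue *sq, int rw)
{
        struct bio *bio;

        while ((bio = throtl_pop_queued(&sq->queued[rw], NULL))) {
                sq->nr_queued[rw]--;
                issue_bio(bio);                 /* hypothetical */
        }
}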
379 | /* init a service_queue, assumes the caller zeroed it */ | ||
380 | static void throtl_service_queue_init(struct throtl_service_queue *sq, | ||
381 | struct throtl_service_queue *parent_sq) | ||
382 | { | ||
383 | INIT_LIST_HEAD(&sq->queued[0]); | ||
384 | INIT_LIST_HEAD(&sq->queued[1]); | ||
385 | sq->pending_tree = RB_ROOT; | ||
386 | sq->parent_sq = parent_sq; | ||
387 | setup_timer(&sq->pending_timer, throtl_pending_timer_fn, | ||
388 | (unsigned long)sq); | ||
389 | } | ||
390 | |||
391 | static void throtl_service_queue_exit(struct throtl_service_queue *sq) | ||
392 | { | ||
393 | del_timer_sync(&sq->pending_timer); | ||
394 | } | ||
395 | |||
218 | static void throtl_pd_init(struct blkcg_gq *blkg) | 396 | static void throtl_pd_init(struct blkcg_gq *blkg) |
219 | { | 397 | { |
220 | struct throtl_grp *tg = blkg_to_tg(blkg); | 398 | struct throtl_grp *tg = blkg_to_tg(blkg); |
399 | struct throtl_data *td = blkg->q->td; | ||
400 | struct throtl_service_queue *parent_sq; | ||
221 | unsigned long flags; | 401 | unsigned long flags; |
402 | int rw; | ||
403 | |||
404 | /* | ||
405 | * If sane_hierarchy is enabled, we switch to properly hierarchical | ||
406 | * behavior where limits on a given throtl_grp are applied to the | ||
407 | * whole subtree rather than just the group itself. e.g. If 16M | ||
408 | * read_bps limit is set on the root group, the whole system can't | ||
409 | * exceed 16M for the device. | ||
410 | * | ||
411 | * If sane_hierarchy is not enabled, the broken flat hierarchy | ||
412 | * behavior is retained where all throtl_grps are treated as if | ||
413 | * they're all separate root groups right below throtl_data. | ||
414 | * Limits of a group don't interact with limits of other groups | ||
415 | * regardless of the position of the group in the hierarchy. | ||
416 | */ | ||
417 | parent_sq = &td->service_queue; | ||
418 | |||
419 | if (cgroup_sane_behavior(blkg->blkcg->css.cgroup) && blkg->parent) | ||
420 | parent_sq = &blkg_to_tg(blkg->parent)->service_queue; | ||
421 | |||
422 | throtl_service_queue_init(&tg->service_queue, parent_sq); | ||
423 | |||
424 | for (rw = READ; rw <= WRITE; rw++) { | ||
425 | throtl_qnode_init(&tg->qnode_on_self[rw], tg); | ||
426 | throtl_qnode_init(&tg->qnode_on_parent[rw], tg); | ||
427 | } | ||
222 | 428 | ||
223 | RB_CLEAR_NODE(&tg->rb_node); | 429 | RB_CLEAR_NODE(&tg->rb_node); |
224 | bio_list_init(&tg->bio_lists[0]); | 430 | tg->td = td; |
225 | bio_list_init(&tg->bio_lists[1]); | ||
226 | tg->limits_changed = false; | ||
227 | 431 | ||
228 | tg->bps[READ] = -1; | 432 | tg->bps[READ] = -1; |
229 | tg->bps[WRITE] = -1; | 433 | tg->bps[WRITE] = -1; |
@@ -241,6 +445,30 @@ static void throtl_pd_init(struct blkcg_gq *blkg) | |||
241 | spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); | 445 | spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); |
242 | } | 446 | } |
243 | 447 | ||
448 | /* | ||
449 | * Set has_rules[] if @tg or any of its parents have limits configured. | ||
450 | * This doesn't require walking up to the top of the hierarchy as the | ||
451 | * parent's has_rules[] is guaranteed to be correct. | ||
452 | */ | ||
453 | static void tg_update_has_rules(struct throtl_grp *tg) | ||
454 | { | ||
455 | struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq); | ||
456 | int rw; | ||
457 | |||
458 | for (rw = READ; rw <= WRITE; rw++) | ||
459 | tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) || | ||
460 | (tg->bps[rw] != -1 || tg->iops[rw] != -1); | ||
461 | } | ||
462 | |||
463 | static void throtl_pd_online(struct blkcg_gq *blkg) | ||
464 | { | ||
465 | /* | ||
466 | * We don't want new groups to escape the limits of its ancestors. | ||
467 | * Update has_rules[] after a new group is brought online. | ||
468 | */ | ||
469 | tg_update_has_rules(blkg_to_tg(blkg)); | ||
470 | } | ||
471 | |||
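A hedged user-space model of the has_rules[] propagation implemented above: a group "has rules" if it configures a limit itself or any ancestor does, and because a parent is always brought online before its children, a single parent lookup is sufficient. The structure here is illustrative, not the kernel's.

#include <stdbool.h>

struct tgroup {
        struct tgroup *parent;
        bool own_limit;         /* this group sets bps/iops itself */
        bool has_rules;         /* cached: own limit or any ancestor's */
};

static void update_has_rules(struct tgroup *g)
{
        g->has_rules = (g->parent && g->parent->has_rules) || g->own_limit;
}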
244 | static void throtl_pd_exit(struct blkcg_gq *blkg) | 472 | static void throtl_pd_exit(struct blkcg_gq *blkg) |
245 | { | 473 | { |
246 | struct throtl_grp *tg = blkg_to_tg(blkg); | 474 | struct throtl_grp *tg = blkg_to_tg(blkg); |
@@ -251,6 +479,8 @@ static void throtl_pd_exit(struct blkcg_gq *blkg) | |||
251 | spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); | 479 | spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); |
252 | 480 | ||
253 | free_percpu(tg->stats_cpu); | 481 | free_percpu(tg->stats_cpu); |
482 | |||
483 | throtl_service_queue_exit(&tg->service_queue); | ||
254 | } | 484 | } |
255 | 485 | ||
256 | static void throtl_pd_reset_stats(struct blkcg_gq *blkg) | 486 | static void throtl_pd_reset_stats(struct blkcg_gq *blkg) |
@@ -309,17 +539,18 @@ static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td, | |||
309 | return tg; | 539 | return tg; |
310 | } | 540 | } |
311 | 541 | ||
312 | static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root) | 542 | static struct throtl_grp * |
543 | throtl_rb_first(struct throtl_service_queue *parent_sq) | ||
313 | { | 544 | { |
314 | /* Service tree is empty */ | 545 | /* Service tree is empty */ |
315 | if (!root->count) | 546 | if (!parent_sq->nr_pending) |
316 | return NULL; | 547 | return NULL; |
317 | 548 | ||
318 | if (!root->left) | 549 | if (!parent_sq->first_pending) |
319 | root->left = rb_first(&root->rb); | 550 | parent_sq->first_pending = rb_first(&parent_sq->pending_tree); |
320 | 551 | ||
321 | if (root->left) | 552 | if (parent_sq->first_pending) |
322 | return rb_entry_tg(root->left); | 553 | return rb_entry_tg(parent_sq->first_pending); |
323 | 554 | ||
324 | return NULL; | 555 | return NULL; |
325 | } | 556 | } |
@@ -330,29 +561,30 @@ static void rb_erase_init(struct rb_node *n, struct rb_root *root) | |||
330 | RB_CLEAR_NODE(n); | 561 | RB_CLEAR_NODE(n); |
331 | } | 562 | } |
332 | 563 | ||
333 | static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root) | 564 | static void throtl_rb_erase(struct rb_node *n, |
565 | struct throtl_service_queue *parent_sq) | ||
334 | { | 566 | { |
335 | if (root->left == n) | 567 | if (parent_sq->first_pending == n) |
336 | root->left = NULL; | 568 | parent_sq->first_pending = NULL; |
337 | rb_erase_init(n, &root->rb); | 569 | rb_erase_init(n, &parent_sq->pending_tree); |
338 | --root->count; | 570 | --parent_sq->nr_pending; |
339 | } | 571 | } |
340 | 572 | ||
341 | static void update_min_dispatch_time(struct throtl_rb_root *st) | 573 | static void update_min_dispatch_time(struct throtl_service_queue *parent_sq) |
342 | { | 574 | { |
343 | struct throtl_grp *tg; | 575 | struct throtl_grp *tg; |
344 | 576 | ||
345 | tg = throtl_rb_first(st); | 577 | tg = throtl_rb_first(parent_sq); |
346 | if (!tg) | 578 | if (!tg) |
347 | return; | 579 | return; |
348 | 580 | ||
349 | st->min_disptime = tg->disptime; | 581 | parent_sq->first_pending_disptime = tg->disptime; |
350 | } | 582 | } |
351 | 583 | ||
352 | static void | 584 | static void tg_service_queue_add(struct throtl_grp *tg) |
353 | tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg) | ||
354 | { | 585 | { |
355 | struct rb_node **node = &st->rb.rb_node; | 586 | struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq; |
587 | struct rb_node **node = &parent_sq->pending_tree.rb_node; | ||
356 | struct rb_node *parent = NULL; | 588 | struct rb_node *parent = NULL; |
357 | struct throtl_grp *__tg; | 589 | struct throtl_grp *__tg; |
358 | unsigned long key = tg->disptime; | 590 | unsigned long key = tg->disptime; |
@@ -371,89 +603,135 @@ tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg) | |||
371 | } | 603 | } |
372 | 604 | ||
373 | if (left) | 605 | if (left) |
374 | st->left = &tg->rb_node; | 606 | parent_sq->first_pending = &tg->rb_node; |
375 | 607 | ||
376 | rb_link_node(&tg->rb_node, parent, node); | 608 | rb_link_node(&tg->rb_node, parent, node); |
377 | rb_insert_color(&tg->rb_node, &st->rb); | 609 | rb_insert_color(&tg->rb_node, &parent_sq->pending_tree); |
378 | } | 610 | } |
379 | 611 | ||
380 | static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg) | 612 | static void __throtl_enqueue_tg(struct throtl_grp *tg) |
381 | { | 613 | { |
382 | struct throtl_rb_root *st = &td->tg_service_tree; | 614 | tg_service_queue_add(tg); |
615 | tg->flags |= THROTL_TG_PENDING; | ||
616 | tg->service_queue.parent_sq->nr_pending++; | ||
617 | } | ||
383 | 618 | ||
384 | tg_service_tree_add(st, tg); | 619 | static void throtl_enqueue_tg(struct throtl_grp *tg) |
385 | throtl_mark_tg_on_rr(tg); | 620 | { |
386 | st->count++; | 621 | if (!(tg->flags & THROTL_TG_PENDING)) |
622 | __throtl_enqueue_tg(tg); | ||
387 | } | 623 | } |
388 | 624 | ||
389 | static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg) | 625 | static void __throtl_dequeue_tg(struct throtl_grp *tg) |
390 | { | 626 | { |
391 | if (!throtl_tg_on_rr(tg)) | 627 | throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq); |
392 | __throtl_enqueue_tg(td, tg); | 628 | tg->flags &= ~THROTL_TG_PENDING; |
393 | } | 629 | } |
394 | 630 | ||
395 | static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg) | 631 | static void throtl_dequeue_tg(struct throtl_grp *tg) |
396 | { | 632 | { |
397 | throtl_rb_erase(&tg->rb_node, &td->tg_service_tree); | 633 | if (tg->flags & THROTL_TG_PENDING) |
398 | throtl_clear_tg_on_rr(tg); | 634 | __throtl_dequeue_tg(tg); |
399 | } | 635 | } |
400 | 636 | ||
401 | static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg) | 637 | /* Call with queue lock held */ |
638 | static void throtl_schedule_pending_timer(struct throtl_service_queue *sq, | ||
639 | unsigned long expires) | ||
402 | { | 640 | { |
403 | if (throtl_tg_on_rr(tg)) | 641 | mod_timer(&sq->pending_timer, expires); |
404 | __throtl_dequeue_tg(td, tg); | 642 | throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu", |
643 | expires - jiffies, jiffies); | ||
405 | } | 644 | } |
406 | 645 | ||
407 | static void throtl_schedule_next_dispatch(struct throtl_data *td) | 646 | /** |
647 | * throtl_schedule_next_dispatch - schedule the next dispatch cycle | ||
648 | * @sq: the service_queue to schedule dispatch for | ||
649 | * @force: force scheduling | ||
650 | * | ||
651 | * Arm @sq->pending_timer so that the next dispatch cycle starts on the | ||
652 | * dispatch time of the first pending child. Returns %true if either timer | ||
653 | * is armed or there's no pending child left. %false if the current | ||
654 | * dispatch window is still open and the caller should continue | ||
655 | * dispatching. | ||
656 | * | ||
657 | * If @force is %true, the dispatch timer is always scheduled and this | ||
658 | * function is guaranteed to return %true. This is to be used when the | ||
659 | * caller can't dispatch itself and needs to invoke pending_timer | ||
660 | * unconditionally. Note that forced scheduling is likely to induce short | ||
661 | * delay before dispatch starts even if @sq->first_pending_disptime is not | ||
662 | * in the future and thus shouldn't be used in hot paths. | ||
663 | */ | ||
664 | static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq, | ||
665 | bool force) | ||
408 | { | 666 | { |
409 | struct throtl_rb_root *st = &td->tg_service_tree; | 667 | /* any pending children left? */ |
668 | if (!sq->nr_pending) | ||
669 | return true; | ||
410 | 670 | ||
411 | /* | 671 | update_min_dispatch_time(sq); |
412 | * If there are more bios pending, schedule more work. | ||
413 | */ | ||
414 | if (!total_nr_queued(td)) | ||
415 | return; | ||
416 | 672 | ||
417 | BUG_ON(!st->count); | 673 | /* is the next dispatch time in the future? */ |
674 | if (force || time_after(sq->first_pending_disptime, jiffies)) { | ||
675 | throtl_schedule_pending_timer(sq, sq->first_pending_disptime); | ||
676 | return true; | ||
677 | } | ||
418 | 678 | ||
419 | update_min_dispatch_time(st); | 679 | /* tell the caller to continue dispatching */ |
680 | return false; | ||
681 | } | ||
420 | 682 | ||
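A hedged sketch of the caller pattern the return value above is designed for: keep dispatching while the current window is still open, and stop as soon as the timer has been armed or nothing is pending. dispatch_some() is a hypothetical stand-in for the real dispatch step.

static void dispatch_loop(struct throtl_service_queue *sq)
{
        do {
                dispatch_some(sq);              /* hypothetical */
        } while (!throtl_schedule_next_dispatch(sq, false));
}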
421 | if (time_before_eq(st->min_disptime, jiffies)) | 683 | static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, |
422 | throtl_schedule_delayed_work(td, 0); | 684 | bool rw, unsigned long start) |
423 | else | 685 | { |
424 | throtl_schedule_delayed_work(td, (st->min_disptime - jiffies)); | 686 | tg->bytes_disp[rw] = 0; |
687 | tg->io_disp[rw] = 0; | ||
688 | |||
689 | /* | ||
690 | * Previous slice has expired. We must have trimmed it after last | ||
691 | * bio dispatch. That means since start of last slice, we never used | ||
692 | * that bandwidth. Do try to make use of that bandwidth while giving | ||
693 | * credit. | ||
694 | */ | ||
695 | if (time_after_eq(start, tg->slice_start[rw])) | ||
696 | tg->slice_start[rw] = start; | ||
697 | |||
698 | tg->slice_end[rw] = jiffies + throtl_slice; | ||
699 | throtl_log(&tg->service_queue, | ||
700 | "[%c] new slice with credit start=%lu end=%lu jiffies=%lu", | ||
701 | rw == READ ? 'R' : 'W', tg->slice_start[rw], | ||
702 | tg->slice_end[rw], jiffies); | ||
425 | } | 703 | } |
426 | 704 | ||
427 | static inline void | 705 | static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw) |
428 | throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) | ||
429 | { | 706 | { |
430 | tg->bytes_disp[rw] = 0; | 707 | tg->bytes_disp[rw] = 0; |
431 | tg->io_disp[rw] = 0; | 708 | tg->io_disp[rw] = 0; |
432 | tg->slice_start[rw] = jiffies; | 709 | tg->slice_start[rw] = jiffies; |
433 | tg->slice_end[rw] = jiffies + throtl_slice; | 710 | tg->slice_end[rw] = jiffies + throtl_slice; |
434 | throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu", | 711 | throtl_log(&tg->service_queue, |
435 | rw == READ ? 'R' : 'W', tg->slice_start[rw], | 712 | "[%c] new slice start=%lu end=%lu jiffies=%lu", |
436 | tg->slice_end[rw], jiffies); | 713 | rw == READ ? 'R' : 'W', tg->slice_start[rw], |
714 | tg->slice_end[rw], jiffies); | ||
437 | } | 715 | } |
438 | 716 | ||
439 | static inline void throtl_set_slice_end(struct throtl_data *td, | 717 | static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw, |
440 | struct throtl_grp *tg, bool rw, unsigned long jiffy_end) | 718 | unsigned long jiffy_end) |
441 | { | 719 | { |
442 | tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); | 720 | tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); |
443 | } | 721 | } |
444 | 722 | ||
445 | static inline void throtl_extend_slice(struct throtl_data *td, | 723 | static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw, |
446 | struct throtl_grp *tg, bool rw, unsigned long jiffy_end) | 724 | unsigned long jiffy_end) |
447 | { | 725 | { |
448 | tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); | 726 | tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); |
449 | throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu", | 727 | throtl_log(&tg->service_queue, |
450 | rw == READ ? 'R' : 'W', tg->slice_start[rw], | 728 | "[%c] extend slice start=%lu end=%lu jiffies=%lu", |
451 | tg->slice_end[rw], jiffies); | 729 | rw == READ ? 'R' : 'W', tg->slice_start[rw], |
730 | tg->slice_end[rw], jiffies); | ||
452 | } | 731 | } |
453 | 732 | ||
454 | /* Determine if previously allocated or extended slice is complete or not */ | 733 | /* Determine if previously allocated or extended slice is complete or not */ |
455 | static bool | 734 | static bool throtl_slice_used(struct throtl_grp *tg, bool rw) |
456 | throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw) | ||
457 | { | 735 | { |
458 | if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) | 736 | if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) |
459 | return 0; | 737 | return 0; |
@@ -462,8 +740,7 @@ throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw) | |||
462 | } | 740 | } |
463 | 741 | ||
464 | /* Trim the used slices and adjust slice start accordingly */ | 742 | /* Trim the used slices and adjust slice start accordingly */ |
465 | static inline void | 743 | static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) |
466 | throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) | ||
467 | { | 744 | { |
468 | unsigned long nr_slices, time_elapsed, io_trim; | 745 | unsigned long nr_slices, time_elapsed, io_trim; |
469 | u64 bytes_trim, tmp; | 746 | u64 bytes_trim, tmp; |
@@ -475,7 +752,7 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) | |||
475 | * renewed. Don't try to trim the slice if slice is used. A new | 752 | * renewed. Don't try to trim the slice if slice is used. A new |
476 | * slice will start when appropriate. | 753 | * slice will start when appropriate. |
477 | */ | 754 | */ |
478 | if (throtl_slice_used(td, tg, rw)) | 755 | if (throtl_slice_used(tg, rw)) |
479 | return; | 756 | return; |
480 | 757 | ||
481 | /* | 758 | /* |
@@ -486,7 +763,7 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) | |||
486 | * is bad because it does not allow new slice to start. | 763 | * is bad because it does not allow new slice to start. |
487 | */ | 764 | */ |
488 | 765 | ||
489 | throtl_set_slice_end(td, tg, rw, jiffies + throtl_slice); | 766 | throtl_set_slice_end(tg, rw, jiffies + throtl_slice); |
490 | 767 | ||
491 | time_elapsed = jiffies - tg->slice_start[rw]; | 768 | time_elapsed = jiffies - tg->slice_start[rw]; |
492 | 769 | ||
@@ -515,14 +792,14 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) | |||
515 | 792 | ||
516 | tg->slice_start[rw] += nr_slices * throtl_slice; | 793 | tg->slice_start[rw] += nr_slices * throtl_slice; |
517 | 794 | ||
518 | throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu" | 795 | throtl_log(&tg->service_queue, |
519 | " start=%lu end=%lu jiffies=%lu", | 796 | "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu", |
520 | rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim, | 797 | rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim, |
521 | tg->slice_start[rw], tg->slice_end[rw], jiffies); | 798 | tg->slice_start[rw], tg->slice_end[rw], jiffies); |
522 | } | 799 | } |
523 | 800 | ||
524 | static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg, | 801 | static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, |
525 | struct bio *bio, unsigned long *wait) | 802 | unsigned long *wait) |
526 | { | 803 | { |
527 | bool rw = bio_data_dir(bio); | 804 | bool rw = bio_data_dir(bio); |
528 | unsigned int io_allowed; | 805 | unsigned int io_allowed; |
@@ -571,8 +848,8 @@ static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg, | |||
571 | return 0; | 848 | return 0; |
572 | } | 849 | } |
573 | 850 | ||
574 | static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, | 851 | static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, |
575 | struct bio *bio, unsigned long *wait) | 852 | unsigned long *wait) |
576 | { | 853 | { |
577 | bool rw = bio_data_dir(bio); | 854 | bool rw = bio_data_dir(bio); |
578 | u64 bytes_allowed, extra_bytes, tmp; | 855 | u64 bytes_allowed, extra_bytes, tmp; |
@@ -613,18 +890,12 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, | |||
613 | return 0; | 890 | return 0; |
614 | } | 891 | } |
615 | 892 | ||
616 | static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) { | ||
617 | if (tg->bps[rw] == -1 && tg->iops[rw] == -1) | ||
618 | return 1; | ||
619 | return 0; | ||
620 | } | ||
621 | |||
622 | /* | 893 | /* |
623 | * Returns whether one can dispatch a bio or not. Also returns approx number | 894 | * Returns whether one can dispatch a bio or not. Also returns approx number |
624 | * of jiffies to wait before this bio is with-in IO rate and can be dispatched | 895 | * of jiffies to wait before this bio is with-in IO rate and can be dispatched |
625 | */ | 896 | */ |
626 | static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, | 897 | static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, |
627 | struct bio *bio, unsigned long *wait) | 898 | unsigned long *wait) |
628 | { | 899 | { |
629 | bool rw = bio_data_dir(bio); | 900 | bool rw = bio_data_dir(bio); |
630 | unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; | 901 | unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; |
@@ -635,7 +906,8 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, | |||
635 | * this function with a different bio if there are other bios | 906 | * this function with a different bio if there are other bios |
636 | * queued. | 907 | * queued. |
637 | */ | 908 | */ |
638 | BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw])); | 909 | BUG_ON(tg->service_queue.nr_queued[rw] && |
910 | bio != throtl_peek_queued(&tg->service_queue.queued[rw])); | ||
639 | 911 | ||
640 | /* If tg->bps = -1, then BW is unlimited */ | 912 | /* If tg->bps = -1, then BW is unlimited */ |
641 | if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { | 913 | if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { |
@@ -649,15 +921,15 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, | |||
649 | * existing slice to make sure it is at least throtl_slice interval | 921 | * existing slice to make sure it is at least throtl_slice interval |
650 | * long since now. | 922 | * long since now. |
651 | */ | 923 | */ |
652 | if (throtl_slice_used(td, tg, rw)) | 924 | if (throtl_slice_used(tg, rw)) |
653 | throtl_start_new_slice(td, tg, rw); | 925 | throtl_start_new_slice(tg, rw); |
654 | else { | 926 | else { |
655 | if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) | 927 | if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) |
656 | throtl_extend_slice(td, tg, rw, jiffies + throtl_slice); | 928 | throtl_extend_slice(tg, rw, jiffies + throtl_slice); |
657 | } | 929 | } |
658 | 930 | ||
659 | if (tg_with_in_bps_limit(td, tg, bio, &bps_wait) | 931 | if (tg_with_in_bps_limit(tg, bio, &bps_wait) && |
660 | && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) { | 932 | tg_with_in_iops_limit(tg, bio, &iops_wait)) { |
661 | if (wait) | 933 | if (wait) |
662 | *wait = 0; | 934 | *wait = 0; |
663 | return 1; | 935 | return 1; |
@@ -669,7 +941,7 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, | |||
669 | *wait = max_wait; | 941 | *wait = max_wait; |
670 | 942 | ||
671 | if (time_before(tg->slice_end[rw], jiffies + max_wait)) | 943 | if (time_before(tg->slice_end[rw], jiffies + max_wait)) |
672 | throtl_extend_slice(td, tg, rw, jiffies + max_wait); | 944 | throtl_extend_slice(tg, rw, jiffies + max_wait); |
673 | 945 | ||
674 | return 0; | 946 | return 0; |
675 | } | 947 | } |
@@ -708,65 +980,136 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) | |||
708 | tg->bytes_disp[rw] += bio->bi_size; | 980 | tg->bytes_disp[rw] += bio->bi_size; |
709 | tg->io_disp[rw]++; | 981 | tg->io_disp[rw]++; |
710 | 982 | ||
711 | throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw); | 983 | /* |
984 | * REQ_THROTTLED is used to prevent the same bio to be throttled | ||
985 | * more than once as a throttled bio will go through blk-throtl the | ||
986 | * second time when it eventually gets issued. Set it when a bio | ||
987 | * is being charged to a tg. | ||
988 | * | ||
989 | * Dispatch stats aren't recursive and each @bio should only be | ||
990 | * accounted by the @tg it was originally associated with. Let's | ||
991 | * update the stats when setting REQ_THROTTLED for the first time | ||
992 | * which is guaranteed to be for the @bio's original tg. | ||
993 | */ | ||
994 | if (!(bio->bi_rw & REQ_THROTTLED)) { | ||
995 | bio->bi_rw |= REQ_THROTTLED; | ||
996 | throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, | ||
997 | bio->bi_rw); | ||
998 | } | ||
712 | } | 999 | } |
713 | 1000 | ||
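A hedged user-space model of the set-once accounting guard described in the comment above: a flag on the object ensures the bookkeeping runs only the first time it passes through, even though a throttled bio re-enters the same path at every level of the hierarchy. The types here are illustrative, not block-layer code.

#include <stdbool.h>

struct item {
        bool throttled;         /* plays the role of REQ_THROTTLED */
        unsigned long bytes;
};

static void charge(struct item *it, unsigned long *stat_bytes)
{
        if (!it->throttled) {
                it->throttled = true;   /* only the original group accounts it */
                *stat_bytes += it->bytes;
        }
}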
714 | static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, | 1001 | /** |
715 | struct bio *bio) | 1002 | * throtl_add_bio_tg - add a bio to the specified throtl_grp |
1003 | * @bio: bio to add | ||
1004 | * @qn: qnode to use | ||
1005 | * @tg: the target throtl_grp | ||
1006 | * | ||
1007 | * Add @bio to @tg's service_queue using @qn. If @qn is not specified, | ||
1008 | * tg->qnode_on_self[] is used. | ||
1009 | */ | ||
1010 | static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn, | ||
1011 | struct throtl_grp *tg) | ||
716 | { | 1012 | { |
1013 | struct throtl_service_queue *sq = &tg->service_queue; | ||
717 | bool rw = bio_data_dir(bio); | 1014 | bool rw = bio_data_dir(bio); |
718 | 1015 | ||
719 | bio_list_add(&tg->bio_lists[rw], bio); | 1016 | if (!qn) |
720 | /* Take a bio reference on tg */ | 1017 | qn = &tg->qnode_on_self[rw]; |
721 | blkg_get(tg_to_blkg(tg)); | 1018 | |
722 | tg->nr_queued[rw]++; | 1019 | /* |
723 | td->nr_queued[rw]++; | 1020 | * If @tg doesn't currently have any bios queued in the same |
724 | throtl_enqueue_tg(td, tg); | 1021 | * direction, queueing @bio can change when @tg should be |
1022 | * dispatched. Mark that @tg was empty. This is automatically | ||
1023 | * cleared on the next tg_update_disptime(). | ||
1024 | */ | ||
1025 | if (!sq->nr_queued[rw]) | ||
1026 | tg->flags |= THROTL_TG_WAS_EMPTY; | ||
1027 | |||
1028 | throtl_qnode_add_bio(bio, qn, &sq->queued[rw]); | ||
1029 | |||
1030 | sq->nr_queued[rw]++; | ||
1031 | throtl_enqueue_tg(tg); | ||
725 | } | 1032 | } |
726 | 1033 | ||
727 | static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg) | 1034 | static void tg_update_disptime(struct throtl_grp *tg) |
728 | { | 1035 | { |
1036 | struct throtl_service_queue *sq = &tg->service_queue; | ||
729 | unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; | 1037 | unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; |
730 | struct bio *bio; | 1038 | struct bio *bio; |
731 | 1039 | ||
732 | if ((bio = bio_list_peek(&tg->bio_lists[READ]))) | 1040 | if ((bio = throtl_peek_queued(&sq->queued[READ]))) |
733 | tg_may_dispatch(td, tg, bio, &read_wait); | 1041 | tg_may_dispatch(tg, bio, &read_wait); |
734 | 1042 | ||
735 | if ((bio = bio_list_peek(&tg->bio_lists[WRITE]))) | 1043 | if ((bio = throtl_peek_queued(&sq->queued[WRITE]))) |
736 | tg_may_dispatch(td, tg, bio, &write_wait); | 1044 | tg_may_dispatch(tg, bio, &write_wait); |
737 | 1045 | ||
738 | min_wait = min(read_wait, write_wait); | 1046 | min_wait = min(read_wait, write_wait); |
739 | disptime = jiffies + min_wait; | 1047 | disptime = jiffies + min_wait; |
740 | 1048 | ||
741 | /* Update dispatch time */ | 1049 | /* Update dispatch time */ |
742 | throtl_dequeue_tg(td, tg); | 1050 | throtl_dequeue_tg(tg); |
743 | tg->disptime = disptime; | 1051 | tg->disptime = disptime; |
744 | throtl_enqueue_tg(td, tg); | 1052 | throtl_enqueue_tg(tg); |
1053 | |||
1054 | /* see throtl_add_bio_tg() */ | ||
1055 | tg->flags &= ~THROTL_TG_WAS_EMPTY; | ||
745 | } | 1056 | } |
746 | 1057 | ||
747 | static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg, | 1058 | static void start_parent_slice_with_credit(struct throtl_grp *child_tg, |
748 | bool rw, struct bio_list *bl) | 1059 | struct throtl_grp *parent_tg, bool rw) |
749 | { | 1060 | { |
750 | struct bio *bio; | 1061 | if (throtl_slice_used(parent_tg, rw)) { |
1062 | throtl_start_new_slice_with_credit(parent_tg, rw, | ||
1063 | child_tg->slice_start[rw]); | ||
1064 | } | ||
1065 | |||
1066 | } | ||
751 | 1067 | ||
752 | bio = bio_list_pop(&tg->bio_lists[rw]); | 1068 | static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw) |
753 | tg->nr_queued[rw]--; | 1069 | { |
754 | /* Drop bio reference on blkg */ | 1070 | struct throtl_service_queue *sq = &tg->service_queue; |
755 | blkg_put(tg_to_blkg(tg)); | 1071 | struct throtl_service_queue *parent_sq = sq->parent_sq; |
1072 | struct throtl_grp *parent_tg = sq_to_tg(parent_sq); | ||
1073 | struct throtl_grp *tg_to_put = NULL; | ||
1074 | struct bio *bio; | ||
756 | 1075 | ||
757 | BUG_ON(td->nr_queued[rw] <= 0); | 1076 | /* |
758 | td->nr_queued[rw]--; | 1077 | * @bio is being transferred from @tg to @parent_sq. Popping a bio |
1078 | * from @tg may put its reference and @parent_sq might end up | ||
1079 | * getting released prematurely. Remember the tg to put and put it | ||
1080 | * after @bio is transferred to @parent_sq. | ||
1081 | */ | ||
1082 | bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put); | ||
1083 | sq->nr_queued[rw]--; | ||
759 | 1084 | ||
760 | throtl_charge_bio(tg, bio); | 1085 | throtl_charge_bio(tg, bio); |
761 | bio_list_add(bl, bio); | ||
762 | bio->bi_rw |= REQ_THROTTLED; | ||
763 | 1086 | ||
764 | throtl_trim_slice(td, tg, rw); | 1087 | /* |
1088 | * If our parent is another tg, we just need to transfer @bio to | ||
1089 | * the parent using throtl_add_bio_tg(). If our parent is | ||
1090 | * @td->service_queue, @bio is ready to be issued. Put it on its | ||
1091 | * bio_lists[] and decrease total number queued. The caller is | ||
1092 | * responsible for issuing these bios. | ||
1093 | */ | ||
1094 | if (parent_tg) { | ||
1095 | throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg); | ||
1096 | start_parent_slice_with_credit(tg, parent_tg, rw); | ||
1097 | } else { | ||
1098 | throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw], | ||
1099 | &parent_sq->queued[rw]); | ||
1100 | BUG_ON(tg->td->nr_queued[rw] <= 0); | ||
1101 | tg->td->nr_queued[rw]--; | ||
1102 | } | ||
1103 | |||
1104 | throtl_trim_slice(tg, rw); | ||
1105 | |||
1106 | if (tg_to_put) | ||
1107 | blkg_put(tg_to_blkg(tg_to_put)); | ||
765 | } | 1108 | } |
766 | 1109 | ||
767 | static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg, | 1110 | static int throtl_dispatch_tg(struct throtl_grp *tg) |
768 | struct bio_list *bl) | ||
769 | { | 1111 | { |
1112 | struct throtl_service_queue *sq = &tg->service_queue; | ||
770 | unsigned int nr_reads = 0, nr_writes = 0; | 1113 | unsigned int nr_reads = 0, nr_writes = 0; |
771 | unsigned int max_nr_reads = throtl_grp_quantum*3/4; | 1114 | unsigned int max_nr_reads = throtl_grp_quantum*3/4; |
772 | unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads; | 1115 | unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads; |
@@ -774,20 +1117,20 @@ static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg, | |||
774 | 1117 | ||
775 | /* Try to dispatch 75% READS and 25% WRITES */ | 1118 | /* Try to dispatch 75% READS and 25% WRITES */ |
776 | 1119 | ||
777 | while ((bio = bio_list_peek(&tg->bio_lists[READ])) | 1120 | while ((bio = throtl_peek_queued(&sq->queued[READ])) && |
778 | && tg_may_dispatch(td, tg, bio, NULL)) { | 1121 | tg_may_dispatch(tg, bio, NULL)) { |
779 | 1122 | ||
780 | tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl); | 1123 | tg_dispatch_one_bio(tg, bio_data_dir(bio)); |
781 | nr_reads++; | 1124 | nr_reads++; |
782 | 1125 | ||
783 | if (nr_reads >= max_nr_reads) | 1126 | if (nr_reads >= max_nr_reads) |
784 | break; | 1127 | break; |
785 | } | 1128 | } |
786 | 1129 | ||
787 | while ((bio = bio_list_peek(&tg->bio_lists[WRITE])) | 1130 | while ((bio = throtl_peek_queued(&sq->queued[WRITE])) && |
788 | && tg_may_dispatch(td, tg, bio, NULL)) { | 1131 | tg_may_dispatch(tg, bio, NULL)) { |
789 | 1132 | ||
790 | tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl); | 1133 | tg_dispatch_one_bio(tg, bio_data_dir(bio)); |
791 | nr_writes++; | 1134 | nr_writes++; |
792 | 1135 | ||
793 | if (nr_writes >= max_nr_writes) | 1136 | if (nr_writes >= max_nr_writes) |
@@ -797,14 +1140,13 @@ static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg, | |||
797 | return nr_reads + nr_writes; | 1140 | return nr_reads + nr_writes; |
798 | } | 1141 | } |
799 | 1142 | ||
800 | static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl) | 1143 | static int throtl_select_dispatch(struct throtl_service_queue *parent_sq) |
801 | { | 1144 | { |
802 | unsigned int nr_disp = 0; | 1145 | unsigned int nr_disp = 0; |
803 | struct throtl_grp *tg; | ||
804 | struct throtl_rb_root *st = &td->tg_service_tree; | ||
805 | 1146 | ||
806 | while (1) { | 1147 | while (1) { |
807 | tg = throtl_rb_first(st); | 1148 | struct throtl_grp *tg = throtl_rb_first(parent_sq); |
1149 | struct throtl_service_queue *sq = &tg->service_queue; | ||
808 | 1150 | ||
809 | if (!tg) | 1151 | if (!tg) |
810 | break; | 1152 | break; |
@@ -812,14 +1154,12 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl) | |||
812 | if (time_before(jiffies, tg->disptime)) | 1154 | if (time_before(jiffies, tg->disptime)) |
813 | break; | 1155 | break; |
814 | 1156 | ||
815 | throtl_dequeue_tg(td, tg); | 1157 | throtl_dequeue_tg(tg); |
816 | 1158 | ||
817 | nr_disp += throtl_dispatch_tg(td, tg, bl); | 1159 | nr_disp += throtl_dispatch_tg(tg); |
818 | 1160 | ||
819 | if (tg->nr_queued[0] || tg->nr_queued[1]) { | 1161 | if (sq->nr_queued[0] || sq->nr_queued[1]) |
820 | tg_update_disptime(td, tg); | 1162 | tg_update_disptime(tg); |
821 | throtl_enqueue_tg(td, tg); | ||
822 | } | ||
823 | 1163 | ||
824 | if (nr_disp >= throtl_quantum) | 1164 | if (nr_disp >= throtl_quantum) |
825 | break; | 1165 | break; |
@@ -828,111 +1168,111 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl) | |||
828 | return nr_disp; | 1168 | return nr_disp; |
829 | } | 1169 | } |
830 | 1170 | ||
831 | static void throtl_process_limit_change(struct throtl_data *td) | 1171 | /** |
1172 | * throtl_pending_timer_fn - timer function for service_queue->pending_timer | ||
1173 | * @arg: the throtl_service_queue being serviced | ||
1174 | * | ||
1175 | * This timer is armed when a child throtl_grp with active bio's becomes | ||
1176 | * pending and queued on the service_queue's pending_tree and expires when | ||
1177 | * the first child throtl_grp should be dispatched. This function | ||
1178 | * dispatches bio's from the children throtl_grps to the parent | ||
1179 | * service_queue. | ||
1180 | * | ||
1181 | * If the parent's parent is another throtl_grp, dispatching is propagated | ||
1182 | * by either arming its pending_timer or repeating dispatch directly. If | ||
1183 | * the top-level service_tree is reached, throtl_data->dispatch_work is | ||
1184 | * kicked so that the ready bio's are issued. | ||
1185 | */ | ||
1186 | static void throtl_pending_timer_fn(unsigned long arg) | ||
832 | { | 1187 | { |
1188 | struct throtl_service_queue *sq = (void *)arg; | ||
1189 | struct throtl_grp *tg = sq_to_tg(sq); | ||
1190 | struct throtl_data *td = sq_to_td(sq); | ||
833 | struct request_queue *q = td->queue; | 1191 | struct request_queue *q = td->queue; |
834 | struct blkcg_gq *blkg, *n; | 1192 | struct throtl_service_queue *parent_sq; |
835 | 1193 | bool dispatched; | |
836 | if (!td->limits_changed) | 1194 | int ret; |
837 | return; | ||
838 | |||
839 | xchg(&td->limits_changed, false); | ||
840 | |||
841 | throtl_log(td, "limits changed"); | ||
842 | |||
843 | list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { | ||
844 | struct throtl_grp *tg = blkg_to_tg(blkg); | ||
845 | 1195 | ||
846 | if (!tg->limits_changed) | 1196 | spin_lock_irq(q->queue_lock); |
847 | continue; | 1197 | again: |
1198 | parent_sq = sq->parent_sq; | ||
1199 | dispatched = false; | ||
1200 | |||
1201 | while (true) { | ||
1202 | throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u", | ||
1203 | sq->nr_queued[READ] + sq->nr_queued[WRITE], | ||
1204 | sq->nr_queued[READ], sq->nr_queued[WRITE]); | ||
1205 | |||
1206 | ret = throtl_select_dispatch(sq); | ||
1207 | if (ret) { | ||
1208 | throtl_log(sq, "bios disp=%u", ret); | ||
1209 | dispatched = true; | ||
1210 | } | ||
848 | 1211 | ||
849 | if (!xchg(&tg->limits_changed, false)) | 1212 | if (throtl_schedule_next_dispatch(sq, false)) |
850 | continue; | 1213 | break; |
851 | 1214 | ||
852 | throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu" | 1215 | /* this dispatch window is still open, relax and repeat */ |
853 | " riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE], | 1216 | spin_unlock_irq(q->queue_lock); |
854 | tg->iops[READ], tg->iops[WRITE]); | 1217 | cpu_relax(); |
1218 | spin_lock_irq(q->queue_lock); | ||
1219 | } | ||
855 | 1220 | ||
856 | /* | 1221 | if (!dispatched) |
857 | * Restart the slices for both READ and WRITES. It | 1222 | goto out_unlock; |
858 | * might happen that a group's limit are dropped | ||
859 | * suddenly and we don't want to account recently | ||
860 | * dispatched IO with new low rate | ||
861 | */ | ||
862 | throtl_start_new_slice(td, tg, 0); | ||
863 | throtl_start_new_slice(td, tg, 1); | ||
864 | 1223 | ||
865 | if (throtl_tg_on_rr(tg)) | 1224 | if (parent_sq) { |
866 | tg_update_disptime(td, tg); | 1225 | /* @parent_sq is another throtl_grp, propagate dispatch */ |
1226 | if (tg->flags & THROTL_TG_WAS_EMPTY) { | ||
1227 | tg_update_disptime(tg); | ||
1228 | if (!throtl_schedule_next_dispatch(parent_sq, false)) { | ||
1229 | /* window is already open, repeat dispatching */ | ||
1230 | sq = parent_sq; | ||
1231 | tg = sq_to_tg(sq); | ||
1232 | goto again; | ||
1233 | } | ||
1234 | } | ||
1235 | } else { | ||
1236 | /* reached the top-level, queue issuing */ | ||
1237 | queue_work(kthrotld_workqueue, &td->dispatch_work); | ||
867 | } | 1238 | } |
1239 | out_unlock: | ||
1240 | spin_unlock_irq(q->queue_lock); | ||
868 | } | 1241 | } |
869 | 1242 | ||
870 | /* Dispatch throttled bios. Should be called without queue lock held. */ | 1243 | /** |
871 | static int throtl_dispatch(struct request_queue *q) | 1244 | * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work |
1245 | * @work: work item being executed | ||
1246 | * | ||
1247 | * This function is queued for execution when bio's reach the bio_lists[] | ||
1248 | * of throtl_data->service_queue. Those bio's are ready and issued by this | ||
1249 | * function. | ||
1250 | */ | ||
1251 | void blk_throtl_dispatch_work_fn(struct work_struct *work) | ||
872 | { | 1252 | { |
873 | struct throtl_data *td = q->td; | 1253 | struct throtl_data *td = container_of(work, struct throtl_data, |
874 | unsigned int nr_disp = 0; | 1254 | dispatch_work); |
1255 | struct throtl_service_queue *td_sq = &td->service_queue; | ||
1256 | struct request_queue *q = td->queue; | ||
875 | struct bio_list bio_list_on_stack; | 1257 | struct bio_list bio_list_on_stack; |
876 | struct bio *bio; | 1258 | struct bio *bio; |
877 | struct blk_plug plug; | 1259 | struct blk_plug plug; |
878 | 1260 | int rw; | |
879 | spin_lock_irq(q->queue_lock); | ||
880 | |||
881 | throtl_process_limit_change(td); | ||
882 | |||
883 | if (!total_nr_queued(td)) | ||
884 | goto out; | ||
885 | 1261 | ||
886 | bio_list_init(&bio_list_on_stack); | 1262 | bio_list_init(&bio_list_on_stack); |
887 | 1263 | ||
888 | throtl_log(td, "dispatch nr_queued=%u read=%u write=%u", | 1264 | spin_lock_irq(q->queue_lock); |
889 | total_nr_queued(td), td->nr_queued[READ], | 1265 | for (rw = READ; rw <= WRITE; rw++) |
890 | td->nr_queued[WRITE]); | 1266 | while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL))) |
891 | 1267 | bio_list_add(&bio_list_on_stack, bio); | |
892 | nr_disp = throtl_select_dispatch(td, &bio_list_on_stack); | ||
893 | |||
894 | if (nr_disp) | ||
895 | throtl_log(td, "bios disp=%u", nr_disp); | ||
896 | |||
897 | throtl_schedule_next_dispatch(td); | ||
898 | out: | ||
899 | spin_unlock_irq(q->queue_lock); | 1268 | spin_unlock_irq(q->queue_lock); |
900 | 1269 | ||
901 | /* | 1270 | if (!bio_list_empty(&bio_list_on_stack)) { |
902 | * If we dispatched some requests, unplug the queue to make sure | ||
903 | * immediate dispatch | ||
904 | */ | ||
905 | if (nr_disp) { | ||
906 | blk_start_plug(&plug); | 1271 | blk_start_plug(&plug); |
907 | while((bio = bio_list_pop(&bio_list_on_stack))) | 1272 | while((bio = bio_list_pop(&bio_list_on_stack))) |
908 | generic_make_request(bio); | 1273 | generic_make_request(bio); |
909 | blk_finish_plug(&plug); | 1274 | blk_finish_plug(&plug); |
910 | } | 1275 | } |
911 | return nr_disp; | ||
912 | } | ||
913 | |||
914 | void blk_throtl_work(struct work_struct *work) | ||
915 | { | ||
916 | struct throtl_data *td = container_of(work, struct throtl_data, | ||
917 | throtl_work.work); | ||
918 | struct request_queue *q = td->queue; | ||
919 | |||
920 | throtl_dispatch(q); | ||
921 | } | ||
922 | |||
923 | /* Call with queue lock held */ | ||
924 | static void | ||
925 | throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay) | ||
926 | { | ||
927 | |||
928 | struct delayed_work *dwork = &td->throtl_work; | ||
929 | |||
930 | /* schedule work if limits changed even if no bio is queued */ | ||
931 | if (total_nr_queued(td) || td->limits_changed) { | ||
932 | mod_delayed_work(kthrotld_workqueue, dwork, delay); | ||
933 | throtl_log(td, "schedule work. delay=%lu jiffies=%lu", | ||
934 | delay, jiffies); | ||
935 | } | ||
936 | } | 1276 | } |
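
A note on the dispatch work function that ends just above: it drains td->service_queue's queued[] lists into an on-stack bio list while holding queue_lock, then issues the bios unlocked inside a plug. The stand-alone C sketch below shows only that lock/splice/unlock/issue shape; the node, queue_item() and dispatch_work() names are illustrative, and a pthread mutex stands in for q->queue_lock while plain list nodes stand in for bios.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
        int id;
        struct node *next;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *queued;             /* protected by lock */

static void queue_item(int id)
{
        struct node *n = malloc(sizeof(*n));

        if (!n)
                return;
        n->id = id;
        pthread_mutex_lock(&lock);
        n->next = queued;
        queued = n;
        pthread_mutex_unlock(&lock);
}

static void dispatch_work(void)
{
        struct node *local, *n;

        pthread_mutex_lock(&lock);
        local = queued;                 /* splice the whole list to the stack */
        queued = NULL;
        pthread_mutex_unlock(&lock);

        while ((n = local)) {           /* issue without holding the lock */
                local = n->next;
                printf("issuing %d\n", n->id);
                free(n);
        }
}

int main(void)
{
        queue_item(1);
        queue_item(2);
        dispatch_work();
        return 0;
}
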
937 | 1277 | ||
938 | static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, | 1278 | static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, |
@@ -1007,7 +1347,9 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf, | |||
1007 | struct blkcg *blkcg = cgroup_to_blkcg(cgrp); | 1347 | struct blkcg *blkcg = cgroup_to_blkcg(cgrp); |
1008 | struct blkg_conf_ctx ctx; | 1348 | struct blkg_conf_ctx ctx; |
1009 | struct throtl_grp *tg; | 1349 | struct throtl_grp *tg; |
1010 | struct throtl_data *td; | 1350 | struct throtl_service_queue *sq; |
1351 | struct blkcg_gq *blkg; | ||
1352 | struct cgroup *pos_cgrp; | ||
1011 | int ret; | 1353 | int ret; |
1012 | 1354 | ||
1013 | ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); | 1355 | ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); |
@@ -1015,7 +1357,7 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf, | |||
1015 | return ret; | 1357 | return ret; |
1016 | 1358 | ||
1017 | tg = blkg_to_tg(ctx.blkg); | 1359 | tg = blkg_to_tg(ctx.blkg); |
1018 | td = ctx.blkg->q->td; | 1360 | sq = &tg->service_queue; |
1019 | 1361 | ||
1020 | if (!ctx.v) | 1362 | if (!ctx.v) |
1021 | ctx.v = -1; | 1363 | ctx.v = -1; |
@@ -1025,10 +1367,37 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf, | |||
1025 | else | 1367 | else |
1026 | *(unsigned int *)((void *)tg + cft->private) = ctx.v; | 1368 | *(unsigned int *)((void *)tg + cft->private) = ctx.v; |
1027 | 1369 | ||
1028 | /* XXX: we don't need the following deferred processing */ | 1370 | throtl_log(&tg->service_queue, |
1029 | xchg(&tg->limits_changed, true); | 1371 | "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", |
1030 | xchg(&td->limits_changed, true); | 1372 | tg->bps[READ], tg->bps[WRITE], |
1031 | throtl_schedule_delayed_work(td, 0); | 1373 | tg->iops[READ], tg->iops[WRITE]); |
1374 | |||
1375 | /* | ||
1376 | * Update has_rules[] flags for the updated tg's subtree. A tg is | ||
1377 | * considered to have rules if either the tg itself or any of its | ||
1378 | * ancestors has rules. This identifies groups without any | ||
1379 | * restrictions in the whole hierarchy and allows them to bypass | ||
1380 | * blk-throttle. | ||
1381 | */ | ||
1382 | tg_update_has_rules(tg); | ||
1383 | blkg_for_each_descendant_pre(blkg, pos_cgrp, ctx.blkg) | ||
1384 | tg_update_has_rules(blkg_to_tg(blkg)); | ||
1385 | |||
1386 | /* | ||
1387 | * We're already holding queue_lock and know @tg is valid. Let's | ||
1388 | * apply the new config directly. | ||
1389 | * | ||
1390 | * Restart the slices for both READ and WRITE. It might happen | ||
1391 | * that a group's limits are dropped suddenly and we don't want to | ||
1392 | * account recently dispatched IO with new low rate. | ||
1393 | */ | ||
1394 | throtl_start_new_slice(tg, 0); | ||
1395 | throtl_start_new_slice(tg, 1); | ||
1396 | |||
1397 | if (tg->flags & THROTL_TG_PENDING) { | ||
1398 | tg_update_disptime(tg); | ||
1399 | throtl_schedule_next_dispatch(sq->parent_sq, true); | ||
1400 | } | ||
1032 | 1401 | ||
1033 | blkg_conf_finish(&ctx); | 1402 | blkg_conf_finish(&ctx); |
1034 | return 0; | 1403 | return 0; |
@@ -1092,7 +1461,7 @@ static void throtl_shutdown_wq(struct request_queue *q) | |||
1092 | { | 1461 | { |
1093 | struct throtl_data *td = q->td; | 1462 | struct throtl_data *td = q->td; |
1094 | 1463 | ||
1095 | cancel_delayed_work_sync(&td->throtl_work); | 1464 | cancel_work_sync(&td->dispatch_work); |
1096 | } | 1465 | } |
1097 | 1466 | ||
1098 | static struct blkcg_policy blkcg_policy_throtl = { | 1467 | static struct blkcg_policy blkcg_policy_throtl = { |
@@ -1100,6 +1469,7 @@ static struct blkcg_policy blkcg_policy_throtl = { | |||
1100 | .cftypes = throtl_files, | 1469 | .cftypes = throtl_files, |
1101 | 1470 | ||
1102 | .pd_init_fn = throtl_pd_init, | 1471 | .pd_init_fn = throtl_pd_init, |
1472 | .pd_online_fn = throtl_pd_online, | ||
1103 | .pd_exit_fn = throtl_pd_exit, | 1473 | .pd_exit_fn = throtl_pd_exit, |
1104 | .pd_reset_stats_fn = throtl_pd_reset_stats, | 1474 | .pd_reset_stats_fn = throtl_pd_reset_stats, |
1105 | }; | 1475 | }; |
@@ -1107,15 +1477,16 @@ static struct blkcg_policy blkcg_policy_throtl = { | |||
1107 | bool blk_throtl_bio(struct request_queue *q, struct bio *bio) | 1477 | bool blk_throtl_bio(struct request_queue *q, struct bio *bio) |
1108 | { | 1478 | { |
1109 | struct throtl_data *td = q->td; | 1479 | struct throtl_data *td = q->td; |
1480 | struct throtl_qnode *qn = NULL; | ||
1110 | struct throtl_grp *tg; | 1481 | struct throtl_grp *tg; |
1111 | bool rw = bio_data_dir(bio), update_disptime = true; | 1482 | struct throtl_service_queue *sq; |
1483 | bool rw = bio_data_dir(bio); | ||
1112 | struct blkcg *blkcg; | 1484 | struct blkcg *blkcg; |
1113 | bool throttled = false; | 1485 | bool throttled = false; |
1114 | 1486 | ||
1115 | if (bio->bi_rw & REQ_THROTTLED) { | 1487 | /* see throtl_charge_bio() */ |
1116 | bio->bi_rw &= ~REQ_THROTTLED; | 1488 | if (bio->bi_rw & REQ_THROTTLED) |
1117 | goto out; | 1489 | goto out; |
1118 | } | ||
1119 | 1490 | ||
1120 | /* | 1491 | /* |
1121 | * A throtl_grp pointer retrieved under rcu can be used to access | 1492 | * A throtl_grp pointer retrieved under rcu can be used to access |
@@ -1126,7 +1497,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) | |||
1126 | blkcg = bio_blkcg(bio); | 1497 | blkcg = bio_blkcg(bio); |
1127 | tg = throtl_lookup_tg(td, blkcg); | 1498 | tg = throtl_lookup_tg(td, blkcg); |
1128 | if (tg) { | 1499 | if (tg) { |
1129 | if (tg_no_rule_group(tg, rw)) { | 1500 | if (!tg->has_rules[rw]) { |
1130 | throtl_update_dispatch_stats(tg_to_blkg(tg), | 1501 | throtl_update_dispatch_stats(tg_to_blkg(tg), |
1131 | bio->bi_size, bio->bi_rw); | 1502 | bio->bi_size, bio->bi_rw); |
1132 | goto out_unlock_rcu; | 1503 | goto out_unlock_rcu; |
@@ -1142,18 +1513,18 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) | |||
1142 | if (unlikely(!tg)) | 1513 | if (unlikely(!tg)) |
1143 | goto out_unlock; | 1514 | goto out_unlock; |
1144 | 1515 | ||
1145 | if (tg->nr_queued[rw]) { | 1516 | sq = &tg->service_queue; |
1146 | /* | ||
1147 | * There is already another bio queued in same dir. No | ||
1148 | * need to update dispatch time. | ||
1149 | */ | ||
1150 | update_disptime = false; | ||
1151 | goto queue_bio; | ||
1152 | 1517 | ||
1153 | } | 1518 | while (true) { |
1519 | /* throtl is FIFO - if bios are already queued, should queue */ | ||
1520 | if (sq->nr_queued[rw]) | ||
1521 | break; | ||
1522 | |||
1523 | /* if above limits, break to queue */ | ||
1524 | if (!tg_may_dispatch(tg, bio, NULL)) | ||
1525 | break; | ||
1154 | 1526 | ||
1155 | /* Bio is with-in rate limit of group */ | 1527 | /* within limits, let's charge and dispatch directly */ |
1156 | if (tg_may_dispatch(td, tg, bio, NULL)) { | ||
1157 | throtl_charge_bio(tg, bio); | 1528 | throtl_charge_bio(tg, bio); |
1158 | 1529 | ||
1159 | /* | 1530 | /* |
@@ -1167,25 +1538,41 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) | |||
1167 | * | 1538 | * |
1168 | * So keep on trimming slice even if bio is not queued. | 1539 | * So keep on trimming slice even if bio is not queued. |
1169 | */ | 1540 | */ |
1170 | throtl_trim_slice(td, tg, rw); | 1541 | throtl_trim_slice(tg, rw); |
1171 | goto out_unlock; | 1542 | |
1543 | /* | ||
1544 | * @bio passed through this layer without being throttled. | ||
1545 | * Climb up the ladder. If we're already at the top, it | ||
1546 | * can be executed directly. | ||
1547 | */ | ||
1548 | qn = &tg->qnode_on_parent[rw]; | ||
1549 | sq = sq->parent_sq; | ||
1550 | tg = sq_to_tg(sq); | ||
1551 | if (!tg) | ||
1552 | goto out_unlock; | ||
1172 | } | 1553 | } |
1173 | 1554 | ||
1174 | queue_bio: | 1555 | /* out-of-limit, queue to @tg */ |
1175 | throtl_log_tg(td, tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu" | 1556 | throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d", |
1176 | " iodisp=%u iops=%u queued=%d/%d", | 1557 | rw == READ ? 'R' : 'W', |
1177 | rw == READ ? 'R' : 'W', | 1558 | tg->bytes_disp[rw], bio->bi_size, tg->bps[rw], |
1178 | tg->bytes_disp[rw], bio->bi_size, tg->bps[rw], | 1559 | tg->io_disp[rw], tg->iops[rw], |
1179 | tg->io_disp[rw], tg->iops[rw], | 1560 | sq->nr_queued[READ], sq->nr_queued[WRITE]); |
1180 | tg->nr_queued[READ], tg->nr_queued[WRITE]); | ||
1181 | 1561 | ||
1182 | bio_associate_current(bio); | 1562 | bio_associate_current(bio); |
1183 | throtl_add_bio_tg(q->td, tg, bio); | 1563 | tg->td->nr_queued[rw]++; |
1564 | throtl_add_bio_tg(bio, qn, tg); | ||
1184 | throttled = true; | 1565 | throttled = true; |
1185 | 1566 | ||
1186 | if (update_disptime) { | 1567 | /* |
1187 | tg_update_disptime(td, tg); | 1568 | * Update @tg's dispatch time and force schedule dispatch if @tg |
1188 | throtl_schedule_next_dispatch(td); | 1569 | * was empty before @bio. The forced scheduling isn't likely to |
1570 | * cause undue delay as @bio is likely to be dispatched directly if | ||
1571 | * its @tg's disptime is not in the future. | ||
1572 | */ | ||
1573 | if (tg->flags & THROTL_TG_WAS_EMPTY) { | ||
1574 | tg_update_disptime(tg); | ||
1575 | throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true); | ||
1189 | } | 1576 | } |
1190 | 1577 | ||
1191 | out_unlock: | 1578 | out_unlock: |
@@ -1193,9 +1580,38 @@ out_unlock: | |||
1193 | out_unlock_rcu: | 1580 | out_unlock_rcu: |
1194 | rcu_read_unlock(); | 1581 | rcu_read_unlock(); |
1195 | out: | 1582 | out: |
1583 | /* | ||
1584 | * As multiple blk-throtls may stack in the same issue path, we | ||
1585 | * don't want bios to leave with the flag set. Clear the flag if | ||
1586 | * being issued. | ||
1587 | */ | ||
1588 | if (!throttled) | ||
1589 | bio->bi_rw &= ~REQ_THROTTLED; | ||
1196 | return throttled; | 1590 | return throttled; |
1197 | } | 1591 | } |
1198 | 1592 | ||
1593 | /* | ||
1594 | * Dispatch all bios from all children tg's queued on @parent_sq. On | ||
1595 | * return, @parent_sq is guaranteed to not have any active children tg's | ||
1596 | * and all bios from previously active tg's are on @parent_sq->bio_lists[]. | ||
1597 | */ | ||
1598 | static void tg_drain_bios(struct throtl_service_queue *parent_sq) | ||
1599 | { | ||
1600 | struct throtl_grp *tg; | ||
1601 | |||
1602 | while ((tg = throtl_rb_first(parent_sq))) { | ||
1603 | struct throtl_service_queue *sq = &tg->service_queue; | ||
1604 | struct bio *bio; | ||
1605 | |||
1606 | throtl_dequeue_tg(tg); | ||
1607 | |||
1608 | while ((bio = throtl_peek_queued(&sq->queued[READ]))) | ||
1609 | tg_dispatch_one_bio(tg, bio_data_dir(bio)); | ||
1610 | while ((bio = throtl_peek_queued(&sq->queued[WRITE]))) | ||
1611 | tg_dispatch_one_bio(tg, bio_data_dir(bio)); | ||
1612 | } | ||
1613 | } | ||
1614 | |||
1199 | /** | 1615 | /** |
1200 | * blk_throtl_drain - drain throttled bios | 1616 | * blk_throtl_drain - drain throttled bios |
1201 | * @q: request_queue to drain throttled bios for | 1617 | * @q: request_queue to drain throttled bios for |
@@ -1206,27 +1622,36 @@ void blk_throtl_drain(struct request_queue *q) | |||
1206 | __releases(q->queue_lock) __acquires(q->queue_lock) | 1622 | __releases(q->queue_lock) __acquires(q->queue_lock) |
1207 | { | 1623 | { |
1208 | struct throtl_data *td = q->td; | 1624 | struct throtl_data *td = q->td; |
1209 | struct throtl_rb_root *st = &td->tg_service_tree; | 1625 | struct blkcg_gq *blkg; |
1210 | struct throtl_grp *tg; | 1626 | struct cgroup *pos_cgrp; |
1211 | struct bio_list bl; | ||
1212 | struct bio *bio; | 1627 | struct bio *bio; |
1628 | int rw; | ||
1213 | 1629 | ||
1214 | queue_lockdep_assert_held(q); | 1630 | queue_lockdep_assert_held(q); |
1631 | rcu_read_lock(); | ||
1632 | |||
1633 | /* | ||
1634 | * Drain each tg while doing post-order walk on the blkg tree, so | ||
1635 | * that all bios are propagated to td->service_queue. It'd be | ||
1636 | * better to walk service_queue tree directly but blkg walk is | ||
1637 | * easier. | ||
1638 | */ | ||
1639 | blkg_for_each_descendant_post(blkg, pos_cgrp, td->queue->root_blkg) | ||
1640 | tg_drain_bios(&blkg_to_tg(blkg)->service_queue); | ||
1215 | 1641 | ||
1216 | bio_list_init(&bl); | 1642 | tg_drain_bios(&td_root_tg(td)->service_queue); |
1217 | 1643 | ||
1218 | while ((tg = throtl_rb_first(st))) { | 1644 | /* finally, transfer bios from top-level tg's into the td */ |
1219 | throtl_dequeue_tg(td, tg); | 1645 | tg_drain_bios(&td->service_queue); |
1220 | 1646 | ||
1221 | while ((bio = bio_list_peek(&tg->bio_lists[READ]))) | 1647 | rcu_read_unlock(); |
1222 | tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl); | ||
1223 | while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))) | ||
1224 | tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl); | ||
1225 | } | ||
1226 | spin_unlock_irq(q->queue_lock); | 1648 | spin_unlock_irq(q->queue_lock); |
1227 | 1649 | ||
1228 | while ((bio = bio_list_pop(&bl))) | 1650 | /* all bios now should be in td->service_queue, issue them */ |
1229 | generic_make_request(bio); | 1651 | for (rw = READ; rw <= WRITE; rw++) |
1652 | while ((bio = throtl_pop_queued(&td->service_queue.queued[rw], | ||
1653 | NULL))) | ||
1654 | generic_make_request(bio); | ||
1230 | 1655 | ||
1231 | spin_lock_irq(q->queue_lock); | 1656 | spin_lock_irq(q->queue_lock); |
1232 | } | 1657 | } |
@@ -1240,9 +1665,8 @@ int blk_throtl_init(struct request_queue *q) | |||
1240 | if (!td) | 1665 | if (!td) |
1241 | return -ENOMEM; | 1666 | return -ENOMEM; |
1242 | 1667 | ||
1243 | td->tg_service_tree = THROTL_RB_ROOT; | 1668 | INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); |
1244 | td->limits_changed = false; | 1669 | throtl_service_queue_init(&td->service_queue, NULL); |
1245 | INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); | ||
1246 | 1670 | ||
1247 | q->td = td; | 1671 | q->td = td; |
1248 | td->queue = q; | 1672 | td->queue = q; |
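
Taken together, the blk-throttle changes in this file replace the flat per-device dispatch with a tree of per-group service queues: a bio is charged at its own group, then climbs to each ancestor in turn, and only reaches the issue path once every level admits it; the first level that is out of budget queues it, and the pending timers and qnodes seen above later propagate it upward. The user-space sketch below models just that admission walk under simplified byte budgets; struct grp, may_dispatch() and charge_up_the_ladder() are illustrative names rather than kernel API, and the FIFO check for already-queued bios is omitted.

#include <stdbool.h>
#include <stdio.h>

struct grp {
        const char *name;
        unsigned long bps_limit;        /* bytes allowed per slice */
        unsigned long bytes_disp;       /* bytes already charged this slice */
        struct grp *parent;             /* NULL at the top level */
};

static bool may_dispatch(const struct grp *g, unsigned long bio_size)
{
        return g->bytes_disp + bio_size <= g->bps_limit;
}

/* Returns NULL if the bio may be issued, or the group it has to queue on. */
static struct grp *charge_up_the_ladder(struct grp *g, unsigned long bio_size)
{
        while (g) {
                if (!may_dispatch(g, bio_size))
                        return g;               /* over budget: queue here */
                g->bytes_disp += bio_size;      /* within limit: charge and climb */
                g = g->parent;
        }
        return NULL;                            /* every level agreed, issue it */
}

int main(void)
{
        struct grp root  = { "root",  1024 * 1024, 0, NULL };
        struct grp child = { "child",  256 * 1024, 0, &root };
        struct grp *blocker;

        blocker = charge_up_the_ladder(&child, 128 * 1024);
        printf("first bio:  %s\n", blocker ? blocker->name : "issued");

        blocker = charge_up_the_ladder(&child, 256 * 1024);
        printf("second bio: %s\n", blocker ? blocker->name : "issued");
        return 0;
}
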
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index b81ddfea1da0..e07a5fd58ad7 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig | |||
@@ -532,11 +532,11 @@ config BLK_DEV_RBD | |||
532 | If unsure, say N. | 532 | If unsure, say N. |
533 | 533 | ||
534 | config BLK_DEV_RSXX | 534 | config BLK_DEV_RSXX |
535 | tristate "IBM FlashSystem 70/80 PCIe SSD Device Driver" | 535 | tristate "IBM Flash Adapter 900GB Full Height PCIe Device Driver" |
536 | depends on PCI | 536 | depends on PCI |
537 | help | 537 | help |
538 | Device driver for IBM's high speed PCIe SSD | 538 | Device driver for IBM's high speed PCIe SSD |
539 | storage devices: FlashSystem-70 and FlashSystem-80. | 539 | storage device: Flash Adapter 900GB Full Height. |
540 | 540 | ||
541 | To compile this driver as a module, choose M here: the | 541 | To compile this driver as a module, choose M here: the |
542 | module will be called rsxx. | 542 | module will be called rsxx. |
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 6608076dc39e..28c73ca320a8 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c | |||
@@ -659,6 +659,27 @@ void drbd_al_shrink(struct drbd_conf *mdev) | |||
659 | wake_up(&mdev->al_wait); | 659 | wake_up(&mdev->al_wait); |
660 | } | 660 | } |
661 | 661 | ||
662 | int drbd_initialize_al(struct drbd_conf *mdev, void *buffer) | ||
663 | { | ||
664 | struct al_transaction_on_disk *al = buffer; | ||
665 | struct drbd_md *md = &mdev->ldev->md; | ||
666 | sector_t al_base = md->md_offset + md->al_offset; | ||
667 | int al_size_4k = md->al_stripes * md->al_stripe_size_4k; | ||
668 | int i; | ||
669 | |||
670 | memset(al, 0, 4096); | ||
671 | al->magic = cpu_to_be32(DRBD_AL_MAGIC); | ||
672 | al->transaction_type = cpu_to_be16(AL_TR_INITIALIZED); | ||
673 | al->crc32c = cpu_to_be32(crc32c(0, al, 4096)); | ||
674 | |||
675 | for (i = 0; i < al_size_4k; i++) { | ||
676 | int err = drbd_md_sync_page_io(mdev, mdev->ldev, al_base + i * 8, WRITE); | ||
677 | if (err) | ||
678 | return err; | ||
679 | } | ||
680 | return 0; | ||
681 | } | ||
682 | |||
662 | static int w_update_odbm(struct drbd_work *w, int unused) | 683 | static int w_update_odbm(struct drbd_work *w, int unused) |
663 | { | 684 | { |
664 | struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w); | 685 | struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w); |
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index f943aacfdad8..2d7f608d181c 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h | |||
@@ -832,6 +832,7 @@ struct drbd_tconn { /* is a resource from the config file */ | |||
832 | unsigned susp_nod:1; /* IO suspended because no data */ | 832 | unsigned susp_nod:1; /* IO suspended because no data */ |
833 | unsigned susp_fen:1; /* IO suspended because fence peer handler runs */ | 833 | unsigned susp_fen:1; /* IO suspended because fence peer handler runs */ |
834 | struct mutex cstate_mutex; /* Protects graceful disconnects */ | 834 | struct mutex cstate_mutex; /* Protects graceful disconnects */ |
835 | unsigned int connect_cnt; /* Inc each time a connection is established */ | ||
835 | 836 | ||
836 | unsigned long flags; | 837 | unsigned long flags; |
837 | struct net_conf *net_conf; /* content protected by rcu */ | 838 | struct net_conf *net_conf; /* content protected by rcu */ |
@@ -1132,6 +1133,7 @@ extern void drbd_mdev_cleanup(struct drbd_conf *mdev); | |||
1132 | void drbd_print_uuids(struct drbd_conf *mdev, const char *text); | 1133 | void drbd_print_uuids(struct drbd_conf *mdev, const char *text); |
1133 | 1134 | ||
1134 | extern void conn_md_sync(struct drbd_tconn *tconn); | 1135 | extern void conn_md_sync(struct drbd_tconn *tconn); |
1136 | extern void drbd_md_write(struct drbd_conf *mdev, void *buffer); | ||
1135 | extern void drbd_md_sync(struct drbd_conf *mdev); | 1137 | extern void drbd_md_sync(struct drbd_conf *mdev); |
1136 | extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev); | 1138 | extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev); |
1137 | extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); | 1139 | extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); |
@@ -1466,8 +1468,16 @@ extern void drbd_suspend_io(struct drbd_conf *mdev); | |||
1466 | extern void drbd_resume_io(struct drbd_conf *mdev); | 1468 | extern void drbd_resume_io(struct drbd_conf *mdev); |
1467 | extern char *ppsize(char *buf, unsigned long long size); | 1469 | extern char *ppsize(char *buf, unsigned long long size); |
1468 | extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, sector_t, int); | 1470 | extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, sector_t, int); |
1469 | enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 }; | 1471 | enum determine_dev_size { |
1470 | extern enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local); | 1472 | DS_ERROR_SHRINK = -3, |
1473 | DS_ERROR_SPACE_MD = -2, | ||
1474 | DS_ERROR = -1, | ||
1475 | DS_UNCHANGED = 0, | ||
1476 | DS_SHRUNK = 1, | ||
1477 | DS_GREW = 2 | ||
1478 | }; | ||
1479 | extern enum determine_dev_size | ||
1480 | drbd_determine_dev_size(struct drbd_conf *, enum dds_flags, struct resize_parms *) __must_hold(local); | ||
1471 | extern void resync_after_online_grow(struct drbd_conf *); | 1481 | extern void resync_after_online_grow(struct drbd_conf *); |
1472 | extern void drbd_reconsider_max_bio_size(struct drbd_conf *mdev); | 1482 | extern void drbd_reconsider_max_bio_size(struct drbd_conf *mdev); |
1473 | extern enum drbd_state_rv drbd_set_role(struct drbd_conf *mdev, | 1483 | extern enum drbd_state_rv drbd_set_role(struct drbd_conf *mdev, |
@@ -1633,6 +1643,7 @@ extern int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, | |||
1633 | #define drbd_set_out_of_sync(mdev, sector, size) \ | 1643 | #define drbd_set_out_of_sync(mdev, sector, size) \ |
1634 | __drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__) | 1644 | __drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__) |
1635 | extern void drbd_al_shrink(struct drbd_conf *mdev); | 1645 | extern void drbd_al_shrink(struct drbd_conf *mdev); |
1646 | extern int drbd_initialize_al(struct drbd_conf *, void *); | ||
1636 | 1647 | ||
1637 | /* drbd_nl.c */ | 1648 | /* drbd_nl.c */ |
1638 | /* state info broadcast */ | 1649 | /* state info broadcast */ |
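
One detail worth calling out from the drbd_int.h hunk above: the new determine_dev_size enum places every failure at or below DS_ERROR (-1), which is what lets the updated drbd_adm_attach() check "dd <= DS_ERROR" catch DS_ERROR_SHRINK and DS_ERROR_SPACE_MD as well, while resize code can still match DS_GREW exactly. A freestanding sketch of that ordered-error convention follows; the enum values are copied from the patch, while describe() is only an illustrative caller, not DRBD code.

#include <stdio.h>

enum determine_dev_size {
        DS_ERROR_SHRINK = -3,
        DS_ERROR_SPACE_MD = -2,
        DS_ERROR = -1,
        DS_UNCHANGED = 0,
        DS_SHRUNK = 1,
        DS_GREW = 2
};

static const char *describe(enum determine_dev_size dd)
{
        if (dd <= DS_ERROR)     /* catches DS_ERROR, DS_ERROR_SPACE_MD, DS_ERROR_SHRINK */
                return "resize failed";
        if (dd == DS_GREW)
                return "device grew";
        if (dd == DS_SHRUNK)
                return "device shrank";
        return "size unchanged";
}

int main(void)
{
        printf("%s\n", describe(DS_ERROR_SHRINK));
        printf("%s\n", describe(DS_GREW));
        return 0;
}
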
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index a5dca6affcbb..55635edf563b 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c | |||
@@ -2762,8 +2762,6 @@ int __init drbd_init(void) | |||
2762 | /* | 2762 | /* |
2763 | * allocate all necessary structs | 2763 | * allocate all necessary structs |
2764 | */ | 2764 | */ |
2765 | err = -ENOMEM; | ||
2766 | |||
2767 | init_waitqueue_head(&drbd_pp_wait); | 2765 | init_waitqueue_head(&drbd_pp_wait); |
2768 | 2766 | ||
2769 | drbd_proc = NULL; /* play safe for drbd_cleanup */ | 2767 | drbd_proc = NULL; /* play safe for drbd_cleanup */ |
@@ -2773,6 +2771,7 @@ int __init drbd_init(void) | |||
2773 | if (err) | 2771 | if (err) |
2774 | goto fail; | 2772 | goto fail; |
2775 | 2773 | ||
2774 | err = -ENOMEM; | ||
2776 | drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL); | 2775 | drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL); |
2777 | if (!drbd_proc) { | 2776 | if (!drbd_proc) { |
2778 | printk(KERN_ERR "drbd: unable to register proc file\n"); | 2777 | printk(KERN_ERR "drbd: unable to register proc file\n"); |
@@ -2803,7 +2802,6 @@ int __init drbd_init(void) | |||
2803 | fail: | 2802 | fail: |
2804 | drbd_cleanup(); | 2803 | drbd_cleanup(); |
2805 | if (err == -ENOMEM) | 2804 | if (err == -ENOMEM) |
2806 | /* currently always the case */ | ||
2807 | printk(KERN_ERR "drbd: ran out of memory\n"); | 2805 | printk(KERN_ERR "drbd: ran out of memory\n"); |
2808 | else | 2806 | else |
2809 | printk(KERN_ERR "drbd: initialization failure\n"); | 2807 | printk(KERN_ERR "drbd: initialization failure\n"); |
@@ -2881,34 +2879,14 @@ struct meta_data_on_disk { | |||
2881 | u8 reserved_u8[4096 - (7*8 + 10*4)]; | 2879 | u8 reserved_u8[4096 - (7*8 + 10*4)]; |
2882 | } __packed; | 2880 | } __packed; |
2883 | 2881 | ||
2884 | /** | 2882 | |
2885 | * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set | 2883 | |
2886 | * @mdev: DRBD device. | 2884 | void drbd_md_write(struct drbd_conf *mdev, void *b) |
2887 | */ | ||
2888 | void drbd_md_sync(struct drbd_conf *mdev) | ||
2889 | { | 2885 | { |
2890 | struct meta_data_on_disk *buffer; | 2886 | struct meta_data_on_disk *buffer = b; |
2891 | sector_t sector; | 2887 | sector_t sector; |
2892 | int i; | 2888 | int i; |
2893 | 2889 | ||
2894 | /* Don't accidentally change the DRBD meta data layout. */ | ||
2895 | BUILD_BUG_ON(UI_SIZE != 4); | ||
2896 | BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096); | ||
2897 | |||
2898 | del_timer(&mdev->md_sync_timer); | ||
2899 | /* timer may be rearmed by drbd_md_mark_dirty() now. */ | ||
2900 | if (!test_and_clear_bit(MD_DIRTY, &mdev->flags)) | ||
2901 | return; | ||
2902 | |||
2903 | /* We use here D_FAILED and not D_ATTACHING because we try to write | ||
2904 | * metadata even if we detach due to a disk failure! */ | ||
2905 | if (!get_ldev_if_state(mdev, D_FAILED)) | ||
2906 | return; | ||
2907 | |||
2908 | buffer = drbd_md_get_buffer(mdev); | ||
2909 | if (!buffer) | ||
2910 | goto out; | ||
2911 | |||
2912 | memset(buffer, 0, sizeof(*buffer)); | 2890 | memset(buffer, 0, sizeof(*buffer)); |
2913 | 2891 | ||
2914 | buffer->la_size_sect = cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); | 2892 | buffer->la_size_sect = cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); |
@@ -2937,6 +2915,35 @@ void drbd_md_sync(struct drbd_conf *mdev) | |||
2937 | dev_err(DEV, "meta data update failed!\n"); | 2915 | dev_err(DEV, "meta data update failed!\n"); |
2938 | drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); | 2916 | drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); |
2939 | } | 2917 | } |
2918 | } | ||
2919 | |||
2920 | /** | ||
2921 | * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set | ||
2922 | * @mdev: DRBD device. | ||
2923 | */ | ||
2924 | void drbd_md_sync(struct drbd_conf *mdev) | ||
2925 | { | ||
2926 | struct meta_data_on_disk *buffer; | ||
2927 | |||
2928 | /* Don't accidentally change the DRBD meta data layout. */ | ||
2929 | BUILD_BUG_ON(UI_SIZE != 4); | ||
2930 | BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096); | ||
2931 | |||
2932 | del_timer(&mdev->md_sync_timer); | ||
2933 | /* timer may be rearmed by drbd_md_mark_dirty() now. */ | ||
2934 | if (!test_and_clear_bit(MD_DIRTY, &mdev->flags)) | ||
2935 | return; | ||
2936 | |||
2937 | /* We use here D_FAILED and not D_ATTACHING because we try to write | ||
2938 | * metadata even if we detach due to a disk failure! */ | ||
2939 | if (!get_ldev_if_state(mdev, D_FAILED)) | ||
2940 | return; | ||
2941 | |||
2942 | buffer = drbd_md_get_buffer(mdev); | ||
2943 | if (!buffer) | ||
2944 | goto out; | ||
2945 | |||
2946 | drbd_md_write(mdev, buffer); | ||
2940 | 2947 | ||
2941 | /* Update mdev->ldev->md.la_size_sect, | 2948 | /* Update mdev->ldev->md.la_size_sect, |
2942 | * since we updated it on metadata. */ | 2949 | * since we updated it on metadata. */ |
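
The drbd_main.c refactor above splits the old drbd_md_sync() into a thin guard (dirty-flag check plus buffer acquisition) and a new drbd_md_write() worker that writes the superblock from a buffer the caller already holds, which is what lets drbd_determine_dev_size() rewrite the metadata with MDF_PRIMARY_IND temporarily cleared. A minimal sketch of that wrapper/worker split; md_buffer, md_dirty, md_write() and md_sync() are made-up stand-ins rather than DRBD functions.

#include <stdbool.h>
#include <stdio.h>

static char md_buffer[4096];            /* stand-in for the meta-data buffer */
static bool md_dirty = true;            /* stand-in for the MD_DIRTY flag */

/* Worker: writes the super block from a buffer the caller already holds. */
static void md_write(char *buf)
{
        printf("writing super block from buffer %p\n", (void *)buf);
}

/* Wrapper: checks the dirty flag, then hands the buffer to the worker. */
static void md_sync(void)
{
        if (!md_dirty)
                return;
        md_dirty = false;
        md_write(md_buffer);
}

int main(void)
{
        md_write(md_buffer);    /* resize-style caller that already owns the buffer */
        md_sync();              /* normal dirty-flag driven path */
        return 0;
}
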
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 9e3f441e7e84..8cc1e640f485 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c | |||
@@ -417,6 +417,7 @@ static enum drbd_fencing_p highest_fencing_policy(struct drbd_tconn *tconn) | |||
417 | 417 | ||
418 | bool conn_try_outdate_peer(struct drbd_tconn *tconn) | 418 | bool conn_try_outdate_peer(struct drbd_tconn *tconn) |
419 | { | 419 | { |
420 | unsigned int connect_cnt; | ||
420 | union drbd_state mask = { }; | 421 | union drbd_state mask = { }; |
421 | union drbd_state val = { }; | 422 | union drbd_state val = { }; |
422 | enum drbd_fencing_p fp; | 423 | enum drbd_fencing_p fp; |
@@ -428,6 +429,10 @@ bool conn_try_outdate_peer(struct drbd_tconn *tconn) | |||
428 | return false; | 429 | return false; |
429 | } | 430 | } |
430 | 431 | ||
432 | spin_lock_irq(&tconn->req_lock); | ||
433 | connect_cnt = tconn->connect_cnt; | ||
434 | spin_unlock_irq(&tconn->req_lock); | ||
435 | |||
431 | fp = highest_fencing_policy(tconn); | 436 | fp = highest_fencing_policy(tconn); |
432 | switch (fp) { | 437 | switch (fp) { |
433 | case FP_NOT_AVAIL: | 438 | case FP_NOT_AVAIL: |
@@ -492,8 +497,14 @@ bool conn_try_outdate_peer(struct drbd_tconn *tconn) | |||
492 | here, because we might were able to re-establish the connection in the | 497 | here, because we might were able to re-establish the connection in the |
493 | meantime. */ | 498 | meantime. */ |
494 | spin_lock_irq(&tconn->req_lock); | 499 | spin_lock_irq(&tconn->req_lock); |
495 | if (tconn->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &tconn->flags)) | 500 | if (tconn->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &tconn->flags)) { |
496 | _conn_request_state(tconn, mask, val, CS_VERBOSE); | 501 | if (tconn->connect_cnt != connect_cnt) |
502 | /* In case the connection was established and droped | ||
503 | while the fence-peer handler was running, ignore it */ | ||
504 | conn_info(tconn, "Ignoring fence-peer exit code\n"); | ||
505 | else | ||
506 | _conn_request_state(tconn, mask, val, CS_VERBOSE); | ||
507 | } | ||
497 | spin_unlock_irq(&tconn->req_lock); | 508 | spin_unlock_irq(&tconn->req_lock); |
498 | 509 | ||
499 | return conn_highest_pdsk(tconn) <= D_OUTDATED; | 510 | return conn_highest_pdsk(tconn) <= D_OUTDATED; |
@@ -816,15 +827,20 @@ void drbd_resume_io(struct drbd_conf *mdev) | |||
816 | * Returns 0 on success, negative return values indicate errors. | 827 | * Returns 0 on success, negative return values indicate errors. |
817 | * You should call drbd_md_sync() after calling this function. | 828 | * You should call drbd_md_sync() after calling this function. |
818 | */ | 829 | */ |
819 | enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local) | 830 | enum determine_dev_size |
831 | drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags, struct resize_parms *rs) __must_hold(local) | ||
820 | { | 832 | { |
821 | sector_t prev_first_sect, prev_size; /* previous meta location */ | 833 | sector_t prev_first_sect, prev_size; /* previous meta location */ |
822 | sector_t la_size_sect, u_size; | 834 | sector_t la_size_sect, u_size; |
835 | struct drbd_md *md = &mdev->ldev->md; | ||
836 | u32 prev_al_stripe_size_4k; | ||
837 | u32 prev_al_stripes; | ||
823 | sector_t size; | 838 | sector_t size; |
824 | char ppb[10]; | 839 | char ppb[10]; |
840 | void *buffer; | ||
825 | 841 | ||
826 | int md_moved, la_size_changed; | 842 | int md_moved, la_size_changed; |
827 | enum determine_dev_size rv = unchanged; | 843 | enum determine_dev_size rv = DS_UNCHANGED; |
828 | 844 | ||
829 | /* race: | 845 | /* race: |
830 | * application request passes inc_ap_bio, | 846 | * application request passes inc_ap_bio, |
@@ -836,6 +852,11 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds | |||
836 | * still lock the act_log to not trigger ASSERTs there. | 852 | * still lock the act_log to not trigger ASSERTs there. |
837 | */ | 853 | */ |
838 | drbd_suspend_io(mdev); | 854 | drbd_suspend_io(mdev); |
855 | buffer = drbd_md_get_buffer(mdev); /* Lock meta-data IO */ | ||
856 | if (!buffer) { | ||
857 | drbd_resume_io(mdev); | ||
858 | return DS_ERROR; | ||
859 | } | ||
839 | 860 | ||
840 | /* no wait necessary anymore, actually we could assert that */ | 861 | /* no wait necessary anymore, actually we could assert that */ |
841 | wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); | 862 | wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); |
@@ -844,7 +865,17 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds | |||
844 | prev_size = mdev->ldev->md.md_size_sect; | 865 | prev_size = mdev->ldev->md.md_size_sect; |
845 | la_size_sect = mdev->ldev->md.la_size_sect; | 866 | la_size_sect = mdev->ldev->md.la_size_sect; |
846 | 867 | ||
847 | /* TODO: should only be some assert here, not (re)init... */ | 868 | if (rs) { |
869 | /* rs is non NULL if we should change the AL layout only */ | ||
870 | |||
871 | prev_al_stripes = md->al_stripes; | ||
872 | prev_al_stripe_size_4k = md->al_stripe_size_4k; | ||
873 | |||
874 | md->al_stripes = rs->al_stripes; | ||
875 | md->al_stripe_size_4k = rs->al_stripe_size / 4; | ||
876 | md->al_size_4k = (u64)rs->al_stripes * rs->al_stripe_size / 4; | ||
877 | } | ||
878 | |||
848 | drbd_md_set_sector_offsets(mdev, mdev->ldev); | 879 | drbd_md_set_sector_offsets(mdev, mdev->ldev); |
849 | 880 | ||
850 | rcu_read_lock(); | 881 | rcu_read_lock(); |
@@ -852,6 +883,21 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds | |||
852 | rcu_read_unlock(); | 883 | rcu_read_unlock(); |
853 | size = drbd_new_dev_size(mdev, mdev->ldev, u_size, flags & DDSF_FORCED); | 884 | size = drbd_new_dev_size(mdev, mdev->ldev, u_size, flags & DDSF_FORCED); |
854 | 885 | ||
886 | if (size < la_size_sect) { | ||
887 | if (rs && u_size == 0) { | ||
888 | /* Remove "rs &&" later. This check should always be active, but | ||
889 | right now the receiver expects the permissive behavior */ | ||
890 | dev_warn(DEV, "Implicit shrink not allowed. " | ||
891 | "Use --size=%llus for explicit shrink.\n", | ||
892 | (unsigned long long)size); | ||
893 | rv = DS_ERROR_SHRINK; | ||
894 | } | ||
895 | if (u_size > size) | ||
896 | rv = DS_ERROR_SPACE_MD; | ||
897 | if (rv != DS_UNCHANGED) | ||
898 | goto err_out; | ||
899 | } | ||
900 | |||
855 | if (drbd_get_capacity(mdev->this_bdev) != size || | 901 | if (drbd_get_capacity(mdev->this_bdev) != size || |
856 | drbd_bm_capacity(mdev) != size) { | 902 | drbd_bm_capacity(mdev) != size) { |
857 | int err; | 903 | int err; |
@@ -867,7 +913,7 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds | |||
867 | "Leaving size unchanged at size = %lu KB\n", | 913 | "Leaving size unchanged at size = %lu KB\n", |
868 | (unsigned long)size); | 914 | (unsigned long)size); |
869 | } | 915 | } |
870 | rv = dev_size_error; | 916 | rv = DS_ERROR; |
871 | } | 917 | } |
872 | /* racy, see comments above. */ | 918 | /* racy, see comments above. */ |
873 | drbd_set_my_capacity(mdev, size); | 919 | drbd_set_my_capacity(mdev, size); |
@@ -875,38 +921,57 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds | |||
875 | dev_info(DEV, "size = %s (%llu KB)\n", ppsize(ppb, size>>1), | 921 | dev_info(DEV, "size = %s (%llu KB)\n", ppsize(ppb, size>>1), |
876 | (unsigned long long)size>>1); | 922 | (unsigned long long)size>>1); |
877 | } | 923 | } |
878 | if (rv == dev_size_error) | 924 | if (rv <= DS_ERROR) |
879 | goto out; | 925 | goto err_out; |
880 | 926 | ||
881 | la_size_changed = (la_size_sect != mdev->ldev->md.la_size_sect); | 927 | la_size_changed = (la_size_sect != mdev->ldev->md.la_size_sect); |
882 | 928 | ||
883 | md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev) | 929 | md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev) |
884 | || prev_size != mdev->ldev->md.md_size_sect; | 930 | || prev_size != mdev->ldev->md.md_size_sect; |
885 | 931 | ||
886 | if (la_size_changed || md_moved) { | 932 | if (la_size_changed || md_moved || rs) { |
887 | int err; | 933 | u32 prev_flags; |
888 | 934 | ||
889 | drbd_al_shrink(mdev); /* All extents inactive. */ | 935 | drbd_al_shrink(mdev); /* All extents inactive. */ |
936 | |||
937 | prev_flags = md->flags; | ||
938 | md->flags &= ~MDF_PRIMARY_IND; | ||
939 | drbd_md_write(mdev, buffer); | ||
940 | |||
890 | dev_info(DEV, "Writing the whole bitmap, %s\n", | 941 | dev_info(DEV, "Writing the whole bitmap, %s\n", |
891 | la_size_changed && md_moved ? "size changed and md moved" : | 942 | la_size_changed && md_moved ? "size changed and md moved" : |
892 | la_size_changed ? "size changed" : "md moved"); | 943 | la_size_changed ? "size changed" : "md moved"); |
893 | /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */ | 944 | /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */ |
894 | err = drbd_bitmap_io(mdev, md_moved ? &drbd_bm_write_all : &drbd_bm_write, | 945 | drbd_bitmap_io(mdev, md_moved ? &drbd_bm_write_all : &drbd_bm_write, |
895 | "size changed", BM_LOCKED_MASK); | 946 | "size changed", BM_LOCKED_MASK); |
896 | if (err) { | 947 | drbd_initialize_al(mdev, buffer); |
897 | rv = dev_size_error; | 948 | |
898 | goto out; | 949 | md->flags = prev_flags; |
899 | } | 950 | drbd_md_write(mdev, buffer); |
900 | drbd_md_mark_dirty(mdev); | 951 | |
952 | if (rs) | ||
953 | dev_info(DEV, "Changed AL layout to al-stripes = %d, al-stripe-size-kB = %d\n", | ||
954 | md->al_stripes, md->al_stripe_size_4k * 4); | ||
901 | } | 955 | } |
902 | 956 | ||
903 | if (size > la_size_sect) | 957 | if (size > la_size_sect) |
904 | rv = grew; | 958 | rv = DS_GREW; |
905 | if (size < la_size_sect) | 959 | if (size < la_size_sect) |
906 | rv = shrunk; | 960 | rv = DS_SHRUNK; |
907 | out: | 961 | |
962 | if (0) { | ||
963 | err_out: | ||
964 | if (rs) { | ||
965 | md->al_stripes = prev_al_stripes; | ||
966 | md->al_stripe_size_4k = prev_al_stripe_size_4k; | ||
967 | md->al_size_4k = (u64)prev_al_stripes * prev_al_stripe_size_4k; | ||
968 | |||
969 | drbd_md_set_sector_offsets(mdev, mdev->ldev); | ||
970 | } | ||
971 | } | ||
908 | lc_unlock(mdev->act_log); | 972 | lc_unlock(mdev->act_log); |
909 | wake_up(&mdev->al_wait); | 973 | wake_up(&mdev->al_wait); |
974 | drbd_md_put_buffer(mdev); | ||
910 | drbd_resume_io(mdev); | 975 | drbd_resume_io(mdev); |
911 | 976 | ||
912 | return rv; | 977 | return rv; |
@@ -1607,11 +1672,11 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) | |||
1607 | !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND)) | 1672 | !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND)) |
1608 | set_bit(USE_DEGR_WFC_T, &mdev->flags); | 1673 | set_bit(USE_DEGR_WFC_T, &mdev->flags); |
1609 | 1674 | ||
1610 | dd = drbd_determine_dev_size(mdev, 0); | 1675 | dd = drbd_determine_dev_size(mdev, 0, NULL); |
1611 | if (dd == dev_size_error) { | 1676 | if (dd <= DS_ERROR) { |
1612 | retcode = ERR_NOMEM_BITMAP; | 1677 | retcode = ERR_NOMEM_BITMAP; |
1613 | goto force_diskless_dec; | 1678 | goto force_diskless_dec; |
1614 | } else if (dd == grew) | 1679 | } else if (dd == DS_GREW) |
1615 | set_bit(RESYNC_AFTER_NEG, &mdev->flags); | 1680 | set_bit(RESYNC_AFTER_NEG, &mdev->flags); |
1616 | 1681 | ||
1617 | if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC) || | 1682 | if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC) || |
@@ -2305,6 +2370,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) | |||
2305 | struct drbd_conf *mdev; | 2370 | struct drbd_conf *mdev; |
2306 | enum drbd_ret_code retcode; | 2371 | enum drbd_ret_code retcode; |
2307 | enum determine_dev_size dd; | 2372 | enum determine_dev_size dd; |
2373 | bool change_al_layout = false; | ||
2308 | enum dds_flags ddsf; | 2374 | enum dds_flags ddsf; |
2309 | sector_t u_size; | 2375 | sector_t u_size; |
2310 | int err; | 2376 | int err; |
@@ -2315,31 +2381,33 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) | |||
2315 | if (retcode != NO_ERROR) | 2381 | if (retcode != NO_ERROR) |
2316 | goto fail; | 2382 | goto fail; |
2317 | 2383 | ||
2384 | mdev = adm_ctx.mdev; | ||
2385 | if (!get_ldev(mdev)) { | ||
2386 | retcode = ERR_NO_DISK; | ||
2387 | goto fail; | ||
2388 | } | ||
2389 | |||
2318 | memset(&rs, 0, sizeof(struct resize_parms)); | 2390 | memset(&rs, 0, sizeof(struct resize_parms)); |
2391 | rs.al_stripes = mdev->ldev->md.al_stripes; | ||
2392 | rs.al_stripe_size = mdev->ldev->md.al_stripe_size_4k * 4; | ||
2319 | if (info->attrs[DRBD_NLA_RESIZE_PARMS]) { | 2393 | if (info->attrs[DRBD_NLA_RESIZE_PARMS]) { |
2320 | err = resize_parms_from_attrs(&rs, info); | 2394 | err = resize_parms_from_attrs(&rs, info); |
2321 | if (err) { | 2395 | if (err) { |
2322 | retcode = ERR_MANDATORY_TAG; | 2396 | retcode = ERR_MANDATORY_TAG; |
2323 | drbd_msg_put_info(from_attrs_err_to_txt(err)); | 2397 | drbd_msg_put_info(from_attrs_err_to_txt(err)); |
2324 | goto fail; | 2398 | goto fail_ldev; |
2325 | } | 2399 | } |
2326 | } | 2400 | } |
2327 | 2401 | ||
2328 | mdev = adm_ctx.mdev; | ||
2329 | if (mdev->state.conn > C_CONNECTED) { | 2402 | if (mdev->state.conn > C_CONNECTED) { |
2330 | retcode = ERR_RESIZE_RESYNC; | 2403 | retcode = ERR_RESIZE_RESYNC; |
2331 | goto fail; | 2404 | goto fail_ldev; |
2332 | } | 2405 | } |
2333 | 2406 | ||
2334 | if (mdev->state.role == R_SECONDARY && | 2407 | if (mdev->state.role == R_SECONDARY && |
2335 | mdev->state.peer == R_SECONDARY) { | 2408 | mdev->state.peer == R_SECONDARY) { |
2336 | retcode = ERR_NO_PRIMARY; | 2409 | retcode = ERR_NO_PRIMARY; |
2337 | goto fail; | 2410 | goto fail_ldev; |
2338 | } | ||
2339 | |||
2340 | if (!get_ldev(mdev)) { | ||
2341 | retcode = ERR_NO_DISK; | ||
2342 | goto fail; | ||
2343 | } | 2411 | } |
2344 | 2412 | ||
2345 | if (rs.no_resync && mdev->tconn->agreed_pro_version < 93) { | 2413 | if (rs.no_resync && mdev->tconn->agreed_pro_version < 93) { |
@@ -2358,6 +2426,28 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) | |||
2358 | } | 2426 | } |
2359 | } | 2427 | } |
2360 | 2428 | ||
2429 | if (mdev->ldev->md.al_stripes != rs.al_stripes || | ||
2430 | mdev->ldev->md.al_stripe_size_4k != rs.al_stripe_size / 4) { | ||
2431 | u32 al_size_k = rs.al_stripes * rs.al_stripe_size; | ||
2432 | |||
2433 | if (al_size_k > (16 * 1024 * 1024)) { | ||
2434 | retcode = ERR_MD_LAYOUT_TOO_BIG; | ||
2435 | goto fail_ldev; | ||
2436 | } | ||
2437 | |||
2438 | if (al_size_k < MD_32kB_SECT/2) { | ||
2439 | retcode = ERR_MD_LAYOUT_TOO_SMALL; | ||
2440 | goto fail_ldev; | ||
2441 | } | ||
2442 | |||
2443 | if (mdev->state.conn != C_CONNECTED) { | ||
2444 | retcode = ERR_MD_LAYOUT_CONNECTED; | ||
2445 | goto fail_ldev; | ||
2446 | } | ||
2447 | |||
2448 | change_al_layout = true; | ||
2449 | } | ||
2450 | |||
2361 | if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) | 2451 | if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) |
2362 | mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); | 2452 | mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); |
2363 | 2453 | ||
@@ -2373,16 +2463,22 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) | |||
2373 | } | 2463 | } |
2374 | 2464 | ||
2375 | ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0); | 2465 | ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0); |
2376 | dd = drbd_determine_dev_size(mdev, ddsf); | 2466 | dd = drbd_determine_dev_size(mdev, ddsf, change_al_layout ? &rs : NULL); |
2377 | drbd_md_sync(mdev); | 2467 | drbd_md_sync(mdev); |
2378 | put_ldev(mdev); | 2468 | put_ldev(mdev); |
2379 | if (dd == dev_size_error) { | 2469 | if (dd == DS_ERROR) { |
2380 | retcode = ERR_NOMEM_BITMAP; | 2470 | retcode = ERR_NOMEM_BITMAP; |
2381 | goto fail; | 2471 | goto fail; |
2472 | } else if (dd == DS_ERROR_SPACE_MD) { | ||
2473 | retcode = ERR_MD_LAYOUT_NO_FIT; | ||
2474 | goto fail; | ||
2475 | } else if (dd == DS_ERROR_SHRINK) { | ||
2476 | retcode = ERR_IMPLICIT_SHRINK; | ||
2477 | goto fail; | ||
2382 | } | 2478 | } |
2383 | 2479 | ||
2384 | if (mdev->state.conn == C_CONNECTED) { | 2480 | if (mdev->state.conn == C_CONNECTED) { |
2385 | if (dd == grew) | 2481 | if (dd == DS_GREW) |
2386 | set_bit(RESIZE_PENDING, &mdev->flags); | 2482 | set_bit(RESIZE_PENDING, &mdev->flags); |
2387 | 2483 | ||
2388 | drbd_send_uuids(mdev); | 2484 | drbd_send_uuids(mdev); |
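The activity-log layout checks added above only let the resize change the on-disk layout when the total AL size falls inside a fixed window and the peer is connected. A standalone sketch of that bounds arithmetic, assuming al_stripe_size is given in KiB and that MD_32kB_SECT is 32 KiB counted in 512-byte sectors (so MD_32kB_SECT/2 works out to 32 KiB); this is an illustration, not the in-kernel check:

/* Illustrative bounds check mirroring the window enforced above. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define AL_SIZE_MIN_KB  32u                    /* assumed MD_32kB_SECT / 2 */
#define AL_SIZE_MAX_KB  (16u * 1024u * 1024u)  /* 16 GiB expressed in KiB */

static bool al_layout_in_bounds(uint32_t al_stripes, uint32_t al_stripe_size_kb)
{
        /* widen before multiplying so a large request cannot wrap around */
        uint64_t al_size_kb = (uint64_t)al_stripes * al_stripe_size_kb;

        return al_size_kb >= AL_SIZE_MIN_KB && al_size_kb <= AL_SIZE_MAX_KB;
}

int main(void)
{
        /* 7 stripes of 4096 KiB gives 28 MiB, comfortably inside both bounds */
        printf("%d\n", al_layout_in_bounds(7, 4096));
        return 0;
}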
@@ -2658,7 +2754,6 @@ int nla_put_status_info(struct sk_buff *skb, struct drbd_conf *mdev, | |||
2658 | const struct sib_info *sib) | 2754 | const struct sib_info *sib) |
2659 | { | 2755 | { |
2660 | struct state_info *si = NULL; /* for sizeof(si->member); */ | 2756 | struct state_info *si = NULL; /* for sizeof(si->member); */ |
2661 | struct net_conf *nc; | ||
2662 | struct nlattr *nla; | 2757 | struct nlattr *nla; |
2663 | int got_ldev; | 2758 | int got_ldev; |
2664 | int err = 0; | 2759 | int err = 0; |
@@ -2688,13 +2783,19 @@ int nla_put_status_info(struct sk_buff *skb, struct drbd_conf *mdev, | |||
2688 | goto nla_put_failure; | 2783 | goto nla_put_failure; |
2689 | 2784 | ||
2690 | rcu_read_lock(); | 2785 | rcu_read_lock(); |
2691 | if (got_ldev) | 2786 | if (got_ldev) { |
2692 | if (disk_conf_to_skb(skb, rcu_dereference(mdev->ldev->disk_conf), exclude_sensitive)) | 2787 | struct disk_conf *disk_conf; |
2693 | goto nla_put_failure; | ||
2694 | 2788 | ||
2695 | nc = rcu_dereference(mdev->tconn->net_conf); | 2789 | disk_conf = rcu_dereference(mdev->ldev->disk_conf); |
2696 | if (nc) | 2790 | err = disk_conf_to_skb(skb, disk_conf, exclude_sensitive); |
2697 | err = net_conf_to_skb(skb, nc, exclude_sensitive); | 2791 | } |
2792 | if (!err) { | ||
2793 | struct net_conf *nc; | ||
2794 | |||
2795 | nc = rcu_dereference(mdev->tconn->net_conf); | ||
2796 | if (nc) | ||
2797 | err = net_conf_to_skb(skb, nc, exclude_sensitive); | ||
2798 | } | ||
2698 | rcu_read_unlock(); | 2799 | rcu_read_unlock(); |
2699 | if (err) | 2800 | if (err) |
2700 | goto nla_put_failure; | 2801 | goto nla_put_failure; |
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 4222affff488..cc29cd3bf78b 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c | |||
@@ -1039,6 +1039,8 @@ randomize: | |||
1039 | rcu_read_lock(); | 1039 | rcu_read_lock(); |
1040 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { | 1040 | idr_for_each_entry(&tconn->volumes, mdev, vnr) { |
1041 | kref_get(&mdev->kref); | 1041 | kref_get(&mdev->kref); |
1042 | rcu_read_unlock(); | ||
1043 | |||
1042 | /* Prevent a race between resync-handshake and | 1044 | /* Prevent a race between resync-handshake and |
1043 | * being promoted to Primary. | 1045 | * being promoted to Primary. |
1044 | * | 1046 | * |
@@ -1049,8 +1051,6 @@ randomize: | |||
1049 | mutex_lock(mdev->state_mutex); | 1051 | mutex_lock(mdev->state_mutex); |
1050 | mutex_unlock(mdev->state_mutex); | 1052 | mutex_unlock(mdev->state_mutex); |
1051 | 1053 | ||
1052 | rcu_read_unlock(); | ||
1053 | |||
1054 | if (discard_my_data) | 1054 | if (discard_my_data) |
1055 | set_bit(DISCARD_MY_DATA, &mdev->flags); | 1055 | set_bit(DISCARD_MY_DATA, &mdev->flags); |
1056 | else | 1056 | else |
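The reordering above drops the RCU read lock before the state_mutex lock/unlock pair, in line with the rule that a sleeping lock must not be taken inside an RCU read-side critical section; the kref obtained just before leaving the section is what keeps the device alive across the sleep. A minimal sketch of that pattern, using a hypothetical my_obj type rather than DRBD's structures:

/* Sketch only: hypothetical object, published elsewhere with its kref initialised. */
#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_obj {
        struct kref kref;
        struct mutex lock;
};

static struct my_obj __rcu *global_obj;

static void my_obj_release(struct kref *kref)
{
        kfree(container_of(kref, struct my_obj, kref));
}

static void my_obj_use(void)
{
        struct my_obj *obj;

        rcu_read_lock();
        obj = rcu_dereference(global_obj);      /* lookup under the read lock */
        if (!obj) {
                rcu_read_unlock();
                return;
        }
        kref_get(&obj->kref);   /* pin the object before leaving the section */
        rcu_read_unlock();      /* must be dropped before any sleeping lock */

        mutex_lock(&obj->lock); /* may sleep; illegal under rcu_read_lock() */
        mutex_unlock(&obj->lock);

        kref_put(&obj->kref, my_obj_release);
}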
@@ -3545,7 +3545,7 @@ static int receive_sizes(struct drbd_tconn *tconn, struct packet_info *pi) | |||
3545 | { | 3545 | { |
3546 | struct drbd_conf *mdev; | 3546 | struct drbd_conf *mdev; |
3547 | struct p_sizes *p = pi->data; | 3547 | struct p_sizes *p = pi->data; |
3548 | enum determine_dev_size dd = unchanged; | 3548 | enum determine_dev_size dd = DS_UNCHANGED; |
3549 | sector_t p_size, p_usize, my_usize; | 3549 | sector_t p_size, p_usize, my_usize; |
3550 | int ldsc = 0; /* local disk size changed */ | 3550 | int ldsc = 0; /* local disk size changed */ |
3551 | enum dds_flags ddsf; | 3551 | enum dds_flags ddsf; |
@@ -3617,9 +3617,9 @@ static int receive_sizes(struct drbd_tconn *tconn, struct packet_info *pi) | |||
3617 | 3617 | ||
3618 | ddsf = be16_to_cpu(p->dds_flags); | 3618 | ddsf = be16_to_cpu(p->dds_flags); |
3619 | if (get_ldev(mdev)) { | 3619 | if (get_ldev(mdev)) { |
3620 | dd = drbd_determine_dev_size(mdev, ddsf); | 3620 | dd = drbd_determine_dev_size(mdev, ddsf, NULL); |
3621 | put_ldev(mdev); | 3621 | put_ldev(mdev); |
3622 | if (dd == dev_size_error) | 3622 | if (dd == DS_ERROR) |
3623 | return -EIO; | 3623 | return -EIO; |
3624 | drbd_md_sync(mdev); | 3624 | drbd_md_sync(mdev); |
3625 | } else { | 3625 | } else { |
@@ -3647,7 +3647,7 @@ static int receive_sizes(struct drbd_tconn *tconn, struct packet_info *pi) | |||
3647 | drbd_send_sizes(mdev, 0, ddsf); | 3647 | drbd_send_sizes(mdev, 0, ddsf); |
3648 | } | 3648 | } |
3649 | if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) || | 3649 | if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) || |
3650 | (dd == grew && mdev->state.conn == C_CONNECTED)) { | 3650 | (dd == DS_GREW && mdev->state.conn == C_CONNECTED)) { |
3651 | if (mdev->state.pdsk >= D_INCONSISTENT && | 3651 | if (mdev->state.pdsk >= D_INCONSISTENT && |
3652 | mdev->state.disk >= D_INCONSISTENT) { | 3652 | mdev->state.disk >= D_INCONSISTENT) { |
3653 | if (ddsf & DDSF_NO_RESYNC) | 3653 | if (ddsf & DDSF_NO_RESYNC) |
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 90c5be2b1d30..216d47b7e88b 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c | |||
@@ -1115,8 +1115,10 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, | |||
1115 | drbd_thread_restart_nowait(&mdev->tconn->receiver); | 1115 | drbd_thread_restart_nowait(&mdev->tconn->receiver); |
1116 | 1116 | ||
1117 | /* Resume AL writing if we get a connection */ | 1117 | /* Resume AL writing if we get a connection */ |
1118 | if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) | 1118 | if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) { |
1119 | drbd_resume_al(mdev); | 1119 | drbd_resume_al(mdev); |
1120 | mdev->tconn->connect_cnt++; | ||
1121 | } | ||
1120 | 1122 | ||
1121 | /* remember last attach time so request_timer_fn() won't | 1123 | /* remember last attach time so request_timer_fn() won't |
1122 | * kill newly established sessions while we are still trying to thaw | 1124 | * kill newly established sessions while we are still trying to thaw |
diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c index 5af21f2db29c..6e85e21445eb 100644 --- a/drivers/block/rsxx/core.c +++ b/drivers/block/rsxx/core.c | |||
@@ -31,6 +31,8 @@ | |||
31 | #include <linux/slab.h> | 31 | #include <linux/slab.h> |
32 | #include <linux/bitops.h> | 32 | #include <linux/bitops.h> |
33 | #include <linux/delay.h> | 33 | #include <linux/delay.h> |
34 | #include <linux/debugfs.h> | ||
35 | #include <linux/seq_file.h> | ||
34 | 36 | ||
35 | #include <linux/genhd.h> | 37 | #include <linux/genhd.h> |
36 | #include <linux/idr.h> | 38 | #include <linux/idr.h> |
@@ -39,8 +41,9 @@ | |||
39 | #include "rsxx_cfg.h" | 41 | #include "rsxx_cfg.h" |
40 | 42 | ||
41 | #define NO_LEGACY 0 | 43 | #define NO_LEGACY 0 |
44 | #define SYNC_START_TIMEOUT (10 * 60) /* 10 minutes */ | ||
42 | 45 | ||
43 | MODULE_DESCRIPTION("IBM FlashSystem 70/80 PCIe SSD Device Driver"); | 46 | MODULE_DESCRIPTION("IBM Flash Adapter 900GB Full Height Device Driver"); |
44 | MODULE_AUTHOR("Joshua Morris/Philip Kelleher, IBM"); | 47 | MODULE_AUTHOR("Joshua Morris/Philip Kelleher, IBM"); |
45 | MODULE_LICENSE("GPL"); | 48 | MODULE_LICENSE("GPL"); |
46 | MODULE_VERSION(DRIVER_VERSION); | 49 | MODULE_VERSION(DRIVER_VERSION); |
@@ -49,9 +52,282 @@ static unsigned int force_legacy = NO_LEGACY; | |||
49 | module_param(force_legacy, uint, 0444); | 52 | module_param(force_legacy, uint, 0444); |
50 | MODULE_PARM_DESC(force_legacy, "Force the use of legacy type PCI interrupts"); | 53 | MODULE_PARM_DESC(force_legacy, "Force the use of legacy type PCI interrupts"); |
51 | 54 | ||
55 | static unsigned int sync_start = 1; | ||
56 | module_param(sync_start, uint, 0444); | ||
57 | MODULE_PARM_DESC(sync_start, "On by Default: Driver load will not complete " | ||
58 | "until the card startup has completed."); | ||
59 | |||
52 | static DEFINE_IDA(rsxx_disk_ida); | 60 | static DEFINE_IDA(rsxx_disk_ida); |
53 | static DEFINE_SPINLOCK(rsxx_ida_lock); | 61 | static DEFINE_SPINLOCK(rsxx_ida_lock); |
54 | 62 | ||
63 | /* --------------------Debugfs Setup ------------------- */ | ||
64 | |||
65 | struct rsxx_cram { | ||
66 | u32 f_pos; | ||
67 | u32 offset; | ||
68 | void *i_private; | ||
69 | }; | ||
70 | |||
71 | static int rsxx_attr_pci_regs_show(struct seq_file *m, void *p) | ||
72 | { | ||
73 | struct rsxx_cardinfo *card = m->private; | ||
74 | |||
75 | seq_printf(m, "HWID 0x%08x\n", | ||
76 | ioread32(card->regmap + HWID)); | ||
77 | seq_printf(m, "SCRATCH 0x%08x\n", | ||
78 | ioread32(card->regmap + SCRATCH)); | ||
79 | seq_printf(m, "IER 0x%08x\n", | ||
80 | ioread32(card->regmap + IER)); | ||
81 | seq_printf(m, "IPR 0x%08x\n", | ||
82 | ioread32(card->regmap + IPR)); | ||
83 | seq_printf(m, "CREG_CMD 0x%08x\n", | ||
84 | ioread32(card->regmap + CREG_CMD)); | ||
85 | seq_printf(m, "CREG_ADD 0x%08x\n", | ||
86 | ioread32(card->regmap + CREG_ADD)); | ||
87 | seq_printf(m, "CREG_CNT 0x%08x\n", | ||
88 | ioread32(card->regmap + CREG_CNT)); | ||
89 | seq_printf(m, "CREG_STAT 0x%08x\n", | ||
90 | ioread32(card->regmap + CREG_STAT)); | ||
91 | seq_printf(m, "CREG_DATA0 0x%08x\n", | ||
92 | ioread32(card->regmap + CREG_DATA0)); | ||
93 | seq_printf(m, "CREG_DATA1 0x%08x\n", | ||
94 | ioread32(card->regmap + CREG_DATA1)); | ||
95 | seq_printf(m, "CREG_DATA2 0x%08x\n", | ||
96 | ioread32(card->regmap + CREG_DATA2)); | ||
97 | seq_printf(m, "CREG_DATA3 0x%08x\n", | ||
98 | ioread32(card->regmap + CREG_DATA3)); | ||
99 | seq_printf(m, "CREG_DATA4 0x%08x\n", | ||
100 | ioread32(card->regmap + CREG_DATA4)); | ||
101 | seq_printf(m, "CREG_DATA5 0x%08x\n", | ||
102 | ioread32(card->regmap + CREG_DATA5)); | ||
103 | seq_printf(m, "CREG_DATA6 0x%08x\n", | ||
104 | ioread32(card->regmap + CREG_DATA6)); | ||
105 | seq_printf(m, "CREG_DATA7 0x%08x\n", | ||
106 | ioread32(card->regmap + CREG_DATA7)); | ||
107 | seq_printf(m, "INTR_COAL 0x%08x\n", | ||
108 | ioread32(card->regmap + INTR_COAL)); | ||
109 | seq_printf(m, "HW_ERROR 0x%08x\n", | ||
110 | ioread32(card->regmap + HW_ERROR)); | ||
111 | seq_printf(m, "DEBUG0 0x%08x\n", | ||
112 | ioread32(card->regmap + PCI_DEBUG0)); | ||
113 | seq_printf(m, "DEBUG1 0x%08x\n", | ||
114 | ioread32(card->regmap + PCI_DEBUG1)); | ||
115 | seq_printf(m, "DEBUG2 0x%08x\n", | ||
116 | ioread32(card->regmap + PCI_DEBUG2)); | ||
117 | seq_printf(m, "DEBUG3 0x%08x\n", | ||
118 | ioread32(card->regmap + PCI_DEBUG3)); | ||
119 | seq_printf(m, "DEBUG4 0x%08x\n", | ||
120 | ioread32(card->regmap + PCI_DEBUG4)); | ||
121 | seq_printf(m, "DEBUG5 0x%08x\n", | ||
122 | ioread32(card->regmap + PCI_DEBUG5)); | ||
123 | seq_printf(m, "DEBUG6 0x%08x\n", | ||
124 | ioread32(card->regmap + PCI_DEBUG6)); | ||
125 | seq_printf(m, "DEBUG7 0x%08x\n", | ||
126 | ioread32(card->regmap + PCI_DEBUG7)); | ||
127 | seq_printf(m, "RECONFIG 0x%08x\n", | ||
128 | ioread32(card->regmap + PCI_RECONFIG)); | ||
129 | |||
130 | return 0; | ||
131 | } | ||
132 | |||
133 | static int rsxx_attr_stats_show(struct seq_file *m, void *p) | ||
134 | { | ||
135 | struct rsxx_cardinfo *card = m->private; | ||
136 | int i; | ||
137 | |||
138 | for (i = 0; i < card->n_targets; i++) { | ||
139 | seq_printf(m, "Ctrl %d CRC Errors = %d\n", | ||
140 | i, card->ctrl[i].stats.crc_errors); | ||
141 | seq_printf(m, "Ctrl %d Hard Errors = %d\n", | ||
142 | i, card->ctrl[i].stats.hard_errors); | ||
143 | seq_printf(m, "Ctrl %d Soft Errors = %d\n", | ||
144 | i, card->ctrl[i].stats.soft_errors); | ||
145 | seq_printf(m, "Ctrl %d Writes Issued = %d\n", | ||
146 | i, card->ctrl[i].stats.writes_issued); | ||
147 | seq_printf(m, "Ctrl %d Writes Failed = %d\n", | ||
148 | i, card->ctrl[i].stats.writes_failed); | ||
149 | seq_printf(m, "Ctrl %d Reads Issued = %d\n", | ||
150 | i, card->ctrl[i].stats.reads_issued); | ||
151 | seq_printf(m, "Ctrl %d Reads Failed = %d\n", | ||
152 | i, card->ctrl[i].stats.reads_failed); | ||
153 | seq_printf(m, "Ctrl %d Reads Retried = %d\n", | ||
154 | i, card->ctrl[i].stats.reads_retried); | ||
155 | seq_printf(m, "Ctrl %d Discards Issued = %d\n", | ||
156 | i, card->ctrl[i].stats.discards_issued); | ||
157 | seq_printf(m, "Ctrl %d Discards Failed = %d\n", | ||
158 | i, card->ctrl[i].stats.discards_failed); | ||
159 | seq_printf(m, "Ctrl %d DMA SW Errors = %d\n", | ||
160 | i, card->ctrl[i].stats.dma_sw_err); | ||
161 | seq_printf(m, "Ctrl %d DMA HW Faults = %d\n", | ||
162 | i, card->ctrl[i].stats.dma_hw_fault); | ||
163 | seq_printf(m, "Ctrl %d DMAs Cancelled = %d\n", | ||
164 | i, card->ctrl[i].stats.dma_cancelled); | ||
165 | seq_printf(m, "Ctrl %d SW Queue Depth = %d\n", | ||
166 | i, card->ctrl[i].stats.sw_q_depth); | ||
167 | seq_printf(m, "Ctrl %d HW Queue Depth = %d\n", | ||
168 | i, atomic_read(&card->ctrl[i].stats.hw_q_depth)); | ||
169 | } | ||
170 | |||
171 | return 0; | ||
172 | } | ||
173 | |||
174 | static int rsxx_attr_stats_open(struct inode *inode, struct file *file) | ||
175 | { | ||
176 | return single_open(file, rsxx_attr_stats_show, inode->i_private); | ||
177 | } | ||
178 | |||
179 | static int rsxx_attr_pci_regs_open(struct inode *inode, struct file *file) | ||
180 | { | ||
181 | return single_open(file, rsxx_attr_pci_regs_show, inode->i_private); | ||
182 | } | ||
183 | |||
184 | static ssize_t rsxx_cram_read(struct file *fp, char __user *ubuf, | ||
185 | size_t cnt, loff_t *ppos) | ||
186 | { | ||
187 | struct rsxx_cram *info = fp->private_data; | ||
188 | struct rsxx_cardinfo *card = info->i_private; | ||
189 | char *buf; | ||
190 | int st; | ||
191 | |||
192 | buf = kzalloc(sizeof(*buf) * cnt, GFP_KERNEL); | ||
193 | if (!buf) | ||
194 | return -ENOMEM; | ||
195 | |||
196 | info->f_pos = (u32)*ppos + info->offset; | ||
197 | |||
198 | st = rsxx_creg_read(card, CREG_ADD_CRAM + info->f_pos, cnt, buf, 1); | ||
199 | if (st) | ||
200 | return st; | ||
201 | |||
202 | st = copy_to_user(ubuf, buf, cnt); | ||
203 | if (st) | ||
204 | return st; | ||
205 | |||
206 | info->offset += cnt; | ||
207 | |||
208 | kfree(buf); | ||
209 | |||
210 | return cnt; | ||
211 | } | ||
212 | |||
213 | static ssize_t rsxx_cram_write(struct file *fp, const char __user *ubuf, | ||
214 | size_t cnt, loff_t *ppos) | ||
215 | { | ||
216 | struct rsxx_cram *info = fp->private_data; | ||
217 | struct rsxx_cardinfo *card = info->i_private; | ||
218 | char *buf; | ||
219 | int st; | ||
220 | |||
221 | buf = kzalloc(sizeof(*buf) * cnt, GFP_KERNEL); | ||
222 | if (!buf) | ||
223 | return -ENOMEM; | ||
224 | |||
225 | st = copy_from_user(buf, ubuf, cnt); | ||
226 | if (st) | ||
227 | return st; | ||
228 | |||
229 | info->f_pos = (u32)*ppos + info->offset; | ||
230 | |||
231 | st = rsxx_creg_write(card, CREG_ADD_CRAM + info->f_pos, cnt, buf, 1); | ||
232 | if (st) | ||
233 | return st; | ||
234 | |||
235 | info->offset += cnt; | ||
236 | |||
237 | kfree(buf); | ||
238 | |||
239 | return cnt; | ||
240 | } | ||
241 | |||
242 | static int rsxx_cram_open(struct inode *inode, struct file *file) | ||
243 | { | ||
244 | struct rsxx_cram *info = kzalloc(sizeof(*info), GFP_KERNEL); | ||
245 | if (!info) | ||
246 | return -ENOMEM; | ||
247 | |||
248 | info->i_private = inode->i_private; | ||
249 | info->f_pos = file->f_pos; | ||
250 | file->private_data = info; | ||
251 | |||
252 | return 0; | ||
253 | } | ||
254 | |||
255 | static int rsxx_cram_release(struct inode *inode, struct file *file) | ||
256 | { | ||
257 | struct rsxx_cram *info = file->private_data; | ||
258 | |||
259 | if (!info) | ||
260 | return 0; | ||
261 | |||
262 | kfree(info); | ||
263 | file->private_data = NULL; | ||
264 | |||
265 | return 0; | ||
266 | } | ||
267 | |||
268 | static const struct file_operations debugfs_cram_fops = { | ||
269 | .owner = THIS_MODULE, | ||
270 | .open = rsxx_cram_open, | ||
271 | .read = rsxx_cram_read, | ||
272 | .write = rsxx_cram_write, | ||
273 | .release = rsxx_cram_release, | ||
274 | }; | ||
275 | |||
276 | static const struct file_operations debugfs_stats_fops = { | ||
277 | .owner = THIS_MODULE, | ||
278 | .open = rsxx_attr_stats_open, | ||
279 | .read = seq_read, | ||
280 | .llseek = seq_lseek, | ||
281 | .release = single_release, | ||
282 | }; | ||
283 | |||
284 | static const struct file_operations debugfs_pci_regs_fops = { | ||
285 | .owner = THIS_MODULE, | ||
286 | .open = rsxx_attr_pci_regs_open, | ||
287 | .read = seq_read, | ||
288 | .llseek = seq_lseek, | ||
289 | .release = single_release, | ||
290 | }; | ||
291 | |||
292 | static void rsxx_debugfs_dev_new(struct rsxx_cardinfo *card) | ||
293 | { | ||
294 | struct dentry *debugfs_stats; | ||
295 | struct dentry *debugfs_pci_regs; | ||
296 | struct dentry *debugfs_cram; | ||
297 | |||
298 | card->debugfs_dir = debugfs_create_dir(card->gendisk->disk_name, NULL); | ||
299 | if (IS_ERR_OR_NULL(card->debugfs_dir)) | ||
300 | goto failed_debugfs_dir; | ||
301 | |||
302 | debugfs_stats = debugfs_create_file("stats", S_IRUGO, | ||
303 | card->debugfs_dir, card, | ||
304 | &debugfs_stats_fops); | ||
305 | if (IS_ERR_OR_NULL(debugfs_stats)) | ||
306 | goto failed_debugfs_stats; | ||
307 | |||
308 | debugfs_pci_regs = debugfs_create_file("pci_regs", S_IRUGO, | ||
309 | card->debugfs_dir, card, | ||
310 | &debugfs_pci_regs_fops); | ||
311 | if (IS_ERR_OR_NULL(debugfs_pci_regs)) | ||
312 | goto failed_debugfs_pci_regs; | ||
313 | |||
314 | debugfs_cram = debugfs_create_file("cram", S_IRUGO | S_IWUSR, | ||
315 | card->debugfs_dir, card, | ||
316 | &debugfs_cram_fops); | ||
317 | if (IS_ERR_OR_NULL(debugfs_cram)) | ||
318 | goto failed_debugfs_cram; | ||
319 | |||
320 | return; | ||
321 | failed_debugfs_cram: | ||
322 | debugfs_remove(debugfs_pci_regs); | ||
323 | failed_debugfs_pci_regs: | ||
324 | debugfs_remove(debugfs_stats); | ||
325 | failed_debugfs_stats: | ||
326 | debugfs_remove(card->debugfs_dir); | ||
327 | failed_debugfs_dir: | ||
328 | card->debugfs_dir = NULL; | ||
329 | } | ||
330 | |||
55 | /*----------------- Interrupt Control & Handling -------------------*/ | 331 | /*----------------- Interrupt Control & Handling -------------------*/ |
56 | 332 | ||
57 | static void rsxx_mask_interrupts(struct rsxx_cardinfo *card) | 333 | static void rsxx_mask_interrupts(struct rsxx_cardinfo *card) |
@@ -163,12 +439,13 @@ static irqreturn_t rsxx_isr(int irq, void *pdata) | |||
163 | } | 439 | } |
164 | 440 | ||
165 | if (isr & CR_INTR_CREG) { | 441 | if (isr & CR_INTR_CREG) { |
166 | schedule_work(&card->creg_ctrl.done_work); | 442 | queue_work(card->creg_ctrl.creg_wq, |
443 | &card->creg_ctrl.done_work); | ||
167 | handled++; | 444 | handled++; |
168 | } | 445 | } |
169 | 446 | ||
170 | if (isr & CR_INTR_EVENT) { | 447 | if (isr & CR_INTR_EVENT) { |
171 | schedule_work(&card->event_work); | 448 | queue_work(card->event_wq, &card->event_work); |
172 | rsxx_disable_ier_and_isr(card, CR_INTR_EVENT); | 449 | rsxx_disable_ier_and_isr(card, CR_INTR_EVENT); |
173 | handled++; | 450 | handled++; |
174 | } | 451 | } |
@@ -329,7 +606,7 @@ static int rsxx_eeh_frozen(struct pci_dev *dev) | |||
329 | int i; | 606 | int i; |
330 | int st; | 607 | int st; |
331 | 608 | ||
332 | dev_warn(&dev->dev, "IBM FlashSystem PCI: preparing for slot reset.\n"); | 609 | dev_warn(&dev->dev, "IBM Flash Adapter PCI: preparing for slot reset.\n"); |
333 | 610 | ||
334 | card->eeh_state = 1; | 611 | card->eeh_state = 1; |
335 | rsxx_mask_interrupts(card); | 612 | rsxx_mask_interrupts(card); |
@@ -367,15 +644,26 @@ static void rsxx_eeh_failure(struct pci_dev *dev) | |||
367 | { | 644 | { |
368 | struct rsxx_cardinfo *card = pci_get_drvdata(dev); | 645 | struct rsxx_cardinfo *card = pci_get_drvdata(dev); |
369 | int i; | 646 | int i; |
647 | int cnt = 0; | ||
370 | 648 | ||
371 | dev_err(&dev->dev, "IBM FlashSystem PCI: disabling failed card.\n"); | 649 | dev_err(&dev->dev, "IBM Flash Adapter PCI: disabling failed card.\n"); |
372 | 650 | ||
373 | card->eeh_state = 1; | 651 | card->eeh_state = 1; |
652 | card->halt = 1; | ||
374 | 653 | ||
375 | for (i = 0; i < card->n_targets; i++) | 654 | for (i = 0; i < card->n_targets; i++) { |
376 | del_timer_sync(&card->ctrl[i].activity_timer); | 655 | spin_lock_bh(&card->ctrl[i].queue_lock); |
656 | cnt = rsxx_cleanup_dma_queue(&card->ctrl[i], | ||
657 | &card->ctrl[i].queue); | ||
658 | spin_unlock_bh(&card->ctrl[i].queue_lock); | ||
659 | |||
660 | cnt += rsxx_dma_cancel(&card->ctrl[i]); | ||
377 | 661 | ||
378 | rsxx_eeh_cancel_dmas(card); | 662 | if (cnt) |
663 | dev_info(CARD_TO_DEV(card), | ||
664 | "Freed %d queued DMAs on channel %d\n", | ||
665 | cnt, card->ctrl[i].id); | ||
666 | } | ||
379 | } | 667 | } |
380 | 668 | ||
381 | static int rsxx_eeh_fifo_flush_poll(struct rsxx_cardinfo *card) | 669 | static int rsxx_eeh_fifo_flush_poll(struct rsxx_cardinfo *card) |
@@ -432,7 +720,7 @@ static pci_ers_result_t rsxx_slot_reset(struct pci_dev *dev) | |||
432 | int st; | 720 | int st; |
433 | 721 | ||
434 | dev_warn(&dev->dev, | 722 | dev_warn(&dev->dev, |
435 | "IBM FlashSystem PCI: recovering from slot reset.\n"); | 723 | "IBM Flash Adapter PCI: recovering from slot reset.\n"); |
436 | 724 | ||
437 | st = pci_enable_device(dev); | 725 | st = pci_enable_device(dev); |
438 | if (st) | 726 | if (st) |
@@ -485,7 +773,7 @@ static pci_ers_result_t rsxx_slot_reset(struct pci_dev *dev) | |||
485 | &card->ctrl[i].issue_dma_work); | 773 | &card->ctrl[i].issue_dma_work); |
486 | } | 774 | } |
487 | 775 | ||
488 | dev_info(&dev->dev, "IBM FlashSystem PCI: recovery complete.\n"); | 776 | dev_info(&dev->dev, "IBM Flash Adapter PCI: recovery complete.\n"); |
489 | 777 | ||
490 | return PCI_ERS_RESULT_RECOVERED; | 778 | return PCI_ERS_RESULT_RECOVERED; |
491 | 779 | ||
@@ -528,6 +816,7 @@ static int rsxx_pci_probe(struct pci_dev *dev, | |||
528 | { | 816 | { |
529 | struct rsxx_cardinfo *card; | 817 | struct rsxx_cardinfo *card; |
530 | int st; | 818 | int st; |
819 | unsigned int sync_timeout; | ||
531 | 820 | ||
532 | dev_info(&dev->dev, "PCI-Flash SSD discovered\n"); | 821 | dev_info(&dev->dev, "PCI-Flash SSD discovered\n"); |
533 | 822 | ||
@@ -610,7 +899,11 @@ static int rsxx_pci_probe(struct pci_dev *dev, | |||
610 | } | 899 | } |
611 | 900 | ||
612 | /************* Setup Processor Command Interface *************/ | 901 | /************* Setup Processor Command Interface *************/ |
613 | rsxx_creg_setup(card); | 902 | st = rsxx_creg_setup(card); |
903 | if (st) { | ||
904 | dev_err(CARD_TO_DEV(card), "Failed to setup creg interface.\n"); | ||
905 | goto failed_creg_setup; | ||
906 | } | ||
614 | 907 | ||
615 | spin_lock_irq(&card->irq_lock); | 908 | spin_lock_irq(&card->irq_lock); |
616 | rsxx_enable_ier_and_isr(card, CR_INTR_CREG); | 909 | rsxx_enable_ier_and_isr(card, CR_INTR_CREG); |
@@ -650,6 +943,12 @@ static int rsxx_pci_probe(struct pci_dev *dev, | |||
650 | } | 943 | } |
651 | 944 | ||
652 | /************* Setup Card Event Handler *************/ | 945 | /************* Setup Card Event Handler *************/ |
946 | card->event_wq = create_singlethread_workqueue(DRIVER_NAME"_event"); | ||
947 | if (!card->event_wq) { | ||
948 | dev_err(CARD_TO_DEV(card), "Failed card event setup.\n"); | ||
949 | goto failed_event_handler; | ||
950 | } | ||
951 | |||
653 | INIT_WORK(&card->event_work, card_event_handler); | 952 | INIT_WORK(&card->event_work, card_event_handler); |
654 | 953 | ||
655 | st = rsxx_setup_dev(card); | 954 | st = rsxx_setup_dev(card); |
@@ -676,6 +975,33 @@ static int rsxx_pci_probe(struct pci_dev *dev, | |||
676 | if (st) | 975 | if (st) |
677 | dev_crit(CARD_TO_DEV(card), | 976 | dev_crit(CARD_TO_DEV(card), |
678 | "Failed issuing card startup\n"); | 977 | "Failed issuing card startup\n"); |
978 | if (sync_start) { | ||
979 | sync_timeout = SYNC_START_TIMEOUT; | ||
980 | |||
981 | dev_info(CARD_TO_DEV(card), | ||
982 | "Waiting for card to startup\n"); | ||
983 | |||
984 | do { | ||
985 | ssleep(1); | ||
986 | sync_timeout--; | ||
987 | |||
988 | rsxx_get_card_state(card, &card->state); | ||
989 | } while (sync_timeout && | ||
990 | (card->state == CARD_STATE_STARTING)); | ||
991 | |||
992 | if (card->state == CARD_STATE_STARTING) { | ||
993 | dev_warn(CARD_TO_DEV(card), | ||
994 | "Card startup timed out\n"); | ||
995 | card->size8 = 0; | ||
996 | } else { | ||
997 | dev_info(CARD_TO_DEV(card), | ||
998 | "card state: %s\n", | ||
999 | rsxx_card_state_to_str(card->state)); | ||
1000 | st = rsxx_get_card_size8(card, &card->size8); | ||
1001 | if (st) | ||
1002 | card->size8 = 0; | ||
1003 | } | ||
1004 | } | ||
679 | } else if (card->state == CARD_STATE_GOOD || | 1005 | } else if (card->state == CARD_STATE_GOOD || |
680 | card->state == CARD_STATE_RD_ONLY_FAULT) { | 1006 | card->state == CARD_STATE_RD_ONLY_FAULT) { |
681 | st = rsxx_get_card_size8(card, &card->size8); | 1007 | st = rsxx_get_card_size8(card, &card->size8); |
@@ -685,12 +1011,21 @@ static int rsxx_pci_probe(struct pci_dev *dev, | |||
685 | 1011 | ||
686 | rsxx_attach_dev(card); | 1012 | rsxx_attach_dev(card); |
687 | 1013 | ||
1014 | /************* Setup Debugfs *************/ | ||
1015 | rsxx_debugfs_dev_new(card); | ||
1016 | |||
688 | return 0; | 1017 | return 0; |
689 | 1018 | ||
690 | failed_create_dev: | 1019 | failed_create_dev: |
1020 | destroy_workqueue(card->event_wq); | ||
1021 | card->event_wq = NULL; | ||
1022 | failed_event_handler: | ||
691 | rsxx_dma_destroy(card); | 1023 | rsxx_dma_destroy(card); |
692 | failed_dma_setup: | 1024 | failed_dma_setup: |
693 | failed_compatiblity_check: | 1025 | failed_compatiblity_check: |
1026 | destroy_workqueue(card->creg_ctrl.creg_wq); | ||
1027 | card->creg_ctrl.creg_wq = NULL; | ||
1028 | failed_creg_setup: | ||
694 | spin_lock_irq(&card->irq_lock); | 1029 | spin_lock_irq(&card->irq_lock); |
695 | rsxx_disable_ier_and_isr(card, CR_INTR_ALL); | 1030 | rsxx_disable_ier_and_isr(card, CR_INTR_ALL); |
696 | spin_unlock_irq(&card->irq_lock); | 1031 | spin_unlock_irq(&card->irq_lock); |
@@ -756,6 +1091,8 @@ static void rsxx_pci_remove(struct pci_dev *dev) | |||
756 | /* Prevent work_structs from re-queuing themselves. */ | 1091 | /* Prevent work_structs from re-queuing themselves. */ |
757 | card->halt = 1; | 1092 | card->halt = 1; |
758 | 1093 | ||
1094 | debugfs_remove_recursive(card->debugfs_dir); | ||
1095 | |||
759 | free_irq(dev->irq, card); | 1096 | free_irq(dev->irq, card); |
760 | 1097 | ||
761 | if (!force_legacy) | 1098 | if (!force_legacy) |
diff --git a/drivers/block/rsxx/cregs.c b/drivers/block/rsxx/cregs.c index 4b5c020a0a65..926dce9c452f 100644 --- a/drivers/block/rsxx/cregs.c +++ b/drivers/block/rsxx/cregs.c | |||
@@ -431,6 +431,15 @@ static int __issue_creg_rw(struct rsxx_cardinfo *card, | |||
431 | *hw_stat = completion.creg_status; | 431 | *hw_stat = completion.creg_status; |
432 | 432 | ||
433 | if (completion.st) { | 433 | if (completion.st) { |
434 | /* | ||
435 | * This read is needed to verify that there have not been any | ||
436 | * extreme errors that might have occurred, i.e. EEH. The | ||
437 | * function iowrite32 will not detect EEH errors, so it is | ||
438 | * necessary that we recover if such an error is the reason | ||
439 | * for the timeout. This is a dummy read. | ||
440 | */ | ||
441 | ioread32(card->regmap + SCRATCH); | ||
442 | |||
434 | dev_warn(CARD_TO_DEV(card), | 443 | dev_warn(CARD_TO_DEV(card), |
435 | "creg command failed(%d x%08x)\n", | 444 | "creg command failed(%d x%08x)\n", |
436 | completion.st, addr); | 445 | completion.st, addr); |
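The comment added above describes a common EEH idiom: MMIO writes are posted and will not report a frozen PCI slot, so after a timeout the driver issues a read of a harmless register purely so the platform's error-recovery machinery gets a chance to detect the failure (reads from a frozen slot typically return all ones). A minimal sketch of that idiom, with a hypothetical register offset:

/* Sketch of the "dummy read after timeout" idiom; MY_SCRATCH_REG is made up. */
#include <linux/device.h>
#include <linux/io.h>

#define MY_SCRATCH_REG  0x0c    /* any harmless, always-readable register */

static void my_check_for_frozen_slot(struct device *dev, void __iomem *regmap)
{
        u32 val;

        /* The value itself does not matter; the read forces error detection. */
        val = ioread32(regmap + MY_SCRATCH_REG);
        if (val == ~0U)
                dev_warn(dev, "register read returned all ones, slot may be frozen\n");
}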
@@ -727,6 +736,11 @@ int rsxx_creg_setup(struct rsxx_cardinfo *card) | |||
727 | { | 736 | { |
728 | card->creg_ctrl.active_cmd = NULL; | 737 | card->creg_ctrl.active_cmd = NULL; |
729 | 738 | ||
739 | card->creg_ctrl.creg_wq = | ||
740 | create_singlethread_workqueue(DRIVER_NAME"_creg"); | ||
741 | if (!card->creg_ctrl.creg_wq) | ||
742 | return -ENOMEM; | ||
743 | |||
730 | INIT_WORK(&card->creg_ctrl.done_work, creg_cmd_done); | 744 | INIT_WORK(&card->creg_ctrl.done_work, creg_cmd_done); |
731 | mutex_init(&card->creg_ctrl.reset_lock); | 745 | mutex_init(&card->creg_ctrl.reset_lock); |
732 | INIT_LIST_HEAD(&card->creg_ctrl.queue); | 746 | INIT_LIST_HEAD(&card->creg_ctrl.queue); |
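Together with the interrupt-handler change in core.c above, creg completion work is now queued on a dedicated single-threaded workqueue created during setup and destroyed on the probe error path, rather than on the shared system workqueue. A minimal sketch of that lifecycle with hypothetical names:

/* Sketch of the dedicated-workqueue lifecycle; all names are hypothetical. */
#include <linux/errno.h>
#include <linux/workqueue.h>

struct my_ctrl {
        struct workqueue_struct *wq;
        struct work_struct done_work;
};

static void my_done_fn(struct work_struct *work)
{
        /* completion handling runs here, serialised on its own kthread */
}

static int my_ctrl_setup(struct my_ctrl *ctrl)
{
        ctrl->wq = create_singlethread_workqueue("my_driver_creg");
        if (!ctrl->wq)
                return -ENOMEM;
        INIT_WORK(&ctrl->done_work, my_done_fn);
        return 0;
}

/* Called from the interrupt handler: dispatch to the private queue. */
static void my_ctrl_kick(struct my_ctrl *ctrl)
{
        queue_work(ctrl->wq, &ctrl->done_work);
}

static void my_ctrl_teardown(struct my_ctrl *ctrl)
{
        destroy_workqueue(ctrl->wq);    /* flushes any pending work first */
        ctrl->wq = NULL;
}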
diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c index 4346d17d2949..d7af441880be 100644 --- a/drivers/block/rsxx/dev.c +++ b/drivers/block/rsxx/dev.c | |||
@@ -155,7 +155,8 @@ static void bio_dma_done_cb(struct rsxx_cardinfo *card, | |||
155 | atomic_set(&meta->error, 1); | 155 | atomic_set(&meta->error, 1); |
156 | 156 | ||
157 | if (atomic_dec_and_test(&meta->pending_dmas)) { | 157 | if (atomic_dec_and_test(&meta->pending_dmas)) { |
158 | disk_stats_complete(card, meta->bio, meta->start_time); | 158 | if (!card->eeh_state && card->gendisk) |
159 | disk_stats_complete(card, meta->bio, meta->start_time); | ||
159 | 160 | ||
160 | bio_endio(meta->bio, atomic_read(&meta->error) ? -EIO : 0); | 161 | bio_endio(meta->bio, atomic_read(&meta->error) ? -EIO : 0); |
161 | kmem_cache_free(bio_meta_pool, meta); | 162 | kmem_cache_free(bio_meta_pool, meta); |
@@ -170,6 +171,12 @@ static void rsxx_make_request(struct request_queue *q, struct bio *bio) | |||
170 | 171 | ||
171 | might_sleep(); | 172 | might_sleep(); |
172 | 173 | ||
174 | if (!card) | ||
175 | goto req_err; | ||
176 | |||
177 | if (bio->bi_sector + (bio->bi_size >> 9) > get_capacity(card->gendisk)) | ||
178 | goto req_err; | ||
179 | |||
173 | if (unlikely(card->halt)) { | 180 | if (unlikely(card->halt)) { |
174 | st = -EFAULT; | 181 | st = -EFAULT; |
175 | goto req_err; | 182 | goto req_err; |
@@ -196,7 +203,8 @@ static void rsxx_make_request(struct request_queue *q, struct bio *bio) | |||
196 | atomic_set(&bio_meta->pending_dmas, 0); | 203 | atomic_set(&bio_meta->pending_dmas, 0); |
197 | bio_meta->start_time = jiffies; | 204 | bio_meta->start_time = jiffies; |
198 | 205 | ||
199 | disk_stats_start(card, bio); | 206 | if (!unlikely(card->halt)) |
207 | disk_stats_start(card, bio); | ||
200 | 208 | ||
201 | dev_dbg(CARD_TO_DEV(card), "BIO[%c]: meta: %p addr8: x%llx size: %d\n", | 209 | dev_dbg(CARD_TO_DEV(card), "BIO[%c]: meta: %p addr8: x%llx size: %d\n", |
202 | bio_data_dir(bio) ? 'W' : 'R', bio_meta, | 210 | bio_data_dir(bio) ? 'W' : 'R', bio_meta, |
@@ -225,24 +233,6 @@ static bool rsxx_discard_supported(struct rsxx_cardinfo *card) | |||
225 | return (pci_rev >= RSXX_DISCARD_SUPPORT); | 233 | return (pci_rev >= RSXX_DISCARD_SUPPORT); |
226 | } | 234 | } |
227 | 235 | ||
228 | static unsigned short rsxx_get_logical_block_size( | ||
229 | struct rsxx_cardinfo *card) | ||
230 | { | ||
231 | u32 capabilities = 0; | ||
232 | int st; | ||
233 | |||
234 | st = rsxx_get_card_capabilities(card, &capabilities); | ||
235 | if (st) | ||
236 | dev_warn(CARD_TO_DEV(card), | ||
237 | "Failed reading card capabilities register\n"); | ||
238 | |||
239 | /* Earlier firmware did not have support for 512 byte accesses */ | ||
240 | if (capabilities & CARD_CAP_SUBPAGE_WRITES) | ||
241 | return 512; | ||
242 | else | ||
243 | return RSXX_HW_BLK_SIZE; | ||
244 | } | ||
245 | |||
246 | int rsxx_attach_dev(struct rsxx_cardinfo *card) | 236 | int rsxx_attach_dev(struct rsxx_cardinfo *card) |
247 | { | 237 | { |
248 | mutex_lock(&card->dev_lock); | 238 | mutex_lock(&card->dev_lock); |
@@ -305,7 +295,7 @@ int rsxx_setup_dev(struct rsxx_cardinfo *card) | |||
305 | return -ENOMEM; | 295 | return -ENOMEM; |
306 | } | 296 | } |
307 | 297 | ||
308 | blk_size = rsxx_get_logical_block_size(card); | 298 | blk_size = card->config.data.block_size; |
309 | 299 | ||
310 | blk_queue_make_request(card->queue, rsxx_make_request); | 300 | blk_queue_make_request(card->queue, rsxx_make_request); |
311 | blk_queue_bounce_limit(card->queue, BLK_BOUNCE_ANY); | 301 | blk_queue_bounce_limit(card->queue, BLK_BOUNCE_ANY); |
@@ -347,6 +337,7 @@ void rsxx_destroy_dev(struct rsxx_cardinfo *card) | |||
347 | card->gendisk = NULL; | 337 | card->gendisk = NULL; |
348 | 338 | ||
349 | blk_cleanup_queue(card->queue); | 339 | blk_cleanup_queue(card->queue); |
340 | card->queue->queuedata = NULL; | ||
350 | unregister_blkdev(card->major, DRIVER_NAME); | 341 | unregister_blkdev(card->major, DRIVER_NAME); |
351 | } | 342 | } |
352 | 343 | ||
diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c index 0607513cfb41..bed32f16b084 100644 --- a/drivers/block/rsxx/dma.c +++ b/drivers/block/rsxx/dma.c | |||
@@ -245,6 +245,22 @@ static void rsxx_complete_dma(struct rsxx_dma_ctrl *ctrl, | |||
245 | kmem_cache_free(rsxx_dma_pool, dma); | 245 | kmem_cache_free(rsxx_dma_pool, dma); |
246 | } | 246 | } |
247 | 247 | ||
248 | int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl, | ||
249 | struct list_head *q) | ||
250 | { | ||
251 | struct rsxx_dma *dma; | ||
252 | struct rsxx_dma *tmp; | ||
253 | int cnt = 0; | ||
254 | |||
255 | list_for_each_entry_safe(dma, tmp, q, list) { | ||
256 | list_del(&dma->list); | ||
257 | rsxx_complete_dma(ctrl, dma, DMA_CANCELLED); | ||
258 | cnt++; | ||
259 | } | ||
260 | |||
261 | return cnt; | ||
262 | } | ||
263 | |||
248 | static void rsxx_requeue_dma(struct rsxx_dma_ctrl *ctrl, | 264 | static void rsxx_requeue_dma(struct rsxx_dma_ctrl *ctrl, |
249 | struct rsxx_dma *dma) | 265 | struct rsxx_dma *dma) |
250 | { | 266 | { |
@@ -252,9 +268,10 @@ static void rsxx_requeue_dma(struct rsxx_dma_ctrl *ctrl, | |||
252 | * Requeued DMAs go to the front of the queue so they are issued | 268 | * Requeued DMAs go to the front of the queue so they are issued |
253 | * first. | 269 | * first. |
254 | */ | 270 | */ |
255 | spin_lock(&ctrl->queue_lock); | 271 | spin_lock_bh(&ctrl->queue_lock); |
272 | ctrl->stats.sw_q_depth++; | ||
256 | list_add(&dma->list, &ctrl->queue); | 273 | list_add(&dma->list, &ctrl->queue); |
257 | spin_unlock(&ctrl->queue_lock); | 274 | spin_unlock_bh(&ctrl->queue_lock); |
258 | } | 275 | } |
259 | 276 | ||
260 | static void rsxx_handle_dma_error(struct rsxx_dma_ctrl *ctrl, | 277 | static void rsxx_handle_dma_error(struct rsxx_dma_ctrl *ctrl, |
@@ -329,6 +346,7 @@ static void rsxx_handle_dma_error(struct rsxx_dma_ctrl *ctrl, | |||
329 | static void dma_engine_stalled(unsigned long data) | 346 | static void dma_engine_stalled(unsigned long data) |
330 | { | 347 | { |
331 | struct rsxx_dma_ctrl *ctrl = (struct rsxx_dma_ctrl *)data; | 348 | struct rsxx_dma_ctrl *ctrl = (struct rsxx_dma_ctrl *)data; |
349 | int cnt; | ||
332 | 350 | ||
333 | if (atomic_read(&ctrl->stats.hw_q_depth) == 0 || | 351 | if (atomic_read(&ctrl->stats.hw_q_depth) == 0 || |
334 | unlikely(ctrl->card->eeh_state)) | 352 | unlikely(ctrl->card->eeh_state)) |
@@ -349,18 +367,28 @@ static void dma_engine_stalled(unsigned long data) | |||
349 | "DMA channel %d has stalled, faulting interface.\n", | 367 | "DMA channel %d has stalled, faulting interface.\n", |
350 | ctrl->id); | 368 | ctrl->id); |
351 | ctrl->card->dma_fault = 1; | 369 | ctrl->card->dma_fault = 1; |
370 | |||
371 | /* Clean up the DMA queue */ | ||
372 | spin_lock(&ctrl->queue_lock); | ||
373 | cnt = rsxx_cleanup_dma_queue(ctrl, &ctrl->queue); | ||
374 | spin_unlock(&ctrl->queue_lock); | ||
375 | |||
376 | cnt += rsxx_dma_cancel(ctrl); | ||
377 | |||
378 | if (cnt) | ||
379 | dev_info(CARD_TO_DEV(ctrl->card), | ||
380 | "Freed %d queued DMAs on channel %d\n", | ||
381 | cnt, ctrl->id); | ||
352 | } | 382 | } |
353 | } | 383 | } |
354 | 384 | ||
355 | static void rsxx_issue_dmas(struct work_struct *work) | 385 | static void rsxx_issue_dmas(struct rsxx_dma_ctrl *ctrl) |
356 | { | 386 | { |
357 | struct rsxx_dma_ctrl *ctrl; | ||
358 | struct rsxx_dma *dma; | 387 | struct rsxx_dma *dma; |
359 | int tag; | 388 | int tag; |
360 | int cmds_pending = 0; | 389 | int cmds_pending = 0; |
361 | struct hw_cmd *hw_cmd_buf; | 390 | struct hw_cmd *hw_cmd_buf; |
362 | 391 | ||
363 | ctrl = container_of(work, struct rsxx_dma_ctrl, issue_dma_work); | ||
364 | hw_cmd_buf = ctrl->cmd.buf; | 392 | hw_cmd_buf = ctrl->cmd.buf; |
365 | 393 | ||
366 | if (unlikely(ctrl->card->halt) || | 394 | if (unlikely(ctrl->card->halt) || |
@@ -368,22 +396,22 @@ static void rsxx_issue_dmas(struct work_struct *work) | |||
368 | return; | 396 | return; |
369 | 397 | ||
370 | while (1) { | 398 | while (1) { |
371 | spin_lock(&ctrl->queue_lock); | 399 | spin_lock_bh(&ctrl->queue_lock); |
372 | if (list_empty(&ctrl->queue)) { | 400 | if (list_empty(&ctrl->queue)) { |
373 | spin_unlock(&ctrl->queue_lock); | 401 | spin_unlock_bh(&ctrl->queue_lock); |
374 | break; | 402 | break; |
375 | } | 403 | } |
376 | spin_unlock(&ctrl->queue_lock); | 404 | spin_unlock_bh(&ctrl->queue_lock); |
377 | 405 | ||
378 | tag = pop_tracker(ctrl->trackers); | 406 | tag = pop_tracker(ctrl->trackers); |
379 | if (tag == -1) | 407 | if (tag == -1) |
380 | break; | 408 | break; |
381 | 409 | ||
382 | spin_lock(&ctrl->queue_lock); | 410 | spin_lock_bh(&ctrl->queue_lock); |
383 | dma = list_entry(ctrl->queue.next, struct rsxx_dma, list); | 411 | dma = list_entry(ctrl->queue.next, struct rsxx_dma, list); |
384 | list_del(&dma->list); | 412 | list_del(&dma->list); |
385 | ctrl->stats.sw_q_depth--; | 413 | ctrl->stats.sw_q_depth--; |
386 | spin_unlock(&ctrl->queue_lock); | 414 | spin_unlock_bh(&ctrl->queue_lock); |
387 | 415 | ||
388 | /* | 416 | /* |
389 | * This will catch any DMAs that slipped in right before the | 417 | * This will catch any DMAs that slipped in right before the |
@@ -440,9 +468,8 @@ static void rsxx_issue_dmas(struct work_struct *work) | |||
440 | } | 468 | } |
441 | } | 469 | } |
442 | 470 | ||
443 | static void rsxx_dma_done(struct work_struct *work) | 471 | static void rsxx_dma_done(struct rsxx_dma_ctrl *ctrl) |
444 | { | 472 | { |
445 | struct rsxx_dma_ctrl *ctrl; | ||
446 | struct rsxx_dma *dma; | 473 | struct rsxx_dma *dma; |
447 | unsigned long flags; | 474 | unsigned long flags; |
448 | u16 count; | 475 | u16 count; |
@@ -450,7 +477,6 @@ static void rsxx_dma_done(struct work_struct *work) | |||
450 | u8 tag; | 477 | u8 tag; |
451 | struct hw_status *hw_st_buf; | 478 | struct hw_status *hw_st_buf; |
452 | 479 | ||
453 | ctrl = container_of(work, struct rsxx_dma_ctrl, dma_done_work); | ||
454 | hw_st_buf = ctrl->status.buf; | 480 | hw_st_buf = ctrl->status.buf; |
455 | 481 | ||
456 | if (unlikely(ctrl->card->halt) || | 482 | if (unlikely(ctrl->card->halt) || |
@@ -520,33 +546,32 @@ static void rsxx_dma_done(struct work_struct *work) | |||
520 | rsxx_enable_ier(ctrl->card, CR_INTR_DMA(ctrl->id)); | 546 | rsxx_enable_ier(ctrl->card, CR_INTR_DMA(ctrl->id)); |
521 | spin_unlock_irqrestore(&ctrl->card->irq_lock, flags); | 547 | spin_unlock_irqrestore(&ctrl->card->irq_lock, flags); |
522 | 548 | ||
523 | spin_lock(&ctrl->queue_lock); | 549 | spin_lock_bh(&ctrl->queue_lock); |
524 | if (ctrl->stats.sw_q_depth) | 550 | if (ctrl->stats.sw_q_depth) |
525 | queue_work(ctrl->issue_wq, &ctrl->issue_dma_work); | 551 | queue_work(ctrl->issue_wq, &ctrl->issue_dma_work); |
526 | spin_unlock(&ctrl->queue_lock); | 552 | spin_unlock_bh(&ctrl->queue_lock); |
527 | } | 553 | } |
528 | 554 | ||
529 | static int rsxx_cleanup_dma_queue(struct rsxx_cardinfo *card, | 555 | static void rsxx_schedule_issue(struct work_struct *work) |
530 | struct list_head *q) | ||
531 | { | 556 | { |
532 | struct rsxx_dma *dma; | 557 | struct rsxx_dma_ctrl *ctrl; |
533 | struct rsxx_dma *tmp; | ||
534 | int cnt = 0; | ||
535 | 558 | ||
536 | list_for_each_entry_safe(dma, tmp, q, list) { | 559 | ctrl = container_of(work, struct rsxx_dma_ctrl, issue_dma_work); |
537 | list_del(&dma->list); | ||
538 | 560 | ||
539 | if (dma->dma_addr) | 561 | mutex_lock(&ctrl->work_lock); |
540 | pci_unmap_page(card->dev, dma->dma_addr, | 562 | rsxx_issue_dmas(ctrl); |
541 | get_dma_size(dma), | 563 | mutex_unlock(&ctrl->work_lock); |
542 | (dma->cmd == HW_CMD_BLK_WRITE) ? | 564 | } |
543 | PCI_DMA_TODEVICE : | ||
544 | PCI_DMA_FROMDEVICE); | ||
545 | kmem_cache_free(rsxx_dma_pool, dma); | ||
546 | cnt++; | ||
547 | } | ||
548 | 565 | ||
549 | return cnt; | 566 | static void rsxx_schedule_done(struct work_struct *work) |
567 | { | ||
568 | struct rsxx_dma_ctrl *ctrl; | ||
569 | |||
570 | ctrl = container_of(work, struct rsxx_dma_ctrl, dma_done_work); | ||
571 | |||
572 | mutex_lock(&ctrl->work_lock); | ||
573 | rsxx_dma_done(ctrl); | ||
574 | mutex_unlock(&ctrl->work_lock); | ||
550 | } | 575 | } |
551 | 576 | ||
552 | static int rsxx_queue_discard(struct rsxx_cardinfo *card, | 577 | static int rsxx_queue_discard(struct rsxx_cardinfo *card, |
@@ -698,10 +723,10 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card, | |||
698 | 723 | ||
699 | for (i = 0; i < card->n_targets; i++) { | 724 | for (i = 0; i < card->n_targets; i++) { |
700 | if (!list_empty(&dma_list[i])) { | 725 | if (!list_empty(&dma_list[i])) { |
701 | spin_lock(&card->ctrl[i].queue_lock); | 726 | spin_lock_bh(&card->ctrl[i].queue_lock); |
702 | card->ctrl[i].stats.sw_q_depth += dma_cnt[i]; | 727 | card->ctrl[i].stats.sw_q_depth += dma_cnt[i]; |
703 | list_splice_tail(&dma_list[i], &card->ctrl[i].queue); | 728 | list_splice_tail(&dma_list[i], &card->ctrl[i].queue); |
704 | spin_unlock(&card->ctrl[i].queue_lock); | 729 | spin_unlock_bh(&card->ctrl[i].queue_lock); |
705 | 730 | ||
706 | queue_work(card->ctrl[i].issue_wq, | 731 | queue_work(card->ctrl[i].issue_wq, |
707 | &card->ctrl[i].issue_dma_work); | 732 | &card->ctrl[i].issue_dma_work); |
@@ -711,8 +736,11 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card, | |||
711 | return 0; | 736 | return 0; |
712 | 737 | ||
713 | bvec_err: | 738 | bvec_err: |
714 | for (i = 0; i < card->n_targets; i++) | 739 | for (i = 0; i < card->n_targets; i++) { |
715 | rsxx_cleanup_dma_queue(card, &dma_list[i]); | 740 | spin_lock_bh(&card->ctrl[i].queue_lock); |
741 | rsxx_cleanup_dma_queue(&card->ctrl[i], &dma_list[i]); | ||
742 | spin_unlock_bh(&card->ctrl[i].queue_lock); | ||
743 | } | ||
716 | 744 | ||
717 | return st; | 745 | return st; |
718 | } | 746 | } |
@@ -780,6 +808,7 @@ static int rsxx_dma_ctrl_init(struct pci_dev *dev, | |||
780 | spin_lock_init(&ctrl->trackers->lock); | 808 | spin_lock_init(&ctrl->trackers->lock); |
781 | 809 | ||
782 | spin_lock_init(&ctrl->queue_lock); | 810 | spin_lock_init(&ctrl->queue_lock); |
811 | mutex_init(&ctrl->work_lock); | ||
783 | INIT_LIST_HEAD(&ctrl->queue); | 812 | INIT_LIST_HEAD(&ctrl->queue); |
784 | 813 | ||
785 | setup_timer(&ctrl->activity_timer, dma_engine_stalled, | 814 | setup_timer(&ctrl->activity_timer, dma_engine_stalled, |
@@ -793,8 +822,8 @@ static int rsxx_dma_ctrl_init(struct pci_dev *dev, | |||
793 | if (!ctrl->done_wq) | 822 | if (!ctrl->done_wq) |
794 | return -ENOMEM; | 823 | return -ENOMEM; |
795 | 824 | ||
796 | INIT_WORK(&ctrl->issue_dma_work, rsxx_issue_dmas); | 825 | INIT_WORK(&ctrl->issue_dma_work, rsxx_schedule_issue); |
797 | INIT_WORK(&ctrl->dma_done_work, rsxx_dma_done); | 826 | INIT_WORK(&ctrl->dma_done_work, rsxx_schedule_done); |
798 | 827 | ||
799 | st = rsxx_hw_buffers_init(dev, ctrl); | 828 | st = rsxx_hw_buffers_init(dev, ctrl); |
800 | if (st) | 829 | if (st) |
@@ -918,13 +947,30 @@ failed_dma_setup: | |||
918 | return st; | 947 | return st; |
919 | } | 948 | } |
920 | 949 | ||
950 | int rsxx_dma_cancel(struct rsxx_dma_ctrl *ctrl) | ||
951 | { | ||
952 | struct rsxx_dma *dma; | ||
953 | int i; | ||
954 | int cnt = 0; | ||
955 | |||
956 | /* Clean up issued DMAs */ | ||
957 | for (i = 0; i < RSXX_MAX_OUTSTANDING_CMDS; i++) { | ||
958 | dma = get_tracker_dma(ctrl->trackers, i); | ||
959 | if (dma) { | ||
960 | atomic_dec(&ctrl->stats.hw_q_depth); | ||
961 | rsxx_complete_dma(ctrl, dma, DMA_CANCELLED); | ||
962 | push_tracker(ctrl->trackers, i); | ||
963 | cnt++; | ||
964 | } | ||
965 | } | ||
966 | |||
967 | return cnt; | ||
968 | } | ||
921 | 969 | ||
922 | void rsxx_dma_destroy(struct rsxx_cardinfo *card) | 970 | void rsxx_dma_destroy(struct rsxx_cardinfo *card) |
923 | { | 971 | { |
924 | struct rsxx_dma_ctrl *ctrl; | 972 | struct rsxx_dma_ctrl *ctrl; |
925 | struct rsxx_dma *dma; | 973 | int i; |
926 | int i, j; | ||
927 | int cnt = 0; | ||
928 | 974 | ||
929 | for (i = 0; i < card->n_targets; i++) { | 975 | for (i = 0; i < card->n_targets; i++) { |
930 | ctrl = &card->ctrl[i]; | 976 | ctrl = &card->ctrl[i]; |
@@ -943,33 +989,11 @@ void rsxx_dma_destroy(struct rsxx_cardinfo *card) | |||
943 | del_timer_sync(&ctrl->activity_timer); | 989 | del_timer_sync(&ctrl->activity_timer); |
944 | 990 | ||
945 | /* Clean up the DMA queue */ | 991 | /* Clean up the DMA queue */ |
946 | spin_lock(&ctrl->queue_lock); | 992 | spin_lock_bh(&ctrl->queue_lock); |
947 | cnt = rsxx_cleanup_dma_queue(card, &ctrl->queue); | 993 | rsxx_cleanup_dma_queue(ctrl, &ctrl->queue); |
948 | spin_unlock(&ctrl->queue_lock); | 994 | spin_unlock_bh(&ctrl->queue_lock); |
949 | |||
950 | if (cnt) | ||
951 | dev_info(CARD_TO_DEV(card), | ||
952 | "Freed %d queued DMAs on channel %d\n", | ||
953 | cnt, i); | ||
954 | |||
955 | /* Clean up issued DMAs */ | ||
956 | for (j = 0; j < RSXX_MAX_OUTSTANDING_CMDS; j++) { | ||
957 | dma = get_tracker_dma(ctrl->trackers, j); | ||
958 | if (dma) { | ||
959 | pci_unmap_page(card->dev, dma->dma_addr, | ||
960 | get_dma_size(dma), | ||
961 | (dma->cmd == HW_CMD_BLK_WRITE) ? | ||
962 | PCI_DMA_TODEVICE : | ||
963 | PCI_DMA_FROMDEVICE); | ||
964 | kmem_cache_free(rsxx_dma_pool, dma); | ||
965 | cnt++; | ||
966 | } | ||
967 | } | ||
968 | 995 | ||
969 | if (cnt) | 996 | rsxx_dma_cancel(ctrl); |
970 | dev_info(CARD_TO_DEV(card), | ||
971 | "Freed %d pending DMAs on channel %d\n", | ||
972 | cnt, i); | ||
973 | 997 | ||
974 | vfree(ctrl->trackers); | 998 | vfree(ctrl->trackers); |
975 | 999 | ||
@@ -1013,7 +1037,7 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card) | |||
1013 | cnt++; | 1037 | cnt++; |
1014 | } | 1038 | } |
1015 | 1039 | ||
1016 | spin_lock(&card->ctrl[i].queue_lock); | 1040 | spin_lock_bh(&card->ctrl[i].queue_lock); |
1017 | list_splice(&issued_dmas[i], &card->ctrl[i].queue); | 1041 | list_splice(&issued_dmas[i], &card->ctrl[i].queue); |
1018 | 1042 | ||
1019 | atomic_sub(cnt, &card->ctrl[i].stats.hw_q_depth); | 1043 | atomic_sub(cnt, &card->ctrl[i].stats.hw_q_depth); |
@@ -1028,7 +1052,7 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card) | |||
1028 | PCI_DMA_TODEVICE : | 1052 | PCI_DMA_TODEVICE : |
1029 | PCI_DMA_FROMDEVICE); | 1053 | PCI_DMA_FROMDEVICE); |
1030 | } | 1054 | } |
1031 | spin_unlock(&card->ctrl[i].queue_lock); | 1055 | spin_unlock_bh(&card->ctrl[i].queue_lock); |
1032 | } | 1056 | } |
1033 | 1057 | ||
1034 | kfree(issued_dmas); | 1058 | kfree(issued_dmas); |
@@ -1036,30 +1060,13 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card) | |||
1036 | return 0; | 1060 | return 0; |
1037 | } | 1061 | } |
1038 | 1062 | ||
1039 | void rsxx_eeh_cancel_dmas(struct rsxx_cardinfo *card) | ||
1040 | { | ||
1041 | struct rsxx_dma *dma; | ||
1042 | struct rsxx_dma *tmp; | ||
1043 | int i; | ||
1044 | |||
1045 | for (i = 0; i < card->n_targets; i++) { | ||
1046 | spin_lock(&card->ctrl[i].queue_lock); | ||
1047 | list_for_each_entry_safe(dma, tmp, &card->ctrl[i].queue, list) { | ||
1048 | list_del(&dma->list); | ||
1049 | |||
1050 | rsxx_complete_dma(&card->ctrl[i], dma, DMA_CANCELLED); | ||
1051 | } | ||
1052 | spin_unlock(&card->ctrl[i].queue_lock); | ||
1053 | } | ||
1054 | } | ||
1055 | |||
1056 | int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card) | 1063 | int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card) |
1057 | { | 1064 | { |
1058 | struct rsxx_dma *dma; | 1065 | struct rsxx_dma *dma; |
1059 | int i; | 1066 | int i; |
1060 | 1067 | ||
1061 | for (i = 0; i < card->n_targets; i++) { | 1068 | for (i = 0; i < card->n_targets; i++) { |
1062 | spin_lock(&card->ctrl[i].queue_lock); | 1069 | spin_lock_bh(&card->ctrl[i].queue_lock); |
1063 | list_for_each_entry(dma, &card->ctrl[i].queue, list) { | 1070 | list_for_each_entry(dma, &card->ctrl[i].queue, list) { |
1064 | dma->dma_addr = pci_map_page(card->dev, dma->page, | 1071 | dma->dma_addr = pci_map_page(card->dev, dma->page, |
1065 | dma->pg_off, get_dma_size(dma), | 1072 | dma->pg_off, get_dma_size(dma), |
@@ -1067,12 +1074,12 @@ int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card) | |||
1067 | PCI_DMA_TODEVICE : | 1074 | PCI_DMA_TODEVICE : |
1068 | PCI_DMA_FROMDEVICE); | 1075 | PCI_DMA_FROMDEVICE); |
1069 | if (!dma->dma_addr) { | 1076 | if (!dma->dma_addr) { |
1070 | spin_unlock(&card->ctrl[i].queue_lock); | 1077 | spin_unlock_bh(&card->ctrl[i].queue_lock); |
1071 | kmem_cache_free(rsxx_dma_pool, dma); | 1078 | kmem_cache_free(rsxx_dma_pool, dma); |
1072 | return -ENOMEM; | 1079 | return -ENOMEM; |
1073 | } | 1080 | } |
1074 | } | 1081 | } |
1075 | spin_unlock(&card->ctrl[i].queue_lock); | 1082 | spin_unlock_bh(&card->ctrl[i].queue_lock); |
1076 | } | 1083 | } |
1077 | 1084 | ||
1078 | return 0; | 1085 | return 0; |
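Most users of queue_lock in this file switch from spin_lock() to spin_lock_bh(), which appears to be needed because the lock is now also taken from the dma_engine_stalled() timer callback, which runs in softirq context: a process-context holder must block bottom halves so the timer cannot fire on the same CPU and spin forever on the lock it already holds. A minimal sketch of that locking rule with hypothetical names:

/* Sketch: one lock shared by process context and a timer (softirq) callback. */
#include <linux/jiffies.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/timer.h>

struct my_queue {
        spinlock_t lock;
        struct list_head items;
        struct timer_list timer;
};

/* Timer callbacks run in softirq (bottom-half) context; plain spin_lock() suffices. */
static void my_timer_fn(unsigned long data)
{
        struct my_queue *q = (struct my_queue *)data;

        spin_lock(&q->lock);
        /* e.g. cancel or time out whatever is still queued */
        spin_unlock(&q->lock);
}

/* Process context uses the _bh variant so the timer cannot preempt the
 * critical section on this CPU and deadlock on the held lock. */
static void my_queue_add(struct my_queue *q, struct list_head *item)
{
        spin_lock_bh(&q->lock);
        list_add_tail(item, &q->items);
        spin_unlock_bh(&q->lock);
}

static void my_queue_init(struct my_queue *q)
{
        spin_lock_init(&q->lock);
        INIT_LIST_HEAD(&q->items);
        setup_timer(&q->timer, my_timer_fn, (unsigned long)q);
        mod_timer(&q->timer, jiffies + msecs_to_jiffies(100));
}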
diff --git a/drivers/block/rsxx/rsxx_priv.h b/drivers/block/rsxx/rsxx_priv.h index 382e8bf5c03b..5ad5055a4104 100644 --- a/drivers/block/rsxx/rsxx_priv.h +++ b/drivers/block/rsxx/rsxx_priv.h | |||
@@ -39,6 +39,7 @@ | |||
39 | #include <linux/vmalloc.h> | 39 | #include <linux/vmalloc.h> |
40 | #include <linux/timer.h> | 40 | #include <linux/timer.h> |
41 | #include <linux/ioctl.h> | 41 | #include <linux/ioctl.h> |
42 | #include <linux/delay.h> | ||
42 | 43 | ||
43 | #include "rsxx.h" | 44 | #include "rsxx.h" |
44 | #include "rsxx_cfg.h" | 45 | #include "rsxx_cfg.h" |
@@ -114,6 +115,7 @@ struct rsxx_dma_ctrl { | |||
114 | struct timer_list activity_timer; | 115 | struct timer_list activity_timer; |
115 | struct dma_tracker_list *trackers; | 116 | struct dma_tracker_list *trackers; |
116 | struct rsxx_dma_stats stats; | 117 | struct rsxx_dma_stats stats; |
118 | struct mutex work_lock; | ||
117 | }; | 119 | }; |
118 | 120 | ||
119 | struct rsxx_cardinfo { | 121 | struct rsxx_cardinfo { |
@@ -134,6 +136,7 @@ struct rsxx_cardinfo { | |||
134 | spinlock_t lock; | 136 | spinlock_t lock; |
135 | bool active; | 137 | bool active; |
136 | struct creg_cmd *active_cmd; | 138 | struct creg_cmd *active_cmd; |
139 | struct workqueue_struct *creg_wq; | ||
137 | struct work_struct done_work; | 140 | struct work_struct done_work; |
138 | struct list_head queue; | 141 | struct list_head queue; |
139 | unsigned int q_depth; | 142 | unsigned int q_depth; |
@@ -154,6 +157,7 @@ struct rsxx_cardinfo { | |||
154 | int buf_len; | 157 | int buf_len; |
155 | } log; | 158 | } log; |
156 | 159 | ||
160 | struct workqueue_struct *event_wq; | ||
157 | struct work_struct event_work; | 161 | struct work_struct event_work; |
158 | unsigned int state; | 162 | unsigned int state; |
159 | u64 size8; | 163 | u64 size8; |
@@ -181,6 +185,8 @@ struct rsxx_cardinfo { | |||
181 | 185 | ||
182 | int n_targets; | 186 | int n_targets; |
183 | struct rsxx_dma_ctrl *ctrl; | 187 | struct rsxx_dma_ctrl *ctrl; |
188 | |||
189 | struct dentry *debugfs_dir; | ||
184 | }; | 190 | }; |
185 | 191 | ||
186 | enum rsxx_pci_regmap { | 192 | enum rsxx_pci_regmap { |
@@ -283,6 +289,7 @@ enum rsxx_creg_addr { | |||
283 | CREG_ADD_CAPABILITIES = 0x80001050, | 289 | CREG_ADD_CAPABILITIES = 0x80001050, |
284 | CREG_ADD_LOG = 0x80002000, | 290 | CREG_ADD_LOG = 0x80002000, |
285 | CREG_ADD_NUM_TARGETS = 0x80003000, | 291 | CREG_ADD_NUM_TARGETS = 0x80003000, |
292 | CREG_ADD_CRAM = 0xA0000000, | ||
286 | CREG_ADD_CONFIG = 0xB0000000, | 293 | CREG_ADD_CONFIG = 0xB0000000, |
287 | }; | 294 | }; |
288 | 295 | ||
@@ -372,6 +379,8 @@ typedef void (*rsxx_dma_cb)(struct rsxx_cardinfo *card, | |||
372 | int rsxx_dma_setup(struct rsxx_cardinfo *card); | 379 | int rsxx_dma_setup(struct rsxx_cardinfo *card); |
373 | void rsxx_dma_destroy(struct rsxx_cardinfo *card); | 380 | void rsxx_dma_destroy(struct rsxx_cardinfo *card); |
374 | int rsxx_dma_init(void); | 381 | int rsxx_dma_init(void); |
382 | int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl, struct list_head *q); | ||
383 | int rsxx_dma_cancel(struct rsxx_dma_ctrl *ctrl); | ||
375 | void rsxx_dma_cleanup(void); | 384 | void rsxx_dma_cleanup(void); |
376 | void rsxx_dma_queue_reset(struct rsxx_cardinfo *card); | 385 | void rsxx_dma_queue_reset(struct rsxx_cardinfo *card); |
377 | int rsxx_dma_configure(struct rsxx_cardinfo *card); | 386 | int rsxx_dma_configure(struct rsxx_cardinfo *card); |
@@ -382,7 +391,6 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card, | |||
382 | void *cb_data); | 391 | void *cb_data); |
383 | int rsxx_hw_buffers_init(struct pci_dev *dev, struct rsxx_dma_ctrl *ctrl); | 392 | int rsxx_hw_buffers_init(struct pci_dev *dev, struct rsxx_dma_ctrl *ctrl); |
384 | int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card); | 393 | int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card); |
385 | void rsxx_eeh_cancel_dmas(struct rsxx_cardinfo *card); | ||
386 | int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card); | 394 | int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card); |
387 | 395 | ||
388 | /***** cregs.c *****/ | 396 | /***** cregs.c *****/ |
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index dd5b2fed97e9..bf4b9d282c04 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c | |||
@@ -50,110 +50,118 @@ | |||
50 | #include "common.h" | 50 | #include "common.h" |
51 | 51 | ||
52 | /* | 52 | /* |
53 | * These are rather arbitrary. They are fairly large because adjacent requests | 53 | * Maximum number of unused free pages to keep in the internal buffer. |
54 | * pulled from a communication ring are quite likely to end up being part of | 54 | * Setting this to a value too low will reduce memory used in each backend, |
55 | * the same scatter/gather request at the disc. | 55 | * but can have a performance penalty. |
56 | * | 56 | * |
57 | * ** TRY INCREASING 'xen_blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW ** | 57 | * A sane value is xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST, but can |
58 | * | 58 | * be set to a lower value that might degrade performance on some intensive |
59 | * This will increase the chances of being able to write whole tracks. | 59 | * IO workloads. |
60 | * 64 should be enough to keep us competitive with Linux. | ||
61 | */ | 60 | */ |
62 | static int xen_blkif_reqs = 64; | ||
63 | module_param_named(reqs, xen_blkif_reqs, int, 0); | ||
64 | MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate"); | ||
65 | 61 | ||
66 | /* Run-time switchable: /sys/module/blkback/parameters/ */ | 62 | static int xen_blkif_max_buffer_pages = 1024; |
67 | static unsigned int log_stats; | 63 | module_param_named(max_buffer_pages, xen_blkif_max_buffer_pages, int, 0644); |
68 | module_param(log_stats, int, 0644); | 64 | MODULE_PARM_DESC(max_buffer_pages, |
65 | "Maximum number of free pages to keep in each block backend buffer"); | ||
69 | 66 | ||
70 | /* | 67 | /* |
71 | * Each outstanding request that we've passed to the lower device layers has a | 68 | * Maximum number of grants to map persistently in blkback. For maximum |
72 | * 'pending_req' allocated to it. Each buffer_head that completes decrements | 69 | * performance this should be the total number of grants that can be used |
73 | * the pendcnt towards zero. When it hits zero, the specified domain has a | 70 | * to fill the ring, but since this might become too high, especially with |
74 | * response queued for it, with the saved 'id' passed back. | 71 | * the use of indirect descriptors, we set it to a value that provides good |
72 | * performance without using too much memory. | ||
73 | * | ||
74 | * When the list of persistent grants is full we clean it up using a LRU | ||
75 | * algorithm. | ||
75 | */ | 76 | */ |
76 | struct pending_req { | ||
77 | struct xen_blkif *blkif; | ||
78 | u64 id; | ||
79 | int nr_pages; | ||
80 | atomic_t pendcnt; | ||
81 | unsigned short operation; | ||
82 | int status; | ||
83 | struct list_head free_list; | ||
84 | DECLARE_BITMAP(unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST); | ||
85 | }; | ||
86 | 77 | ||
87 | #define BLKBACK_INVALID_HANDLE (~0) | 78 | static int xen_blkif_max_pgrants = 1056; |
79 | module_param_named(max_persistent_grants, xen_blkif_max_pgrants, int, 0644); | ||
80 | MODULE_PARM_DESC(max_persistent_grants, | ||
81 | "Maximum number of grants to map persistently"); | ||
88 | 82 | ||
89 | struct xen_blkbk { | 83 | /* |
90 | struct pending_req *pending_reqs; | 84 | * The LRU mechanism to clean the lists of persistent grants needs to |
91 | /* List of all 'pending_req' available */ | 85 | * be executed periodically. The time interval between consecutive executions |
92 | struct list_head pending_free; | 86 | * of the purge mechanism is set in ms. |
93 | /* And its spinlock. */ | 87 | */ |
94 | spinlock_t pending_free_lock; | 88 | #define LRU_INTERVAL 100 |
95 | wait_queue_head_t pending_free_wq; | ||
96 | /* The list of all pages that are available. */ | ||
97 | struct page **pending_pages; | ||
98 | /* And the grant handles that are available. */ | ||
99 | grant_handle_t *pending_grant_handles; | ||
100 | }; | ||
101 | |||
102 | static struct xen_blkbk *blkbk; | ||
103 | 89 | ||
104 | /* | 90 | /* |
105 | * Maximum number of grant pages that can be mapped in blkback. | 91 | * When the persistent grants list is full we will remove unused grants |
106 | * BLKIF_MAX_SEGMENTS_PER_REQUEST * RING_SIZE is the maximum number of | 92 | * from the list. This is the percentage of grants to be removed at each LRU |
107 | * pages that blkback will persistently map. | 93 | * execution. |
108 | * Currently, this is: | ||
109 | * RING_SIZE = 32 (for all known ring types) | ||
110 | * BLKIF_MAX_SEGMENTS_PER_REQUEST = 11 | ||
111 | * sizeof(struct persistent_gnt) = 48 | ||
112 | * So the maximum memory used to store the grants is: | ||
113 | * 32 * 11 * 48 = 16896 bytes | ||
114 | */ | 94 | */ |
115 | static inline unsigned int max_mapped_grant_pages(enum blkif_protocol protocol) | 95 | #define LRU_PERCENT_CLEAN 5 |
96 | |||
97 | /* Run-time switchable: /sys/module/blkback/parameters/ */ | ||
98 | static unsigned int log_stats; | ||
99 | module_param(log_stats, int, 0644); | ||
100 | |||
101 | #define BLKBACK_INVALID_HANDLE (~0) | ||
102 | |||
103 | /* Number of free pages to remove on each call to free_xenballooned_pages */ | ||
104 | #define NUM_BATCH_FREE_PAGES 10 | ||
105 | |||
106 | static inline int get_free_page(struct xen_blkif *blkif, struct page **page) | ||
116 | { | 107 | { |
117 | switch (protocol) { | 108 | unsigned long flags; |
118 | case BLKIF_PROTOCOL_NATIVE: | 109 | |
119 | return __CONST_RING_SIZE(blkif, PAGE_SIZE) * | 110 | spin_lock_irqsave(&blkif->free_pages_lock, flags); |
120 | BLKIF_MAX_SEGMENTS_PER_REQUEST; | 111 | if (list_empty(&blkif->free_pages)) { |
121 | case BLKIF_PROTOCOL_X86_32: | 112 | BUG_ON(blkif->free_pages_num != 0); |
122 | return __CONST_RING_SIZE(blkif_x86_32, PAGE_SIZE) * | 113 | spin_unlock_irqrestore(&blkif->free_pages_lock, flags); |
123 | BLKIF_MAX_SEGMENTS_PER_REQUEST; | 114 | return alloc_xenballooned_pages(1, page, false); |
124 | case BLKIF_PROTOCOL_X86_64: | ||
125 | return __CONST_RING_SIZE(blkif_x86_64, PAGE_SIZE) * | ||
126 | BLKIF_MAX_SEGMENTS_PER_REQUEST; | ||
127 | default: | ||
128 | BUG(); | ||
129 | } | 115 | } |
116 | BUG_ON(blkif->free_pages_num == 0); | ||
117 | page[0] = list_first_entry(&blkif->free_pages, struct page, lru); | ||
118 | list_del(&page[0]->lru); | ||
119 | blkif->free_pages_num--; | ||
120 | spin_unlock_irqrestore(&blkif->free_pages_lock, flags); | ||
121 | |||
130 | return 0; | 122 | return 0; |
131 | } | 123 | } |
132 | 124 | ||
133 | 125 | static inline void put_free_pages(struct xen_blkif *blkif, struct page **page, | |
134 | /* | 126 | int num) |
135 | * Little helper macro to figure out the index and virtual address of the | ||
136 | * pending_pages[..]. For each 'pending_req' we have up to | ||
137 | * BLKIF_MAX_SEGMENTS_PER_REQUEST (11) pages. The seg would be from 0 through | ||
138 | * 10 and would index in the pending_pages[..]. | ||
139 | */ | ||
140 | static inline int vaddr_pagenr(struct pending_req *req, int seg) | ||
141 | { | 127 | { |
142 | return (req - blkbk->pending_reqs) * | 128 | unsigned long flags; |
143 | BLKIF_MAX_SEGMENTS_PER_REQUEST + seg; | 129 | int i; |
144 | } | ||
145 | 130 | ||
146 | #define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)] | 131 | spin_lock_irqsave(&blkif->free_pages_lock, flags); |
132 | for (i = 0; i < num; i++) | ||
133 | list_add(&page[i]->lru, &blkif->free_pages); | ||
134 | blkif->free_pages_num += num; | ||
135 | spin_unlock_irqrestore(&blkif->free_pages_lock, flags); | ||
136 | } | ||
147 | 137 | ||
148 | static inline unsigned long vaddr(struct pending_req *req, int seg) | 138 | static inline void shrink_free_pagepool(struct xen_blkif *blkif, int num) |
149 | { | 139 | { |
150 | unsigned long pfn = page_to_pfn(blkbk->pending_page(req, seg)); | 140 | /* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */ |
151 | return (unsigned long)pfn_to_kaddr(pfn); | 141 | struct page *page[NUM_BATCH_FREE_PAGES]; |
152 | } | 142 | unsigned int num_pages = 0; |
143 | unsigned long flags; | ||
153 | 144 | ||
154 | #define pending_handle(_req, _seg) \ | 145 | spin_lock_irqsave(&blkif->free_pages_lock, flags); |
155 | (blkbk->pending_grant_handles[vaddr_pagenr(_req, _seg)]) | 146 | while (blkif->free_pages_num > num) { |
147 | BUG_ON(list_empty(&blkif->free_pages)); | ||
148 | page[num_pages] = list_first_entry(&blkif->free_pages, | ||
149 | struct page, lru); | ||
150 | list_del(&page[num_pages]->lru); | ||
151 | blkif->free_pages_num--; | ||
152 | if (++num_pages == NUM_BATCH_FREE_PAGES) { | ||
153 | spin_unlock_irqrestore(&blkif->free_pages_lock, flags); | ||
154 | free_xenballooned_pages(num_pages, page); | ||
155 | spin_lock_irqsave(&blkif->free_pages_lock, flags); | ||
156 | num_pages = 0; | ||
157 | } | ||
158 | } | ||
159 | spin_unlock_irqrestore(&blkif->free_pages_lock, flags); | ||
160 | if (num_pages != 0) | ||
161 | free_xenballooned_pages(num_pages, page); | ||
162 | } | ||
156 | 163 | ||
164 | #define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page))) | ||
157 | 165 | ||
158 | static int do_block_io_op(struct xen_blkif *blkif); | 166 | static int do_block_io_op(struct xen_blkif *blkif); |
159 | static int dispatch_rw_block_io(struct xen_blkif *blkif, | 167 | static int dispatch_rw_block_io(struct xen_blkif *blkif, |
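The free-page pool introduced above reduces to a locked LIFO list with a soft cap: get_free_page() hands out a cached page or falls back to the balloon allocator, put_free_pages() pushes pages back, and shrink_free_pagepool() trims anything above the cap in batches of NUM_BATCH_FREE_PAGES so the lock is dropped around each bulk free. Below is a minimal userspace sketch of that shape, with malloc()/free() standing in for alloc_xenballooned_pages()/free_xenballooned_pages() and a mutex in place of the irq-safe spinlock; the names are illustrative, not the driver's.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NUM_BATCH_FREE 10          /* mirrors NUM_BATCH_FREE_PAGES */

struct node { struct node *next; };

static struct node *free_list;     /* LIFO of cached "pages" */
static unsigned int free_num;
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;

/* Hand out a cached buffer, or allocate a fresh one on a cache miss. */
static void *pool_get(void)
{
	struct node *n;

	pthread_mutex_lock(&pool_lock);
	n = free_list;
	if (n) {
		free_list = n->next;
		free_num--;
	}
	pthread_mutex_unlock(&pool_lock);
	return n ? (void *)n : malloc(4096);
}

/* Return a buffer to the pool; nothing is freed here. */
static void pool_put(void *p)
{
	struct node *n = p;

	pthread_mutex_lock(&pool_lock);
	n->next = free_list;
	free_list = n;
	free_num++;
	pthread_mutex_unlock(&pool_lock);
}

/* Trim the pool down to 'keep' buffers, releasing them in small batches
 * so the lock is not held across every free(). */
static void pool_shrink(unsigned int keep)
{
	void *batch[NUM_BATCH_FREE];
	unsigned int i, n = 0;

	pthread_mutex_lock(&pool_lock);
	while (free_num > keep) {
		struct node *victim = free_list;
		free_list = victim->next;
		free_num--;
		batch[n++] = victim;
		if (n == NUM_BATCH_FREE) {
			pthread_mutex_unlock(&pool_lock);
			for (i = 0; i < n; i++)
				free(batch[i]);
			n = 0;
			pthread_mutex_lock(&pool_lock);
		}
	}
	pthread_mutex_unlock(&pool_lock);
	for (i = 0; i < n; i++)
		free(batch[i]);
}

int main(void)
{
	void *a = pool_get(), *b = pool_get();

	pool_put(a);
	pool_put(b);
	pool_shrink(1);
	printf("cached buffers after shrink: %u\n", free_num);
	return 0;
}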
@@ -170,13 +178,29 @@ static void make_response(struct xen_blkif *blkif, u64 id, | |||
170 | (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL) | 178 | (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL) |
171 | 179 | ||
172 | 180 | ||
173 | static void add_persistent_gnt(struct rb_root *root, | 181 | /* |
182 | * We don't need locking around the persistent grant helpers | ||
183 | * because blkback uses a single thread for each backend, so we | ||
184 | * can be sure that these functions will never be called recursively. | ||
185 | * | ||
186 | * The only exception to that is put_persistent_grant, that can be called | ||
187 | * from interrupt context (by xen_blkbk_unmap), so we have to use atomic | ||
188 | * bit operations to modify the flags of a persistent grant and to count | ||
189 | * the number of used grants. | ||
190 | */ | ||
191 | static int add_persistent_gnt(struct xen_blkif *blkif, | ||
174 | struct persistent_gnt *persistent_gnt) | 192 | struct persistent_gnt *persistent_gnt) |
175 | { | 193 | { |
176 | struct rb_node **new = &(root->rb_node), *parent = NULL; | 194 | struct rb_node **new = NULL, *parent = NULL; |
177 | struct persistent_gnt *this; | 195 | struct persistent_gnt *this; |
178 | 196 | ||
197 | if (blkif->persistent_gnt_c >= xen_blkif_max_pgrants) { | ||
198 | if (!blkif->vbd.overflow_max_grants) | ||
199 | blkif->vbd.overflow_max_grants = 1; | ||
200 | return -EBUSY; | ||
201 | } | ||
179 | /* Figure out where to put new node */ | 202 | /* Figure out where to put new node */ |
203 | new = &blkif->persistent_gnts.rb_node; | ||
180 | while (*new) { | 204 | while (*new) { |
181 | this = container_of(*new, struct persistent_gnt, node); | 205 | this = container_of(*new, struct persistent_gnt, node); |
182 | 206 | ||
@@ -186,22 +210,28 @@ static void add_persistent_gnt(struct rb_root *root, | |||
186 | else if (persistent_gnt->gnt > this->gnt) | 210 | else if (persistent_gnt->gnt > this->gnt) |
187 | new = &((*new)->rb_right); | 211 | new = &((*new)->rb_right); |
188 | else { | 212 | else { |
189 | pr_alert(DRV_PFX " trying to add a gref that's already in the tree\n"); | 213 | pr_alert_ratelimited(DRV_PFX " trying to add a gref that's already in the tree\n"); |
190 | BUG(); | 214 | return -EINVAL; |
191 | } | 215 | } |
192 | } | 216 | } |
193 | 217 | ||
218 | bitmap_zero(persistent_gnt->flags, PERSISTENT_GNT_FLAGS_SIZE); | ||
219 | set_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags); | ||
194 | /* Add new node and rebalance tree. */ | 220 | /* Add new node and rebalance tree. */ |
195 | rb_link_node(&(persistent_gnt->node), parent, new); | 221 | rb_link_node(&(persistent_gnt->node), parent, new); |
196 | rb_insert_color(&(persistent_gnt->node), root); | 222 | rb_insert_color(&(persistent_gnt->node), &blkif->persistent_gnts); |
223 | blkif->persistent_gnt_c++; | ||
224 | atomic_inc(&blkif->persistent_gnt_in_use); | ||
225 | return 0; | ||
197 | } | 226 | } |
198 | 227 | ||
199 | static struct persistent_gnt *get_persistent_gnt(struct rb_root *root, | 228 | static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif, |
200 | grant_ref_t gref) | 229 | grant_ref_t gref) |
201 | { | 230 | { |
202 | struct persistent_gnt *data; | 231 | struct persistent_gnt *data; |
203 | struct rb_node *node = root->rb_node; | 232 | struct rb_node *node = NULL; |
204 | 233 | ||
234 | node = blkif->persistent_gnts.rb_node; | ||
205 | while (node) { | 235 | while (node) { |
206 | data = container_of(node, struct persistent_gnt, node); | 236 | data = container_of(node, struct persistent_gnt, node); |
207 | 237 | ||
@@ -209,13 +239,31 @@ static struct persistent_gnt *get_persistent_gnt(struct rb_root *root, | |||
209 | node = node->rb_left; | 239 | node = node->rb_left; |
210 | else if (gref > data->gnt) | 240 | else if (gref > data->gnt) |
211 | node = node->rb_right; | 241 | node = node->rb_right; |
212 | else | 242 | else { |
243 | if (test_bit(PERSISTENT_GNT_ACTIVE, data->flags)) { | ||
244 | pr_alert_ratelimited(DRV_PFX " requesting a grant already in use\n"); | ||
245 | return NULL; | ||
246 | } | ||
247 | set_bit(PERSISTENT_GNT_ACTIVE, data->flags); | ||
248 | atomic_inc(&blkif->persistent_gnt_in_use); | ||
213 | return data; | 249 | return data; |
250 | } | ||
214 | } | 251 | } |
215 | return NULL; | 252 | return NULL; |
216 | } | 253 | } |
217 | 254 | ||
218 | static void free_persistent_gnts(struct rb_root *root, unsigned int num) | 255 | static void put_persistent_gnt(struct xen_blkif *blkif, |
256 | struct persistent_gnt *persistent_gnt) | ||
257 | { | ||
258 | if (!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags)) | ||
259 | pr_alert_ratelimited(DRV_PFX " freeing a grant already unused\n"); | ||
260 | set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags); | ||
261 | clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags); | ||
262 | atomic_dec(&blkif->persistent_gnt_in_use); | ||
263 | } | ||
264 | |||
265 | static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root, | ||
266 | unsigned int num) | ||
219 | { | 267 | { |
220 | struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | 268 | struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; |
221 | struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | 269 | struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; |
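The helpers above treat the persistent-grant tree as a cache keyed by grant reference: add_persistent_gnt() refuses duplicates and anything beyond xen_blkif_max_pgrants, get_persistent_gnt() hands a grant out only if its ACTIVE bit is clear, and put_persistent_gnt() records WAS_ACTIVE for the later LRU pass. A compact sketch of that lookup discipline, using an ordinary binary search tree instead of the kernel rb-tree; the function and flag names here are hypothetical.

#include <stdio.h>
#include <stdlib.h>

enum { FLAG_ACTIVE = 1, FLAG_WAS_ACTIVE = 2 };

struct gnt {
	unsigned int gref;     /* lookup key, like persistent_gnt->gnt */
	unsigned int flags;
	struct gnt *l, *r;
};

static struct gnt *root;
static unsigned int count, max_gnts = 4;  /* stands in for xen_blkif_max_pgrants */

/* Insert a grant; reject duplicates and refuse to grow past the cap. */
static int gnt_add(struct gnt *g)
{
	struct gnt **pp = &root;

	if (count >= max_gnts)
		return -1;                 /* caller falls back to a plain map */
	while (*pp) {
		if (g->gref < (*pp)->gref)
			pp = &(*pp)->l;
		else if (g->gref > (*pp)->gref)
			pp = &(*pp)->r;
		else
			return -1;         /* already cached */
	}
	g->flags = FLAG_ACTIVE;            /* new grants start out in use */
	*pp = g;
	count++;
	return 0;
}

/* Look up a cached grant and mark it busy; NULL if missing or already busy. */
static struct gnt *gnt_get(unsigned int gref)
{
	struct gnt *g = root;

	while (g) {
		if (gref < g->gref) {
			g = g->l;
		} else if (gref > g->gref) {
			g = g->r;
		} else {
			if (g->flags & FLAG_ACTIVE)
				return NULL;   /* already handed out */
			g->flags |= FLAG_ACTIVE;
			return g;
		}
	}
	return NULL;
}

/* Release a grant: remember it was used so the LRU spares it for a while. */
static void gnt_put(struct gnt *g)
{
	g->flags = FLAG_WAS_ACTIVE;
}

int main(void)
{
	struct gnt *a = calloc(1, sizeof(*a));

	a->gref = 42;
	gnt_add(a);
	gnt_put(a);
	printf("second get works: %s\n", gnt_get(42) ? "yes" : "no");
	return 0;
}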
@@ -240,7 +288,7 @@ static void free_persistent_gnts(struct rb_root *root, unsigned int num) | |||
240 | ret = gnttab_unmap_refs(unmap, NULL, pages, | 288 | ret = gnttab_unmap_refs(unmap, NULL, pages, |
241 | segs_to_unmap); | 289 | segs_to_unmap); |
242 | BUG_ON(ret); | 290 | BUG_ON(ret); |
243 | free_xenballooned_pages(segs_to_unmap, pages); | 291 | put_free_pages(blkif, pages, segs_to_unmap); |
244 | segs_to_unmap = 0; | 292 | segs_to_unmap = 0; |
245 | } | 293 | } |
246 | 294 | ||
@@ -251,21 +299,148 @@ static void free_persistent_gnts(struct rb_root *root, unsigned int num) | |||
251 | BUG_ON(num != 0); | 299 | BUG_ON(num != 0); |
252 | } | 300 | } |
253 | 301 | ||
302 | static void unmap_purged_grants(struct work_struct *work) | ||
303 | { | ||
304 | struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | ||
305 | struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | ||
306 | struct persistent_gnt *persistent_gnt; | ||
307 | int ret, segs_to_unmap = 0; | ||
308 | struct xen_blkif *blkif = container_of(work, typeof(*blkif), persistent_purge_work); | ||
309 | |||
310 | while (!list_empty(&blkif->persistent_purge_list)) { | ||
311 | persistent_gnt = list_first_entry(&blkif->persistent_purge_list, | ||
312 | struct persistent_gnt, | ||
313 | remove_node); | ||
314 | list_del(&persistent_gnt->remove_node); | ||
315 | |||
316 | gnttab_set_unmap_op(&unmap[segs_to_unmap], | ||
317 | vaddr(persistent_gnt->page), | ||
318 | GNTMAP_host_map, | ||
319 | persistent_gnt->handle); | ||
320 | |||
321 | pages[segs_to_unmap] = persistent_gnt->page; | ||
322 | |||
323 | if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) { | ||
324 | ret = gnttab_unmap_refs(unmap, NULL, pages, | ||
325 | segs_to_unmap); | ||
326 | BUG_ON(ret); | ||
327 | put_free_pages(blkif, pages, segs_to_unmap); | ||
328 | segs_to_unmap = 0; | ||
329 | } | ||
330 | kfree(persistent_gnt); | ||
331 | } | ||
332 | if (segs_to_unmap > 0) { | ||
333 | ret = gnttab_unmap_refs(unmap, NULL, pages, segs_to_unmap); | ||
334 | BUG_ON(ret); | ||
335 | put_free_pages(blkif, pages, segs_to_unmap); | ||
336 | } | ||
337 | } | ||
338 | |||
339 | static void purge_persistent_gnt(struct xen_blkif *blkif) | ||
340 | { | ||
341 | struct persistent_gnt *persistent_gnt; | ||
342 | struct rb_node *n; | ||
343 | unsigned int num_clean, total; | ||
344 | bool scan_used = false, clean_used = false; | ||
345 | struct rb_root *root; | ||
346 | |||
347 | if (blkif->persistent_gnt_c < xen_blkif_max_pgrants || | ||
348 | (blkif->persistent_gnt_c == xen_blkif_max_pgrants && | ||
349 | !blkif->vbd.overflow_max_grants)) { | ||
350 | return; | ||
351 | } | ||
352 | |||
353 | if (work_pending(&blkif->persistent_purge_work)) { | ||
354 | pr_alert_ratelimited(DRV_PFX "Scheduled work from previous purge is still pending, cannot purge list\n"); | ||
355 | return; | ||
356 | } | ||
357 | |||
358 | num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN; | ||
359 | num_clean = blkif->persistent_gnt_c - xen_blkif_max_pgrants + num_clean; | ||
360 | num_clean = min(blkif->persistent_gnt_c, num_clean); | ||
361 | if ((num_clean == 0) || | ||
362 | (num_clean > (blkif->persistent_gnt_c - atomic_read(&blkif->persistent_gnt_in_use)))) | ||
363 | return; | ||
364 | |||
365 | /* | ||
366 | * At this point, we can be sure that there will be no calls | ||
367 | * to get_persistent_gnt (because we are executing this code from | ||
368 | * xen_blkif_schedule), there can only be calls to put_persistent_gnt, | ||
369 | * which means that the number of currently used grants will go down, | ||
370 | * but never up, so we will always be able to remove the requested | ||
371 | * number of grants. | ||
372 | */ | ||
373 | |||
374 | total = num_clean; | ||
375 | |||
376 | pr_debug(DRV_PFX "Going to purge %u persistent grants\n", num_clean); | ||
377 | |||
378 | INIT_LIST_HEAD(&blkif->persistent_purge_list); | ||
379 | root = &blkif->persistent_gnts; | ||
380 | purge_list: | ||
381 | foreach_grant_safe(persistent_gnt, n, root, node) { | ||
382 | BUG_ON(persistent_gnt->handle == | ||
383 | BLKBACK_INVALID_HANDLE); | ||
384 | |||
385 | if (clean_used) { | ||
386 | clear_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags); | ||
387 | continue; | ||
388 | } | ||
389 | |||
390 | if (test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags)) | ||
391 | continue; | ||
392 | if (!scan_used && | ||
393 | (test_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags))) | ||
394 | continue; | ||
395 | |||
396 | rb_erase(&persistent_gnt->node, root); | ||
397 | list_add(&persistent_gnt->remove_node, | ||
398 | &blkif->persistent_purge_list); | ||
399 | if (--num_clean == 0) | ||
400 | goto finished; | ||
401 | } | ||
402 | /* | ||
403 | * If we get here it means we also need to start cleaning | ||
404 | * grants that were used since the last purge in order to cope | ||
405 | * with the requested number. | ||
406 | */ | ||
407 | if (!scan_used && !clean_used) { | ||
408 | pr_debug(DRV_PFX "Still missing %u purged frames\n", num_clean); | ||
409 | scan_used = true; | ||
410 | goto purge_list; | ||
411 | } | ||
412 | finished: | ||
413 | if (!clean_used) { | ||
414 | pr_debug(DRV_PFX "Finished scanning for grants to clean, removing used flag\n"); | ||
415 | clean_used = true; | ||
416 | goto purge_list; | ||
417 | } | ||
418 | |||
419 | blkif->persistent_gnt_c -= (total - num_clean); | ||
420 | blkif->vbd.overflow_max_grants = 0; | ||
421 | |||
422 | /* We can defer this work */ | ||
423 | INIT_WORK(&blkif->persistent_purge_work, unmap_purged_grants); | ||
424 | schedule_work(&blkif->persistent_purge_work); | ||
425 | pr_debug(DRV_PFX "Purged %u/%u\n", (total - num_clean), total); | ||
426 | return; | ||
427 | } | ||
428 | |||
254 | /* | 429 | /* |
255 | * Retrieve from the 'pending_reqs' a free pending_req structure to be used. | 430 | * Retrieve from the 'pending_reqs' a free pending_req structure to be used. |
256 | */ | 431 | */ |
257 | static struct pending_req *alloc_req(void) | 432 | static struct pending_req *alloc_req(struct xen_blkif *blkif) |
258 | { | 433 | { |
259 | struct pending_req *req = NULL; | 434 | struct pending_req *req = NULL; |
260 | unsigned long flags; | 435 | unsigned long flags; |
261 | 436 | ||
262 | spin_lock_irqsave(&blkbk->pending_free_lock, flags); | 437 | spin_lock_irqsave(&blkif->pending_free_lock, flags); |
263 | if (!list_empty(&blkbk->pending_free)) { | 438 | if (!list_empty(&blkif->pending_free)) { |
264 | req = list_entry(blkbk->pending_free.next, struct pending_req, | 439 | req = list_entry(blkif->pending_free.next, struct pending_req, |
265 | free_list); | 440 | free_list); |
266 | list_del(&req->free_list); | 441 | list_del(&req->free_list); |
267 | } | 442 | } |
268 | spin_unlock_irqrestore(&blkbk->pending_free_lock, flags); | 443 | spin_unlock_irqrestore(&blkif->pending_free_lock, flags); |
269 | return req; | 444 | return req; |
270 | } | 445 | } |
271 | 446 | ||
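The quota arithmetic above is easy to miss: each purge removes LRU_PERCENT_CLEAN percent of xen_blkif_max_pgrants plus whatever the tree has grown past the limit, clamped so it never exceeds the grants that are currently idle. A small standalone sketch of that calculation under the defaults shown in this file (1056 grants, 5 percent), for illustration only.

#include <stdio.h>

/* Quota in the spirit of purge_persistent_gnt(): integer math, so
 * (max / 100) * percent, plus the overflow above the limit, clamped
 * against what is actually removable (idle grants). */
static unsigned int purge_quota(unsigned int total, unsigned int in_use,
				unsigned int max, unsigned int percent)
{
	unsigned int num_clean = (max / 100) * percent;

	num_clean += (total > max) ? total - max : 0;
	if (num_clean > total)
		num_clean = total;
	if (num_clean > total - in_use)   /* can't purge grants in flight */
		return 0;
	return num_clean;
}

int main(void)
{
	/* With the module defaults: 1056 cached grants, all idle, 5% per run. */
	printf("purge %u grants per run\n", purge_quota(1056, 0, 1056, 5));
	/* (1056 / 100) * 5 = 50 grants removed per LRU pass. */
	return 0;
}

With those defaults an overflowing cache is trimmed by about 50 grants every LRU_INTERVAL of 100ms, so a full sweep of the list takes on the order of a couple of seconds.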
@@ -273,17 +448,17 @@ static struct pending_req *alloc_req(void) | |||
273 | * Return the 'pending_req' structure back to the freepool. We also | 448 | * Return the 'pending_req' structure back to the freepool. We also |
274 | * wake up the thread if it was waiting for a free page. | 449 | * wake up the thread if it was waiting for a free page. |
275 | */ | 450 | */ |
276 | static void free_req(struct pending_req *req) | 451 | static void free_req(struct xen_blkif *blkif, struct pending_req *req) |
277 | { | 452 | { |
278 | unsigned long flags; | 453 | unsigned long flags; |
279 | int was_empty; | 454 | int was_empty; |
280 | 455 | ||
281 | spin_lock_irqsave(&blkbk->pending_free_lock, flags); | 456 | spin_lock_irqsave(&blkif->pending_free_lock, flags); |
282 | was_empty = list_empty(&blkbk->pending_free); | 457 | was_empty = list_empty(&blkif->pending_free); |
283 | list_add(&req->free_list, &blkbk->pending_free); | 458 | list_add(&req->free_list, &blkif->pending_free); |
284 | spin_unlock_irqrestore(&blkbk->pending_free_lock, flags); | 459 | spin_unlock_irqrestore(&blkif->pending_free_lock, flags); |
285 | if (was_empty) | 460 | if (was_empty) |
286 | wake_up(&blkbk->pending_free_wq); | 461 | wake_up(&blkif->pending_free_wq); |
287 | } | 462 | } |
288 | 463 | ||
289 | /* | 464 | /* |
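alloc_req() and free_req() above form a classic fixed-pool allocator: a per-backend list of preallocated pending_req structures guarded by a spinlock, with the wait queue poked only when the pool goes from empty to non-empty. A small userspace sketch of that pattern, with a mutex and condition variable in place of the spinlock and wait queue; the names are illustrative.

#include <pthread.h>
#include <stdio.h>

#define NUM_REQS 4            /* stands in for XEN_BLKIF_REQS */

struct req { struct req *next; };

static struct req reqs[NUM_REQS];
static struct req *free_head;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t not_empty = PTHREAD_COND_INITIALIZER;

static void pool_init(void)
{
	for (int i = 0; i < NUM_REQS; i++) {
		reqs[i].next = free_head;
		free_head = &reqs[i];
	}
}

/* Like alloc_req(): return NULL when the pool is exhausted; the caller
 * then sleeps (pending_free_wq) until a request is freed. */
static struct req *req_alloc(void)
{
	struct req *r;

	pthread_mutex_lock(&lock);
	r = free_head;
	if (r)
		free_head = r->next;
	pthread_mutex_unlock(&lock);
	return r;
}

/* Like free_req(): only wake waiters when the list was empty before. */
static void req_free(struct req *r)
{
	int was_empty;

	pthread_mutex_lock(&lock);
	was_empty = (free_head == NULL);
	r->next = free_head;
	free_head = r;
	pthread_mutex_unlock(&lock);
	if (was_empty)
		pthread_cond_broadcast(&not_empty);
}

int main(void)
{
	pool_init();
	struct req *r = req_alloc();
	printf("got request slot: %s\n", r ? "yes" : "no");
	req_free(r);
	return 0;
}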
@@ -382,10 +557,12 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id) | |||
382 | static void print_stats(struct xen_blkif *blkif) | 557 | static void print_stats(struct xen_blkif *blkif) |
383 | { | 558 | { |
384 | pr_info("xen-blkback (%s): oo %3llu | rd %4llu | wr %4llu | f %4llu" | 559 | pr_info("xen-blkback (%s): oo %3llu | rd %4llu | wr %4llu | f %4llu" |
385 | " | ds %4llu\n", | 560 | " | ds %4llu | pg: %4u/%4d\n", |
386 | current->comm, blkif->st_oo_req, | 561 | current->comm, blkif->st_oo_req, |
387 | blkif->st_rd_req, blkif->st_wr_req, | 562 | blkif->st_rd_req, blkif->st_wr_req, |
388 | blkif->st_f_req, blkif->st_ds_req); | 563 | blkif->st_f_req, blkif->st_ds_req, |
564 | blkif->persistent_gnt_c, | ||
565 | xen_blkif_max_pgrants); | ||
389 | blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); | 566 | blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); |
390 | blkif->st_rd_req = 0; | 567 | blkif->st_rd_req = 0; |
391 | blkif->st_wr_req = 0; | 568 | blkif->st_wr_req = 0; |
@@ -397,6 +574,8 @@ int xen_blkif_schedule(void *arg) | |||
397 | { | 574 | { |
398 | struct xen_blkif *blkif = arg; | 575 | struct xen_blkif *blkif = arg; |
399 | struct xen_vbd *vbd = &blkif->vbd; | 576 | struct xen_vbd *vbd = &blkif->vbd; |
577 | unsigned long timeout; | ||
578 | int ret; | ||
400 | 579 | ||
401 | xen_blkif_get(blkif); | 580 | xen_blkif_get(blkif); |
402 | 581 | ||
@@ -406,27 +585,52 @@ int xen_blkif_schedule(void *arg) | |||
406 | if (unlikely(vbd->size != vbd_sz(vbd))) | 585 | if (unlikely(vbd->size != vbd_sz(vbd))) |
407 | xen_vbd_resize(blkif); | 586 | xen_vbd_resize(blkif); |
408 | 587 | ||
409 | wait_event_interruptible( | 588 | timeout = msecs_to_jiffies(LRU_INTERVAL); |
589 | |||
590 | timeout = wait_event_interruptible_timeout( | ||
410 | blkif->wq, | 591 | blkif->wq, |
411 | blkif->waiting_reqs || kthread_should_stop()); | 592 | blkif->waiting_reqs || kthread_should_stop(), |
412 | wait_event_interruptible( | 593 | timeout); |
413 | blkbk->pending_free_wq, | 594 | if (timeout == 0) |
414 | !list_empty(&blkbk->pending_free) || | 595 | goto purge_gnt_list; |
415 | kthread_should_stop()); | 596 | timeout = wait_event_interruptible_timeout( |
597 | blkif->pending_free_wq, | ||
598 | !list_empty(&blkif->pending_free) || | ||
599 | kthread_should_stop(), | ||
600 | timeout); | ||
601 | if (timeout == 0) | ||
602 | goto purge_gnt_list; | ||
416 | 603 | ||
417 | blkif->waiting_reqs = 0; | 604 | blkif->waiting_reqs = 0; |
418 | smp_mb(); /* clear flag *before* checking for work */ | 605 | smp_mb(); /* clear flag *before* checking for work */ |
419 | 606 | ||
420 | if (do_block_io_op(blkif)) | 607 | ret = do_block_io_op(blkif); |
608 | if (ret > 0) | ||
421 | blkif->waiting_reqs = 1; | 609 | blkif->waiting_reqs = 1; |
610 | if (ret == -EACCES) | ||
611 | wait_event_interruptible(blkif->shutdown_wq, | ||
612 | kthread_should_stop()); | ||
613 | |||
614 | purge_gnt_list: | ||
615 | if (blkif->vbd.feature_gnt_persistent && | ||
616 | time_after(jiffies, blkif->next_lru)) { | ||
617 | purge_persistent_gnt(blkif); | ||
618 | blkif->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL); | ||
619 | } | ||
620 | |||
621 | /* Shrink if we have more than xen_blkif_max_buffer_pages */ | ||
622 | shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages); | ||
422 | 623 | ||
423 | if (log_stats && time_after(jiffies, blkif->st_print)) | 624 | if (log_stats && time_after(jiffies, blkif->st_print)) |
424 | print_stats(blkif); | 625 | print_stats(blkif); |
425 | } | 626 | } |
426 | 627 | ||
628 | /* Since we are shutting down remove all pages from the buffer */ | ||
629 | shrink_free_pagepool(blkif, 0 /* All */); | ||
630 | |||
427 | /* Free all persistent grant pages */ | 631 | /* Free all persistent grant pages */ |
428 | if (!RB_EMPTY_ROOT(&blkif->persistent_gnts)) | 632 | if (!RB_EMPTY_ROOT(&blkif->persistent_gnts)) |
429 | free_persistent_gnts(&blkif->persistent_gnts, | 633 | free_persistent_gnts(blkif, &blkif->persistent_gnts, |
430 | blkif->persistent_gnt_c); | 634 | blkif->persistent_gnt_c); |
431 | 635 | ||
432 | BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); | 636 | BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); |
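The reworked loop above swaps the plain waits for timed waits so that, even with no frontend activity, control comes back at least every LRU_INTERVAL and the purge and pool-shrink housekeeping still runs. A stripped-down userspace model of that cadence, with nanosleep() standing in for the timed wait_event calls; everything here is illustrative.

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

#define LRU_INTERVAL_MS 100

static long now_ms(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000L + ts.tv_nsec / 1000000L;
}

/* Pretend to wait for work for at most 'ms' milliseconds. */
static bool wait_for_work(long ms)
{
	struct timespec ts = { ms / 1000, (ms % 1000) * 1000000L };

	nanosleep(&ts, NULL);   /* in the driver this is a timed wait_event */
	return false;           /* no requests arrive in this toy model */
}

int main(void)
{
	long next_lru = now_ms() + LRU_INTERVAL_MS;
	int rounds = 5;

	while (rounds--) {
		bool have_work = wait_for_work(LRU_INTERVAL_MS);

		if (have_work) {
			/* process the ring here */
		}
		/* Housekeeping runs even on timeouts, like the purge_gnt_list label. */
		if (now_ms() >= next_lru) {
			printf("purge + shrink pass at t=%ldms\n", now_ms());
			next_lru = now_ms() + LRU_INTERVAL_MS;
		}
	}
	return 0;
}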
@@ -441,148 +645,98 @@ int xen_blkif_schedule(void *arg) | |||
441 | return 0; | 645 | return 0; |
442 | } | 646 | } |
443 | 647 | ||
444 | struct seg_buf { | ||
445 | unsigned int offset; | ||
446 | unsigned int nsec; | ||
447 | }; | ||
448 | /* | 648 | /* |
449 | * Unmap the grant references, and also remove the M2P over-rides | 649 | * Unmap the grant references, and also remove the M2P over-rides |
450 | * used in the 'pending_req'. | 650 | * used in the 'pending_req'. |
451 | */ | 651 | */ |
452 | static void xen_blkbk_unmap(struct pending_req *req) | 652 | static void xen_blkbk_unmap(struct xen_blkif *blkif, |
653 | struct grant_page *pages[], | ||
654 | int num) | ||
453 | { | 655 | { |
454 | struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | 656 | struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; |
455 | struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | 657 | struct page *unmap_pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; |
456 | unsigned int i, invcount = 0; | 658 | unsigned int i, invcount = 0; |
457 | grant_handle_t handle; | ||
458 | int ret; | 659 | int ret; |
459 | 660 | ||
460 | for (i = 0; i < req->nr_pages; i++) { | 661 | for (i = 0; i < num; i++) { |
461 | if (!test_bit(i, req->unmap_seg)) | 662 | if (pages[i]->persistent_gnt != NULL) { |
663 | put_persistent_gnt(blkif, pages[i]->persistent_gnt); | ||
462 | continue; | 664 | continue; |
463 | handle = pending_handle(req, i); | 665 | } |
464 | if (handle == BLKBACK_INVALID_HANDLE) | 666 | if (pages[i]->handle == BLKBACK_INVALID_HANDLE) |
465 | continue; | 667 | continue; |
466 | gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i), | 668 | unmap_pages[invcount] = pages[i]->page; |
467 | GNTMAP_host_map, handle); | 669 | gnttab_set_unmap_op(&unmap[invcount], vaddr(pages[i]->page), |
468 | pending_handle(req, i) = BLKBACK_INVALID_HANDLE; | 670 | GNTMAP_host_map, pages[i]->handle); |
469 | pages[invcount] = virt_to_page(vaddr(req, i)); | 671 | pages[i]->handle = BLKBACK_INVALID_HANDLE; |
470 | invcount++; | 672 | if (++invcount == BLKIF_MAX_SEGMENTS_PER_REQUEST) { |
673 | ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, | ||
674 | invcount); | ||
675 | BUG_ON(ret); | ||
676 | put_free_pages(blkif, unmap_pages, invcount); | ||
677 | invcount = 0; | ||
678 | } | ||
679 | } | ||
680 | if (invcount) { | ||
681 | ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount); | ||
682 | BUG_ON(ret); | ||
683 | put_free_pages(blkif, unmap_pages, invcount); | ||
471 | } | 684 | } |
472 | |||
473 | ret = gnttab_unmap_refs(unmap, NULL, pages, invcount); | ||
474 | BUG_ON(ret); | ||
475 | } | 685 | } |
476 | 686 | ||
477 | static int xen_blkbk_map(struct blkif_request *req, | 687 | static int xen_blkbk_map(struct xen_blkif *blkif, |
478 | struct pending_req *pending_req, | 688 | struct grant_page *pages[], |
479 | struct seg_buf seg[], | 689 | int num, bool ro) |
480 | struct page *pages[]) | ||
481 | { | 690 | { |
482 | struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | 691 | struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; |
483 | struct persistent_gnt *persistent_gnts[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | ||
484 | struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | 692 | struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST]; |
485 | struct persistent_gnt *persistent_gnt = NULL; | 693 | struct persistent_gnt *persistent_gnt = NULL; |
486 | struct xen_blkif *blkif = pending_req->blkif; | ||
487 | phys_addr_t addr = 0; | 694 | phys_addr_t addr = 0; |
488 | int i, j; | 695 | int i, seg_idx, new_map_idx; |
489 | bool new_map; | ||
490 | int nseg = req->u.rw.nr_segments; | ||
491 | int segs_to_map = 0; | 696 | int segs_to_map = 0; |
492 | int ret = 0; | 697 | int ret = 0; |
698 | int last_map = 0, map_until = 0; | ||
493 | int use_persistent_gnts; | 699 | int use_persistent_gnts; |
494 | 700 | ||
495 | use_persistent_gnts = (blkif->vbd.feature_gnt_persistent); | 701 | use_persistent_gnts = (blkif->vbd.feature_gnt_persistent); |
496 | 702 | ||
497 | BUG_ON(blkif->persistent_gnt_c > | ||
498 | max_mapped_grant_pages(pending_req->blkif->blk_protocol)); | ||
499 | |||
500 | /* | 703 | /* |
501 | * Fill out preq.nr_sects with proper amount of sectors, and setup | 704 | * Fill out preq.nr_sects with proper amount of sectors, and setup |
502 | * assign map[..] with the PFN of the page in our domain with the | 705 | * assign map[..] with the PFN of the page in our domain with the |
503 | * corresponding grant reference for each page. | 706 | * corresponding grant reference for each page. |
504 | */ | 707 | */ |
505 | for (i = 0; i < nseg; i++) { | 708 | again: |
709 | for (i = map_until; i < num; i++) { | ||
506 | uint32_t flags; | 710 | uint32_t flags; |
507 | 711 | ||
508 | if (use_persistent_gnts) | 712 | if (use_persistent_gnts) |
509 | persistent_gnt = get_persistent_gnt( | 713 | persistent_gnt = get_persistent_gnt( |
510 | &blkif->persistent_gnts, | 714 | blkif, |
511 | req->u.rw.seg[i].gref); | 715 | pages[i]->gref); |
512 | 716 | ||
513 | if (persistent_gnt) { | 717 | if (persistent_gnt) { |
514 | /* | 718 | /* |
515 | * We are using persistent grants and | 719 | * We are using persistent grants and |
516 | * the grant is already mapped | 720 | * the grant is already mapped |
517 | */ | 721 | */ |
518 | new_map = false; | 722 | pages[i]->page = persistent_gnt->page; |
519 | } else if (use_persistent_gnts && | 723 | pages[i]->persistent_gnt = persistent_gnt; |
520 | blkif->persistent_gnt_c < | ||
521 | max_mapped_grant_pages(blkif->blk_protocol)) { | ||
522 | /* | ||
523 | * We are using persistent grants, the grant is | ||
524 | * not mapped but we have room for it | ||
525 | */ | ||
526 | new_map = true; | ||
527 | persistent_gnt = kmalloc( | ||
528 | sizeof(struct persistent_gnt), | ||
529 | GFP_KERNEL); | ||
530 | if (!persistent_gnt) | ||
531 | return -ENOMEM; | ||
532 | if (alloc_xenballooned_pages(1, &persistent_gnt->page, | ||
533 | false)) { | ||
534 | kfree(persistent_gnt); | ||
535 | return -ENOMEM; | ||
536 | } | ||
537 | persistent_gnt->gnt = req->u.rw.seg[i].gref; | ||
538 | persistent_gnt->handle = BLKBACK_INVALID_HANDLE; | ||
539 | |||
540 | pages_to_gnt[segs_to_map] = | ||
541 | persistent_gnt->page; | ||
542 | addr = (unsigned long) pfn_to_kaddr( | ||
543 | page_to_pfn(persistent_gnt->page)); | ||
544 | |||
545 | add_persistent_gnt(&blkif->persistent_gnts, | ||
546 | persistent_gnt); | ||
547 | blkif->persistent_gnt_c++; | ||
548 | pr_debug(DRV_PFX " grant %u added to the tree of persistent grants, using %u/%u\n", | ||
549 | persistent_gnt->gnt, blkif->persistent_gnt_c, | ||
550 | max_mapped_grant_pages(blkif->blk_protocol)); | ||
551 | } else { | 724 | } else { |
552 | /* | 725 | if (get_free_page(blkif, &pages[i]->page)) |
553 | * We are either using persistent grants and | 726 | goto out_of_memory; |
554 | * hit the maximum limit of grants mapped, | 727 | addr = vaddr(pages[i]->page); |
555 | * or we are not using persistent grants. | 728 | pages_to_gnt[segs_to_map] = pages[i]->page; |
556 | */ | 729 | pages[i]->persistent_gnt = NULL; |
557 | if (use_persistent_gnts && | ||
558 | !blkif->vbd.overflow_max_grants) { | ||
559 | blkif->vbd.overflow_max_grants = 1; | ||
560 | pr_alert(DRV_PFX " domain %u, device %#x is using maximum number of persistent grants\n", | ||
561 | blkif->domid, blkif->vbd.handle); | ||
562 | } | ||
563 | new_map = true; | ||
564 | pages[i] = blkbk->pending_page(pending_req, i); | ||
565 | addr = vaddr(pending_req, i); | ||
566 | pages_to_gnt[segs_to_map] = | ||
567 | blkbk->pending_page(pending_req, i); | ||
568 | } | ||
569 | |||
570 | if (persistent_gnt) { | ||
571 | pages[i] = persistent_gnt->page; | ||
572 | persistent_gnts[i] = persistent_gnt; | ||
573 | } else { | ||
574 | persistent_gnts[i] = NULL; | ||
575 | } | ||
576 | |||
577 | if (new_map) { | ||
578 | flags = GNTMAP_host_map; | 730 | flags = GNTMAP_host_map; |
579 | if (!persistent_gnt && | 731 | if (!use_persistent_gnts && ro) |
580 | (pending_req->operation != BLKIF_OP_READ)) | ||
581 | flags |= GNTMAP_readonly; | 732 | flags |= GNTMAP_readonly; |
582 | gnttab_set_map_op(&map[segs_to_map++], addr, | 733 | gnttab_set_map_op(&map[segs_to_map++], addr, |
583 | flags, req->u.rw.seg[i].gref, | 734 | flags, pages[i]->gref, |
584 | blkif->domid); | 735 | blkif->domid); |
585 | } | 736 | } |
737 | map_until = i + 1; | ||
738 | if (segs_to_map == BLKIF_MAX_SEGMENTS_PER_REQUEST) | ||
739 | break; | ||
586 | } | 740 | } |
587 | 741 | ||
588 | if (segs_to_map) { | 742 | if (segs_to_map) { |
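Because an indirect request can carry far more segments than one grant-table operation accepts, xen_blkbk_map() above walks the segment array in windows of BLKIF_MAX_SEGMENTS_PER_REQUEST, tracking last_map/map_until and jumping back to the again label until everything is mapped. The skeleton of that two-index batching loop over a plain array, as a sketch rather than the driver code.

#include <stdio.h>

#define BATCH 11   /* BLKIF_MAX_SEGMENTS_PER_REQUEST */

/* Process 'num' segments, at most BATCH per pass, the way xen_blkbk_map()
 * batches its gnttab_map_refs() calls. */
static void map_all(int num)
{
	int last_map = 0, map_until = 0;

again:
	for (int i = map_until; i < num; i++) {
		/* ...queue segment i into the current batch... */
		map_until = i + 1;
		if (map_until - last_map == BATCH)
			break;          /* batch is full, flush it */
	}

	printf("flush batch: segments [%d, %d)\n", last_map, map_until);
	/* ...issue one mapping call for this window, fix up the results... */

	last_map = map_until;
	if (map_until != num)
		goto again;
}

int main(void)
{
	map_all(25);   /* e.g. an indirect request with 25 segments -> 11+11+3 */
	return 0;
}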
@@ -595,49 +749,133 @@ static int xen_blkbk_map(struct blkif_request *req, | |||
595 | * so that when we access vaddr(pending_req,i) it has the contents of | 749 | * so that when we access vaddr(pending_req,i) it has the contents of |
596 | * the page from the other domain. | 750 | * the page from the other domain. |
597 | */ | 751 | */ |
598 | bitmap_zero(pending_req->unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST); | 752 | for (seg_idx = last_map, new_map_idx = 0; seg_idx < map_until; seg_idx++) { |
599 | for (i = 0, j = 0; i < nseg; i++) { | 753 | if (!pages[seg_idx]->persistent_gnt) { |
600 | if (!persistent_gnts[i] || | ||
601 | persistent_gnts[i]->handle == BLKBACK_INVALID_HANDLE) { | ||
602 | /* This is a newly mapped grant */ | 754 | /* This is a newly mapped grant */ |
603 | BUG_ON(j >= segs_to_map); | 755 | BUG_ON(new_map_idx >= segs_to_map); |
604 | if (unlikely(map[j].status != 0)) { | 756 | if (unlikely(map[new_map_idx].status != 0)) { |
605 | pr_debug(DRV_PFX "invalid buffer -- could not remap it\n"); | 757 | pr_debug(DRV_PFX "invalid buffer -- could not remap it\n"); |
606 | map[j].handle = BLKBACK_INVALID_HANDLE; | 758 | pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE; |
607 | ret |= 1; | 759 | ret |= 1; |
608 | if (persistent_gnts[i]) { | 760 | goto next; |
609 | rb_erase(&persistent_gnts[i]->node, | ||
610 | &blkif->persistent_gnts); | ||
611 | blkif->persistent_gnt_c--; | ||
612 | kfree(persistent_gnts[i]); | ||
613 | persistent_gnts[i] = NULL; | ||
614 | } | ||
615 | } | 761 | } |
762 | pages[seg_idx]->handle = map[new_map_idx].handle; | ||
763 | } else { | ||
764 | continue; | ||
616 | } | 765 | } |
617 | if (persistent_gnts[i]) { | 766 | if (use_persistent_gnts && |
618 | if (persistent_gnts[i]->handle == | 767 | blkif->persistent_gnt_c < xen_blkif_max_pgrants) { |
619 | BLKBACK_INVALID_HANDLE) { | 768 | /* |
769 | * We are using persistent grants, the grant is | ||
770 | * not mapped but we might have room for it. | ||
771 | */ | ||
772 | persistent_gnt = kmalloc(sizeof(struct persistent_gnt), | ||
773 | GFP_KERNEL); | ||
774 | if (!persistent_gnt) { | ||
620 | /* | 775 | /* |
621 | * If this is a new persistent grant | 776 | * If we don't have enough memory to |
622 | * save the handler | 777 | * allocate the persistent_gnt struct, |
778 | * map this grant non-persistently | ||
623 | */ | 779 | */ |
624 | persistent_gnts[i]->handle = map[j++].handle; | 780 | goto next; |
625 | } | 781 | } |
626 | pending_handle(pending_req, i) = | 782 | persistent_gnt->gnt = map[new_map_idx].ref; |
627 | persistent_gnts[i]->handle; | 783 | persistent_gnt->handle = map[new_map_idx].handle; |
784 | persistent_gnt->page = pages[seg_idx]->page; | ||
785 | if (add_persistent_gnt(blkif, | ||
786 | persistent_gnt)) { | ||
787 | kfree(persistent_gnt); | ||
788 | persistent_gnt = NULL; | ||
789 | goto next; | ||
790 | } | ||
791 | pages[seg_idx]->persistent_gnt = persistent_gnt; | ||
792 | pr_debug(DRV_PFX " grant %u added to the tree of persistent grants, using %u/%u\n", | ||
793 | persistent_gnt->gnt, blkif->persistent_gnt_c, | ||
794 | xen_blkif_max_pgrants); | ||
795 | goto next; | ||
796 | } | ||
797 | if (use_persistent_gnts && !blkif->vbd.overflow_max_grants) { | ||
798 | blkif->vbd.overflow_max_grants = 1; | ||
799 | pr_debug(DRV_PFX " domain %u, device %#x is using maximum number of persistent grants\n", | ||
800 | blkif->domid, blkif->vbd.handle); | ||
801 | } | ||
802 | /* | ||
803 | * We could not map this grant persistently, so use it as | ||
804 | * a non-persistent grant. | ||
805 | */ | ||
806 | next: | ||
807 | new_map_idx++; | ||
808 | } | ||
809 | segs_to_map = 0; | ||
810 | last_map = map_until; | ||
811 | if (map_until != num) | ||
812 | goto again; | ||
628 | 813 | ||
629 | if (ret) | 814 | return ret; |
630 | continue; | 815 | |
631 | } else { | 816 | out_of_memory: |
632 | pending_handle(pending_req, i) = map[j++].handle; | 817 | pr_alert(DRV_PFX "%s: out of memory\n", __func__); |
633 | bitmap_set(pending_req->unmap_seg, i, 1); | 818 | put_free_pages(blkif, pages_to_gnt, segs_to_map); |
819 | return -ENOMEM; | ||
820 | } | ||
821 | |||
822 | static int xen_blkbk_map_seg(struct pending_req *pending_req) | ||
823 | { | ||
824 | int rc; | ||
825 | |||
826 | rc = xen_blkbk_map(pending_req->blkif, pending_req->segments, | ||
827 | pending_req->nr_pages, | ||
828 | (pending_req->operation != BLKIF_OP_READ)); | ||
829 | |||
830 | return rc; | ||
831 | } | ||
634 | 832 | ||
635 | if (ret) | 833 | static int xen_blkbk_parse_indirect(struct blkif_request *req, |
636 | continue; | 834 | struct pending_req *pending_req, |
835 | struct seg_buf seg[], | ||
836 | struct phys_req *preq) | ||
837 | { | ||
838 | struct grant_page **pages = pending_req->indirect_pages; | ||
839 | struct xen_blkif *blkif = pending_req->blkif; | ||
840 | int indirect_grefs, rc, n, nseg, i; | ||
841 | struct blkif_request_segment_aligned *segments = NULL; | ||
842 | |||
843 | nseg = pending_req->nr_pages; | ||
844 | indirect_grefs = INDIRECT_PAGES(nseg); | ||
845 | BUG_ON(indirect_grefs > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST); | ||
846 | |||
847 | for (i = 0; i < indirect_grefs; i++) | ||
848 | pages[i]->gref = req->u.indirect.indirect_grefs[i]; | ||
849 | |||
850 | rc = xen_blkbk_map(blkif, pages, indirect_grefs, true); | ||
851 | if (rc) | ||
852 | goto unmap; | ||
853 | |||
854 | for (n = 0, i = 0; n < nseg; n++) { | ||
855 | if ((n % SEGS_PER_INDIRECT_FRAME) == 0) { | ||
856 | /* Map indirect segments */ | ||
857 | if (segments) | ||
858 | kunmap_atomic(segments); | ||
859 | segments = kmap_atomic(pages[n/SEGS_PER_INDIRECT_FRAME]->page); | ||
860 | } | ||
861 | i = n % SEGS_PER_INDIRECT_FRAME; | ||
862 | pending_req->segments[n]->gref = segments[i].gref; | ||
863 | seg[n].nsec = segments[i].last_sect - | ||
864 | segments[i].first_sect + 1; | ||
865 | seg[n].offset = (segments[i].first_sect << 9); | ||
866 | if ((segments[i].last_sect >= (PAGE_SIZE >> 9)) || | ||
867 | (segments[i].last_sect < segments[i].first_sect)) { | ||
868 | rc = -EINVAL; | ||
869 | goto unmap; | ||
637 | } | 870 | } |
638 | seg[i].offset = (req->u.rw.seg[i].first_sect << 9); | 871 | preq->nr_sects += seg[n].nsec; |
639 | } | 872 | } |
640 | return ret; | 873 | |
874 | unmap: | ||
875 | if (segments) | ||
876 | kunmap_atomic(segments); | ||
877 | xen_blkbk_unmap(blkif, pages, indirect_grefs); | ||
878 | return rc; | ||
641 | } | 879 | } |
642 | 880 | ||
643 | static int dispatch_discard_io(struct xen_blkif *blkif, | 881 | static int dispatch_discard_io(struct xen_blkif *blkif, |
@@ -647,7 +885,18 @@ static int dispatch_discard_io(struct xen_blkif *blkif, | |||
647 | int status = BLKIF_RSP_OKAY; | 885 | int status = BLKIF_RSP_OKAY; |
648 | struct block_device *bdev = blkif->vbd.bdev; | 886 | struct block_device *bdev = blkif->vbd.bdev; |
649 | unsigned long secure; | 887 | unsigned long secure; |
888 | struct phys_req preq; | ||
889 | |||
890 | preq.sector_number = req->u.discard.sector_number; | ||
891 | preq.nr_sects = req->u.discard.nr_sectors; | ||
650 | 892 | ||
893 | err = xen_vbd_translate(&preq, blkif, WRITE); | ||
894 | if (err) { | ||
895 | pr_warn(DRV_PFX "access denied: DISCARD [%llu->%llu] on dev=%04x\n", | ||
896 | preq.sector_number, | ||
897 | preq.sector_number + preq.nr_sects, blkif->vbd.pdevice); | ||
898 | goto fail_response; | ||
899 | } | ||
651 | blkif->st_ds_req++; | 900 | blkif->st_ds_req++; |
652 | 901 | ||
653 | xen_blkif_get(blkif); | 902 | xen_blkif_get(blkif); |
@@ -658,7 +907,7 @@ static int dispatch_discard_io(struct xen_blkif *blkif, | |||
658 | err = blkdev_issue_discard(bdev, req->u.discard.sector_number, | 907 | err = blkdev_issue_discard(bdev, req->u.discard.sector_number, |
659 | req->u.discard.nr_sectors, | 908 | req->u.discard.nr_sectors, |
660 | GFP_KERNEL, secure); | 909 | GFP_KERNEL, secure); |
661 | 910 | fail_response: | |
662 | if (err == -EOPNOTSUPP) { | 911 | if (err == -EOPNOTSUPP) { |
663 | pr_debug(DRV_PFX "discard op failed, not supported\n"); | 912 | pr_debug(DRV_PFX "discard op failed, not supported\n"); |
664 | status = BLKIF_RSP_EOPNOTSUPP; | 913 | status = BLKIF_RSP_EOPNOTSUPP; |
@@ -674,7 +923,7 @@ static int dispatch_other_io(struct xen_blkif *blkif, | |||
674 | struct blkif_request *req, | 923 | struct blkif_request *req, |
675 | struct pending_req *pending_req) | 924 | struct pending_req *pending_req) |
676 | { | 925 | { |
677 | free_req(pending_req); | 926 | free_req(blkif, pending_req); |
678 | make_response(blkif, req->u.other.id, req->operation, | 927 | make_response(blkif, req->u.other.id, req->operation, |
679 | BLKIF_RSP_EOPNOTSUPP); | 928 | BLKIF_RSP_EOPNOTSUPP); |
680 | return -EIO; | 929 | return -EIO; |
@@ -726,7 +975,9 @@ static void __end_block_io_op(struct pending_req *pending_req, int error) | |||
726 | * the proper response on the ring. | 975 | * the proper response on the ring. |
727 | */ | 976 | */ |
728 | if (atomic_dec_and_test(&pending_req->pendcnt)) { | 977 | if (atomic_dec_and_test(&pending_req->pendcnt)) { |
729 | xen_blkbk_unmap(pending_req); | 978 | xen_blkbk_unmap(pending_req->blkif, |
979 | pending_req->segments, | ||
980 | pending_req->nr_pages); | ||
730 | make_response(pending_req->blkif, pending_req->id, | 981 | make_response(pending_req->blkif, pending_req->id, |
731 | pending_req->operation, pending_req->status); | 982 | pending_req->operation, pending_req->status); |
732 | xen_blkif_put(pending_req->blkif); | 983 | xen_blkif_put(pending_req->blkif); |
@@ -734,7 +985,7 @@ static void __end_block_io_op(struct pending_req *pending_req, int error) | |||
734 | if (atomic_read(&pending_req->blkif->drain)) | 985 | if (atomic_read(&pending_req->blkif->drain)) |
735 | complete(&pending_req->blkif->drain_complete); | 986 | complete(&pending_req->blkif->drain_complete); |
736 | } | 987 | } |
737 | free_req(pending_req); | 988 | free_req(pending_req->blkif, pending_req); |
738 | } | 989 | } |
739 | } | 990 | } |
740 | 991 | ||
@@ -767,6 +1018,12 @@ __do_block_io_op(struct xen_blkif *blkif) | |||
767 | rp = blk_rings->common.sring->req_prod; | 1018 | rp = blk_rings->common.sring->req_prod; |
768 | rmb(); /* Ensure we see queued requests up to 'rp'. */ | 1019 | rmb(); /* Ensure we see queued requests up to 'rp'. */ |
769 | 1020 | ||
1021 | if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) { | ||
1022 | rc = blk_rings->common.rsp_prod_pvt; | ||
1023 | pr_warn(DRV_PFX "Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n", | ||
1024 | rp, rc, rp - rc, blkif->vbd.pdevice); | ||
1025 | return -EACCES; | ||
1026 | } | ||
770 | while (rc != rp) { | 1027 | while (rc != rp) { |
771 | 1028 | ||
772 | if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc)) | 1029 | if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc)) |
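The new -EACCES path above rejects a frontend whose published request-producer index sits further ahead of the backend's response producer than the ring can hold; after that the indices cannot be trusted and the ring is abandoned. A tiny model of that distance check with free-running unsigned indices; the ring size and names are illustrative.

#include <stdbool.h>
#include <stdio.h>

#define RING_SIZE 32u          /* requests a ring of this size can hold */

/* Indices are free-running and wrap; the distance test still works in
 * unsigned arithmetic for any realistic distance. */
static bool prod_is_bogus(unsigned int req_prod, unsigned int rsp_prod_pvt)
{
	return (req_prod - rsp_prod_pvt) > RING_SIZE;
}

int main(void)
{
	/* Sane: the frontend is at most one full ring ahead of us. */
	printf("ahead by 32: %s\n", prod_is_bogus(132, 100) ? "bogus" : "ok");
	/* Bogus: claims more outstanding requests than slots exist. */
	printf("ahead by 33: %s\n", prod_is_bogus(133, 100) ? "bogus" : "ok");
	return 0;
}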
@@ -777,7 +1034,7 @@ __do_block_io_op(struct xen_blkif *blkif) | |||
777 | break; | 1034 | break; |
778 | } | 1035 | } |
779 | 1036 | ||
780 | pending_req = alloc_req(); | 1037 | pending_req = alloc_req(blkif); |
781 | if (NULL == pending_req) { | 1038 | if (NULL == pending_req) { |
782 | blkif->st_oo_req++; | 1039 | blkif->st_oo_req++; |
783 | more_to_do = 1; | 1040 | more_to_do = 1; |
@@ -807,11 +1064,12 @@ __do_block_io_op(struct xen_blkif *blkif) | |||
807 | case BLKIF_OP_WRITE: | 1064 | case BLKIF_OP_WRITE: |
808 | case BLKIF_OP_WRITE_BARRIER: | 1065 | case BLKIF_OP_WRITE_BARRIER: |
809 | case BLKIF_OP_FLUSH_DISKCACHE: | 1066 | case BLKIF_OP_FLUSH_DISKCACHE: |
1067 | case BLKIF_OP_INDIRECT: | ||
810 | if (dispatch_rw_block_io(blkif, &req, pending_req)) | 1068 | if (dispatch_rw_block_io(blkif, &req, pending_req)) |
811 | goto done; | 1069 | goto done; |
812 | break; | 1070 | break; |
813 | case BLKIF_OP_DISCARD: | 1071 | case BLKIF_OP_DISCARD: |
814 | free_req(pending_req); | 1072 | free_req(blkif, pending_req); |
815 | if (dispatch_discard_io(blkif, &req)) | 1073 | if (dispatch_discard_io(blkif, &req)) |
816 | goto done; | 1074 | goto done; |
817 | break; | 1075 | break; |
@@ -853,17 +1111,28 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, | |||
853 | struct pending_req *pending_req) | 1111 | struct pending_req *pending_req) |
854 | { | 1112 | { |
855 | struct phys_req preq; | 1113 | struct phys_req preq; |
856 | struct seg_buf seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | 1114 | struct seg_buf *seg = pending_req->seg; |
857 | unsigned int nseg; | 1115 | unsigned int nseg; |
858 | struct bio *bio = NULL; | 1116 | struct bio *bio = NULL; |
859 | struct bio *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | 1117 | struct bio **biolist = pending_req->biolist; |
860 | int i, nbio = 0; | 1118 | int i, nbio = 0; |
861 | int operation; | 1119 | int operation; |
862 | struct blk_plug plug; | 1120 | struct blk_plug plug; |
863 | bool drain = false; | 1121 | bool drain = false; |
864 | struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | 1122 | struct grant_page **pages = pending_req->segments; |
1123 | unsigned short req_operation; | ||
1124 | |||
1125 | req_operation = req->operation == BLKIF_OP_INDIRECT ? | ||
1126 | req->u.indirect.indirect_op : req->operation; | ||
1127 | if ((req->operation == BLKIF_OP_INDIRECT) && | ||
1128 | (req_operation != BLKIF_OP_READ) && | ||
1129 | (req_operation != BLKIF_OP_WRITE)) { | ||
1130 | pr_debug(DRV_PFX "Invalid indirect operation (%u)\n", | ||
1131 | req_operation); | ||
1132 | goto fail_response; | ||
1133 | } | ||
865 | 1134 | ||
866 | switch (req->operation) { | 1135 | switch (req_operation) { |
867 | case BLKIF_OP_READ: | 1136 | case BLKIF_OP_READ: |
868 | blkif->st_rd_req++; | 1137 | blkif->st_rd_req++; |
869 | operation = READ; | 1138 | operation = READ; |
@@ -885,33 +1154,47 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, | |||
885 | } | 1154 | } |
886 | 1155 | ||
887 | /* Check that the number of segments is sane. */ | 1156 | /* Check that the number of segments is sane. */ |
888 | nseg = req->u.rw.nr_segments; | 1157 | nseg = req->operation == BLKIF_OP_INDIRECT ? |
1158 | req->u.indirect.nr_segments : req->u.rw.nr_segments; | ||
889 | 1159 | ||
890 | if (unlikely(nseg == 0 && operation != WRITE_FLUSH) || | 1160 | if (unlikely(nseg == 0 && operation != WRITE_FLUSH) || |
891 | unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) { | 1161 | unlikely((req->operation != BLKIF_OP_INDIRECT) && |
1162 | (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) || | ||
1163 | unlikely((req->operation == BLKIF_OP_INDIRECT) && | ||
1164 | (nseg > MAX_INDIRECT_SEGMENTS))) { | ||
892 | pr_debug(DRV_PFX "Bad number of segments in request (%d)\n", | 1165 | pr_debug(DRV_PFX "Bad number of segments in request (%d)\n", |
893 | nseg); | 1166 | nseg); |
894 | /* Haven't submitted any bio's yet. */ | 1167 | /* Haven't submitted any bio's yet. */ |
895 | goto fail_response; | 1168 | goto fail_response; |
896 | } | 1169 | } |
897 | 1170 | ||
898 | preq.sector_number = req->u.rw.sector_number; | ||
899 | preq.nr_sects = 0; | 1171 | preq.nr_sects = 0; |
900 | 1172 | ||
901 | pending_req->blkif = blkif; | 1173 | pending_req->blkif = blkif; |
902 | pending_req->id = req->u.rw.id; | 1174 | pending_req->id = req->u.rw.id; |
903 | pending_req->operation = req->operation; | 1175 | pending_req->operation = req_operation; |
904 | pending_req->status = BLKIF_RSP_OKAY; | 1176 | pending_req->status = BLKIF_RSP_OKAY; |
905 | pending_req->nr_pages = nseg; | 1177 | pending_req->nr_pages = nseg; |
906 | 1178 | ||
907 | for (i = 0; i < nseg; i++) { | 1179 | if (req->operation != BLKIF_OP_INDIRECT) { |
908 | seg[i].nsec = req->u.rw.seg[i].last_sect - | 1180 | preq.dev = req->u.rw.handle; |
909 | req->u.rw.seg[i].first_sect + 1; | 1181 | preq.sector_number = req->u.rw.sector_number; |
910 | if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) || | 1182 | for (i = 0; i < nseg; i++) { |
911 | (req->u.rw.seg[i].last_sect < req->u.rw.seg[i].first_sect)) | 1183 | pages[i]->gref = req->u.rw.seg[i].gref; |
1184 | seg[i].nsec = req->u.rw.seg[i].last_sect - | ||
1185 | req->u.rw.seg[i].first_sect + 1; | ||
1186 | seg[i].offset = (req->u.rw.seg[i].first_sect << 9); | ||
1187 | if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) || | ||
1188 | (req->u.rw.seg[i].last_sect < | ||
1189 | req->u.rw.seg[i].first_sect)) | ||
1190 | goto fail_response; | ||
1191 | preq.nr_sects += seg[i].nsec; | ||
1192 | } | ||
1193 | } else { | ||
1194 | preq.dev = req->u.indirect.handle; | ||
1195 | preq.sector_number = req->u.indirect.sector_number; | ||
1196 | if (xen_blkbk_parse_indirect(req, pending_req, seg, &preq)) | ||
912 | goto fail_response; | 1197 | goto fail_response; |
913 | preq.nr_sects += seg[i].nsec; | ||
914 | |||
915 | } | 1198 | } |
916 | 1199 | ||
917 | if (xen_vbd_translate(&preq, blkif, operation) != 0) { | 1200 | if (xen_vbd_translate(&preq, blkif, operation) != 0) { |
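Whether the segments arrive inline in u.rw.seg[] or through indirect pages, each one passes the same test before its sectors are added to preq.nr_sects: last_sect must stay inside the page and must not precede first_sect. A minimal standalone validator capturing that rule, assuming 4 KiB pages.

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 4096u
#define SECTORS_PER_PAGE (PAGE_SIZE >> 9)    /* 8 x 512-byte sectors */

struct seg { unsigned int first_sect, last_sect; };

/* Reject a segment that runs off its page or is ordered backwards,
 * mirroring the checks done before preq.nr_sects is accumulated. */
static bool seg_ok(const struct seg *s)
{
	return s->last_sect < SECTORS_PER_PAGE &&
	       s->last_sect >= s->first_sect;
}

int main(void)
{
	struct seg good = { 0, 7 };   /* a full page: sectors 0..7 */
	struct seg bad  = { 2, 9 };   /* sector 9 does not exist in a 4 KiB page */

	printf("good: %d, bad: %d\n", seg_ok(&good), seg_ok(&bad));
	return 0;
}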
@@ -948,7 +1231,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, | |||
948 | * the hypercall to unmap the grants - that is all done in | 1231 | * the hypercall to unmap the grants - that is all done in |
949 | * xen_blkbk_unmap. | 1232 | * xen_blkbk_unmap. |
950 | */ | 1233 | */ |
951 | if (xen_blkbk_map(req, pending_req, seg, pages)) | 1234 | if (xen_blkbk_map_seg(pending_req)) |
952 | goto fail_flush; | 1235 | goto fail_flush; |
953 | 1236 | ||
954 | /* | 1237 | /* |
@@ -960,11 +1243,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, | |||
960 | for (i = 0; i < nseg; i++) { | 1243 | for (i = 0; i < nseg; i++) { |
961 | while ((bio == NULL) || | 1244 | while ((bio == NULL) || |
962 | (bio_add_page(bio, | 1245 | (bio_add_page(bio, |
963 | pages[i], | 1246 | pages[i]->page, |
964 | seg[i].nsec << 9, | 1247 | seg[i].nsec << 9, |
965 | seg[i].offset) == 0)) { | 1248 | seg[i].offset) == 0)) { |
966 | 1249 | ||
967 | bio = bio_alloc(GFP_KERNEL, nseg-i); | 1250 | int nr_iovecs = min_t(int, (nseg-i), BIO_MAX_PAGES); |
1251 | bio = bio_alloc(GFP_KERNEL, nr_iovecs); | ||
968 | if (unlikely(bio == NULL)) | 1252 | if (unlikely(bio == NULL)) |
969 | goto fail_put_bio; | 1253 | goto fail_put_bio; |
970 | 1254 | ||
@@ -1009,11 +1293,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, | |||
1009 | return 0; | 1293 | return 0; |
1010 | 1294 | ||
1011 | fail_flush: | 1295 | fail_flush: |
1012 | xen_blkbk_unmap(pending_req); | 1296 | xen_blkbk_unmap(blkif, pending_req->segments, |
1297 | pending_req->nr_pages); | ||
1013 | fail_response: | 1298 | fail_response: |
1014 | /* Haven't submitted any bio's yet. */ | 1299 | /* Haven't submitted any bio's yet. */ |
1015 | make_response(blkif, req->u.rw.id, req->operation, BLKIF_RSP_ERROR); | 1300 | make_response(blkif, req->u.rw.id, req_operation, BLKIF_RSP_ERROR); |
1016 | free_req(pending_req); | 1301 | free_req(blkif, pending_req); |
1017 | msleep(1); /* back off a bit */ | 1302 | msleep(1); /* back off a bit */ |
1018 | return -EIO; | 1303 | return -EIO; |
1019 | 1304 | ||
@@ -1070,73 +1355,20 @@ static void make_response(struct xen_blkif *blkif, u64 id, | |||
1070 | 1355 | ||
1071 | static int __init xen_blkif_init(void) | 1356 | static int __init xen_blkif_init(void) |
1072 | { | 1357 | { |
1073 | int i, mmap_pages; | ||
1074 | int rc = 0; | 1358 | int rc = 0; |
1075 | 1359 | ||
1076 | if (!xen_domain()) | 1360 | if (!xen_domain()) |
1077 | return -ENODEV; | 1361 | return -ENODEV; |
1078 | 1362 | ||
1079 | blkbk = kzalloc(sizeof(struct xen_blkbk), GFP_KERNEL); | ||
1080 | if (!blkbk) { | ||
1081 | pr_alert(DRV_PFX "%s: out of memory!\n", __func__); | ||
1082 | return -ENOMEM; | ||
1083 | } | ||
1084 | |||
1085 | mmap_pages = xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST; | ||
1086 | |||
1087 | blkbk->pending_reqs = kzalloc(sizeof(blkbk->pending_reqs[0]) * | ||
1088 | xen_blkif_reqs, GFP_KERNEL); | ||
1089 | blkbk->pending_grant_handles = kmalloc(sizeof(blkbk->pending_grant_handles[0]) * | ||
1090 | mmap_pages, GFP_KERNEL); | ||
1091 | blkbk->pending_pages = kzalloc(sizeof(blkbk->pending_pages[0]) * | ||
1092 | mmap_pages, GFP_KERNEL); | ||
1093 | |||
1094 | if (!blkbk->pending_reqs || !blkbk->pending_grant_handles || | ||
1095 | !blkbk->pending_pages) { | ||
1096 | rc = -ENOMEM; | ||
1097 | goto out_of_memory; | ||
1098 | } | ||
1099 | |||
1100 | for (i = 0; i < mmap_pages; i++) { | ||
1101 | blkbk->pending_grant_handles[i] = BLKBACK_INVALID_HANDLE; | ||
1102 | blkbk->pending_pages[i] = alloc_page(GFP_KERNEL); | ||
1103 | if (blkbk->pending_pages[i] == NULL) { | ||
1104 | rc = -ENOMEM; | ||
1105 | goto out_of_memory; | ||
1106 | } | ||
1107 | } | ||
1108 | rc = xen_blkif_interface_init(); | 1363 | rc = xen_blkif_interface_init(); |
1109 | if (rc) | 1364 | if (rc) |
1110 | goto failed_init; | 1365 | goto failed_init; |
1111 | 1366 | ||
1112 | INIT_LIST_HEAD(&blkbk->pending_free); | ||
1113 | spin_lock_init(&blkbk->pending_free_lock); | ||
1114 | init_waitqueue_head(&blkbk->pending_free_wq); | ||
1115 | |||
1116 | for (i = 0; i < xen_blkif_reqs; i++) | ||
1117 | list_add_tail(&blkbk->pending_reqs[i].free_list, | ||
1118 | &blkbk->pending_free); | ||
1119 | |||
1120 | rc = xen_blkif_xenbus_init(); | 1367 | rc = xen_blkif_xenbus_init(); |
1121 | if (rc) | 1368 | if (rc) |
1122 | goto failed_init; | 1369 | goto failed_init; |
1123 | 1370 | ||
1124 | return 0; | ||
1125 | |||
1126 | out_of_memory: | ||
1127 | pr_alert(DRV_PFX "%s: out of memory\n", __func__); | ||
1128 | failed_init: | 1371 | failed_init: |
1129 | kfree(blkbk->pending_reqs); | ||
1130 | kfree(blkbk->pending_grant_handles); | ||
1131 | if (blkbk->pending_pages) { | ||
1132 | for (i = 0; i < mmap_pages; i++) { | ||
1133 | if (blkbk->pending_pages[i]) | ||
1134 | __free_page(blkbk->pending_pages[i]); | ||
1135 | } | ||
1136 | kfree(blkbk->pending_pages); | ||
1137 | } | ||
1138 | kfree(blkbk); | ||
1139 | blkbk = NULL; | ||
1140 | return rc; | 1372 | return rc; |
1141 | } | 1373 | } |
1142 | 1374 | ||
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index 60103e2517ba..8d8807563d99 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h | |||
@@ -50,6 +50,19 @@ | |||
50 | __func__, __LINE__, ##args) | 50 | __func__, __LINE__, ##args) |
51 | 51 | ||
52 | 52 | ||
53 | /* | ||
54 | * This is the maximum number of segments that would be allowed in indirect | ||
55 | * requests. This value will also be passed to the frontend. | ||
56 | */ | ||
57 | #define MAX_INDIRECT_SEGMENTS 256 | ||
58 | |||
59 | #define SEGS_PER_INDIRECT_FRAME \ | ||
60 | (PAGE_SIZE/sizeof(struct blkif_request_segment_aligned)) | ||
61 | #define MAX_INDIRECT_PAGES \ | ||
62 | ((MAX_INDIRECT_SEGMENTS + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME) | ||
63 | #define INDIRECT_PAGES(_segs) \ | ||
64 | ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME) | ||
65 | |||
53 | /* Not a real protocol. Used to generate ring structs which contain | 66 | /* Not a real protocol. Used to generate ring structs which contain |
54 | * the elements common to all protocols only. This way we get a | 67 | * the elements common to all protocols only. This way we get a |
55 | * compiler-checkable way to use common struct elements, so we can | 68 | * compiler-checkable way to use common struct elements, so we can |
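The macros above decide how many indirect pages a request needs: SEGS_PER_INDIRECT_FRAME is how many aligned segment entries fit in one page, and INDIRECT_PAGES() rounds a segment count up to whole pages. A quick evaluation with assumed sizes (4 KiB page, 8-byte entry), matching the MAX_INDIRECT_SEGMENTS value of 256 defined here.

#include <stdio.h>

#define PAGE_SIZE 4096u
#define ENTRY_SIZE 8u    /* assumed sizeof(struct blkif_request_segment_aligned) */

#define SEGS_PER_INDIRECT_FRAME (PAGE_SIZE / ENTRY_SIZE)
#define INDIRECT_PAGES(segs) \
	(((segs) + SEGS_PER_INDIRECT_FRAME - 1) / SEGS_PER_INDIRECT_FRAME)

int main(void)
{
	/* 4096 / 8 = 512 segment entries per indirect page. */
	printf("segs per frame: %u\n", SEGS_PER_INDIRECT_FRAME);
	/* 256 backend segments therefore fit in a single indirect page. */
	printf("pages for 256 segs: %u\n", INDIRECT_PAGES(256u));
	/* A frontend asking for 600 segments would need two. */
	printf("pages for 600 segs: %u\n", INDIRECT_PAGES(600u));
	return 0;
}

In other words, with these assumed sizes a single indirect page already covers the 256-segment maximum the backend advertises.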
@@ -83,12 +96,31 @@ struct blkif_x86_32_request_other { | |||
83 | uint64_t id; /* private guest value, echoed in resp */ | 96 | uint64_t id; /* private guest value, echoed in resp */ |
84 | } __attribute__((__packed__)); | 97 | } __attribute__((__packed__)); |
85 | 98 | ||
99 | struct blkif_x86_32_request_indirect { | ||
100 | uint8_t indirect_op; | ||
101 | uint16_t nr_segments; | ||
102 | uint64_t id; | ||
103 | blkif_sector_t sector_number; | ||
104 | blkif_vdev_t handle; | ||
105 | uint16_t _pad1; | ||
106 | grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST]; | ||
107 | /* | ||
108 | * The maximum number of indirect segments (and pages) that will | ||
109 | * be used is determined by MAX_INDIRECT_SEGMENTS; this value | ||
110 | * is also exported to the guest (via xenstore | ||
111 | * feature-max-indirect-segments entry), so the frontend knows how | ||
112 | * many indirect segments the backend supports. | ||
113 | */ | ||
114 | uint64_t _pad2; /* make it 64 byte aligned */ | ||
115 | } __attribute__((__packed__)); | ||
116 | |||
86 | struct blkif_x86_32_request { | 117 | struct blkif_x86_32_request { |
87 | uint8_t operation; /* BLKIF_OP_??? */ | 118 | uint8_t operation; /* BLKIF_OP_??? */ |
88 | union { | 119 | union { |
89 | struct blkif_x86_32_request_rw rw; | 120 | struct blkif_x86_32_request_rw rw; |
90 | struct blkif_x86_32_request_discard discard; | 121 | struct blkif_x86_32_request_discard discard; |
91 | struct blkif_x86_32_request_other other; | 122 | struct blkif_x86_32_request_other other; |
123 | struct blkif_x86_32_request_indirect indirect; | ||
92 | } u; | 124 | } u; |
93 | } __attribute__((__packed__)); | 125 | } __attribute__((__packed__)); |
94 | 126 | ||
@@ -127,12 +159,32 @@ struct blkif_x86_64_request_other { | |||
127 | uint64_t id; /* private guest value, echoed in resp */ | 159 | uint64_t id; /* private guest value, echoed in resp */ |
128 | } __attribute__((__packed__)); | 160 | } __attribute__((__packed__)); |
129 | 161 | ||
162 | struct blkif_x86_64_request_indirect { | ||
163 | uint8_t indirect_op; | ||
164 | uint16_t nr_segments; | ||
165 | uint32_t _pad1; /* offsetof(blkif_..,u.indirect.id)==8 */ | ||
166 | uint64_t id; | ||
167 | blkif_sector_t sector_number; | ||
168 | blkif_vdev_t handle; | ||
169 | uint16_t _pad2; | ||
170 | grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST]; | ||
171 | /* | ||
172 | * The maximum number of indirect segments (and pages) that will | ||
173 | * be used is determined by MAX_INDIRECT_SEGMENTS; this value | ||
174 | * is also exported to the guest (via xenstore | ||
175 | * feature-max-indirect-segments entry), so the frontend knows how | ||
176 | * many indirect segments the backend supports. | ||
177 | */ | ||
178 | uint32_t _pad3; /* make it 64 byte aligned */ | ||
179 | } __attribute__((__packed__)); | ||
180 | |||
130 | struct blkif_x86_64_request { | 181 | struct blkif_x86_64_request { |
131 | uint8_t operation; /* BLKIF_OP_??? */ | 182 | uint8_t operation; /* BLKIF_OP_??? */ |
132 | union { | 183 | union { |
133 | struct blkif_x86_64_request_rw rw; | 184 | struct blkif_x86_64_request_rw rw; |
134 | struct blkif_x86_64_request_discard discard; | 185 | struct blkif_x86_64_request_discard discard; |
135 | struct blkif_x86_64_request_other other; | 186 | struct blkif_x86_64_request_other other; |
187 | struct blkif_x86_64_request_indirect indirect; | ||
136 | } u; | 188 | } u; |
137 | } __attribute__((__packed__)); | 189 | } __attribute__((__packed__)); |
138 | 190 | ||
@@ -182,12 +234,26 @@ struct xen_vbd { | |||
182 | 234 | ||
183 | struct backend_info; | 235 | struct backend_info; |
184 | 236 | ||
237 | /* Number of available flags */ | ||
238 | #define PERSISTENT_GNT_FLAGS_SIZE 2 | ||
239 | /* This persistent grant is currently in use */ | ||
240 | #define PERSISTENT_GNT_ACTIVE 0 | ||
241 | /* | ||
242 | * This persistent grant has been used; this flag is set when we remove | ||
243 | * PERSISTENT_GNT_ACTIVE, so we know the grant was used recently. | ||
244 | */ | ||
245 | #define PERSISTENT_GNT_WAS_ACTIVE 1 | ||
246 | |||
247 | /* Number of requests that we can fit in a ring */ | ||
248 | #define XEN_BLKIF_REQS 32 | ||
185 | 249 | ||
186 | struct persistent_gnt { | 250 | struct persistent_gnt { |
187 | struct page *page; | 251 | struct page *page; |
188 | grant_ref_t gnt; | 252 | grant_ref_t gnt; |
189 | grant_handle_t handle; | 253 | grant_handle_t handle; |
254 | DECLARE_BITMAP(flags, PERSISTENT_GNT_FLAGS_SIZE); | ||
190 | struct rb_node node; | 255 | struct rb_node node; |
256 | struct list_head remove_node; | ||
191 | }; | 257 | }; |
192 | 258 | ||
193 | struct xen_blkif { | 259 | struct xen_blkif { |
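The two flag bits above implement a simple second-chance test: a grant is marked ACTIVE while it is mapped into an in-flight request, and WAS_ACTIVE is left behind when ACTIVE is cleared so a purge pass can skip anything used since the previous pass. A stand-alone sketch of that aging step, using a plain C bitmask instead of the kernel bitmap API; the helper is hypothetical and not the driver's actual purge loop:

#include <stdbool.h>
#include <stdio.h>

/* Flag bits mirroring the header above (bit positions assumed as in the patch). */
enum { GNT_ACTIVE = 1u << 0, GNT_WAS_ACTIVE = 1u << 1 };

/* Hypothetical aging step: a grant survives one full scan after its last use. */
static bool can_purge(unsigned int *flags)
{
	if (*flags & GNT_ACTIVE)
		return false;              /* mapped into an in-flight request */
	if (*flags & GNT_WAS_ACTIVE) {
		*flags &= ~GNT_WAS_ACTIVE; /* clear it so the next scan may purge */
		return false;
	}
	return true;
}

int main(void)
{
	unsigned int f = GNT_WAS_ACTIVE;

	printf("first scan:  %d\n", can_purge(&f));  /* 0 - recently used */
	printf("second scan: %d\n", can_purge(&f));  /* 1 - idle, purgeable */
	return 0;
}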
@@ -219,6 +285,23 @@ struct xen_blkif { | |||
219 | /* tree to store persistent grants */ | 285 | /* tree to store persistent grants */ |
220 | struct rb_root persistent_gnts; | 286 | struct rb_root persistent_gnts; |
221 | unsigned int persistent_gnt_c; | 287 | unsigned int persistent_gnt_c; |
288 | atomic_t persistent_gnt_in_use; | ||
289 | unsigned long next_lru; | ||
290 | |||
291 | /* used by the kworker that offloads work from the persistent purge */ | ||
292 | struct list_head persistent_purge_list; | ||
293 | struct work_struct persistent_purge_work; | ||
294 | |||
295 | /* buffer of free pages to map grant refs */ | ||
296 | spinlock_t free_pages_lock; | ||
297 | int free_pages_num; | ||
298 | struct list_head free_pages; | ||
299 | |||
300 | /* List of all 'pending_req' available */ | ||
301 | struct list_head pending_free; | ||
302 | /* And its spinlock. */ | ||
303 | spinlock_t pending_free_lock; | ||
304 | wait_queue_head_t pending_free_wq; | ||
222 | 305 | ||
223 | /* statistics */ | 306 | /* statistics */ |
224 | unsigned long st_print; | 307 | unsigned long st_print; |
@@ -231,6 +314,41 @@ struct xen_blkif { | |||
231 | unsigned long long st_wr_sect; | 314 | unsigned long long st_wr_sect; |
232 | 315 | ||
233 | wait_queue_head_t waiting_to_free; | 316 | wait_queue_head_t waiting_to_free; |
317 | /* Thread shutdown wait queue. */ | ||
318 | wait_queue_head_t shutdown_wq; | ||
319 | }; | ||
320 | |||
321 | struct seg_buf { | ||
322 | unsigned long offset; | ||
323 | unsigned int nsec; | ||
324 | }; | ||
325 | |||
326 | struct grant_page { | ||
327 | struct page *page; | ||
328 | struct persistent_gnt *persistent_gnt; | ||
329 | grant_handle_t handle; | ||
330 | grant_ref_t gref; | ||
331 | }; | ||
332 | |||
333 | /* | ||
334 | * Each outstanding request that we've passed to the lower device layers has a | ||
335 | * 'pending_req' allocated to it. Each buffer_head that completes decrements | ||
336 | * the pendcnt towards zero. When it hits zero, the specified domain has a | ||
337 | * response queued for it, with the saved 'id' passed back. | ||
338 | */ | ||
339 | struct pending_req { | ||
340 | struct xen_blkif *blkif; | ||
341 | u64 id; | ||
342 | int nr_pages; | ||
343 | atomic_t pendcnt; | ||
344 | unsigned short operation; | ||
345 | int status; | ||
346 | struct list_head free_list; | ||
347 | struct grant_page *segments[MAX_INDIRECT_SEGMENTS]; | ||
348 | /* Indirect descriptors */ | ||
349 | struct grant_page *indirect_pages[MAX_INDIRECT_PAGES]; | ||
350 | struct seg_buf seg[MAX_INDIRECT_SEGMENTS]; | ||
351 | struct bio *biolist[MAX_INDIRECT_SEGMENTS]; | ||
234 | }; | 352 | }; |
235 | 353 | ||
236 | 354 | ||
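The pending_req comment describes a fan-out/fan-in pattern: one ring request is turned into several bios, pendcnt tracks how many are still outstanding, and the response goes back to the guest only when the count reaches zero. A minimal single-threaded sketch of that last-one-out idea using C11 atomics (illustrative only; the driver uses the kernel's atomic_t helpers):

#include <stdatomic.h>
#include <stdio.h>

struct pending {
	atomic_int pendcnt;   /* outstanding sub-I/Os */
	int status;           /* sticky error status */
};

/* Hypothetical completion callback: the last finisher sends the response. */
static void piece_done(struct pending *p, int err)
{
	if (err)
		p->status = err;
	if (atomic_fetch_sub(&p->pendcnt, 1) == 1)
		printf("all pieces done, respond with status %d\n", p->status);
}

int main(void)
{
	struct pending p = { .pendcnt = 3, .status = 0 };

	piece_done(&p, 0);
	piece_done(&p, -5);   /* one piece fails; the error is remembered */
	piece_done(&p, 0);    /* last piece triggers the response */
	return 0;
}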
@@ -257,6 +375,7 @@ int xen_blkif_xenbus_init(void); | |||
257 | 375 | ||
258 | irqreturn_t xen_blkif_be_int(int irq, void *dev_id); | 376 | irqreturn_t xen_blkif_be_int(int irq, void *dev_id); |
259 | int xen_blkif_schedule(void *arg); | 377 | int xen_blkif_schedule(void *arg); |
378 | int xen_blkif_purge_persistent(void *arg); | ||
260 | 379 | ||
261 | int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, | 380 | int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, |
262 | struct backend_info *be, int state); | 381 | struct backend_info *be, int state); |
@@ -268,7 +387,7 @@ struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be); | |||
268 | static inline void blkif_get_x86_32_req(struct blkif_request *dst, | 387 | static inline void blkif_get_x86_32_req(struct blkif_request *dst, |
269 | struct blkif_x86_32_request *src) | 388 | struct blkif_x86_32_request *src) |
270 | { | 389 | { |
271 | int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; | 390 | int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j; |
272 | dst->operation = src->operation; | 391 | dst->operation = src->operation; |
273 | switch (src->operation) { | 392 | switch (src->operation) { |
274 | case BLKIF_OP_READ: | 393 | case BLKIF_OP_READ: |
@@ -291,6 +410,18 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst, | |||
291 | dst->u.discard.sector_number = src->u.discard.sector_number; | 410 | dst->u.discard.sector_number = src->u.discard.sector_number; |
292 | dst->u.discard.nr_sectors = src->u.discard.nr_sectors; | 411 | dst->u.discard.nr_sectors = src->u.discard.nr_sectors; |
293 | break; | 412 | break; |
413 | case BLKIF_OP_INDIRECT: | ||
414 | dst->u.indirect.indirect_op = src->u.indirect.indirect_op; | ||
415 | dst->u.indirect.nr_segments = src->u.indirect.nr_segments; | ||
416 | dst->u.indirect.handle = src->u.indirect.handle; | ||
417 | dst->u.indirect.id = src->u.indirect.id; | ||
418 | dst->u.indirect.sector_number = src->u.indirect.sector_number; | ||
419 | barrier(); | ||
420 | j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments)); | ||
421 | for (i = 0; i < j; i++) | ||
422 | dst->u.indirect.indirect_grefs[i] = | ||
423 | src->u.indirect.indirect_grefs[i]; | ||
424 | break; | ||
294 | default: | 425 | default: |
295 | /* | 426 | /* |
296 | * Don't know how to translate this op. Only get the | 427 | * Don't know how to translate this op. Only get the |
@@ -304,7 +435,7 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst, | |||
304 | static inline void blkif_get_x86_64_req(struct blkif_request *dst, | 435 | static inline void blkif_get_x86_64_req(struct blkif_request *dst, |
305 | struct blkif_x86_64_request *src) | 436 | struct blkif_x86_64_request *src) |
306 | { | 437 | { |
307 | int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; | 438 | int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j; |
308 | dst->operation = src->operation; | 439 | dst->operation = src->operation; |
309 | switch (src->operation) { | 440 | switch (src->operation) { |
310 | case BLKIF_OP_READ: | 441 | case BLKIF_OP_READ: |
@@ -327,6 +458,18 @@ static inline void blkif_get_x86_64_req(struct blkif_request *dst, | |||
327 | dst->u.discard.sector_number = src->u.discard.sector_number; | 458 | dst->u.discard.sector_number = src->u.discard.sector_number; |
328 | dst->u.discard.nr_sectors = src->u.discard.nr_sectors; | 459 | dst->u.discard.nr_sectors = src->u.discard.nr_sectors; |
329 | break; | 460 | break; |
461 | case BLKIF_OP_INDIRECT: | ||
462 | dst->u.indirect.indirect_op = src->u.indirect.indirect_op; | ||
463 | dst->u.indirect.nr_segments = src->u.indirect.nr_segments; | ||
464 | dst->u.indirect.handle = src->u.indirect.handle; | ||
465 | dst->u.indirect.id = src->u.indirect.id; | ||
466 | dst->u.indirect.sector_number = src->u.indirect.sector_number; | ||
467 | barrier(); | ||
468 | j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments)); | ||
469 | for (i = 0; i < j; i++) | ||
470 | dst->u.indirect.indirect_grefs[i] = | ||
471 | src->u.indirect.indirect_grefs[i]; | ||
472 | break; | ||
330 | default: | 473 | default: |
331 | /* | 474 | /* |
332 | * Don't know how to translate this op. Only get the | 475 | * Don't know how to translate this op. Only get the |
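In both indirect branches above, nr_segments arrives straight from the guest, so the copy loop clamps the derived page count with min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(nr_segments)) before reading indirect_grefs. A small sketch of that defensive clamp under assumed constants (512 descriptors per page, a 256-segment backend limit); the deeper validation of nr_segments itself happens later in the backend:

#include <stdio.h>

#define SEGS_PER_FRAME        512u   /* assumed descriptors per indirect page */
#define MAX_INDIRECT_SEGMENTS 256u   /* assumed backend maximum */
#define MAX_INDIRECT_PAGES \
	((MAX_INDIRECT_SEGMENTS + SEGS_PER_FRAME - 1) / SEGS_PER_FRAME)
#define PAGES_FOR(segs) (((segs) + SEGS_PER_FRAME - 1) / SEGS_PER_FRAME)

static unsigned int frames_to_copy(unsigned int guest_nr_segments)
{
	unsigned int want = PAGES_FOR(guest_nr_segments);

	/* Clamp to what the backend itself supports, so a bogus nr_segments
	 * from the frontend cannot drive the gref copy loop out of bounds. */
	return want < MAX_INDIRECT_PAGES ? want : MAX_INDIRECT_PAGES;
}

int main(void)
{
	printf("%u\n", frames_to_copy(256));     /* 1 */
	printf("%u\n", frames_to_copy(65535));   /* clamped to 1 */
	return 0;
}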
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 8bfd1bcf95ec..2e5b69d612ac 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c | |||
@@ -98,12 +98,17 @@ static void xen_update_blkif_status(struct xen_blkif *blkif) | |||
98 | err = PTR_ERR(blkif->xenblkd); | 98 | err = PTR_ERR(blkif->xenblkd); |
99 | blkif->xenblkd = NULL; | 99 | blkif->xenblkd = NULL; |
100 | xenbus_dev_error(blkif->be->dev, err, "start xenblkd"); | 100 | xenbus_dev_error(blkif->be->dev, err, "start xenblkd"); |
101 | return; | ||
101 | } | 102 | } |
102 | } | 103 | } |
103 | 104 | ||
104 | static struct xen_blkif *xen_blkif_alloc(domid_t domid) | 105 | static struct xen_blkif *xen_blkif_alloc(domid_t domid) |
105 | { | 106 | { |
106 | struct xen_blkif *blkif; | 107 | struct xen_blkif *blkif; |
108 | struct pending_req *req, *n; | ||
109 | int i, j; | ||
110 | |||
111 | BUILD_BUG_ON(MAX_INDIRECT_PAGES > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST); | ||
107 | 112 | ||
108 | blkif = kmem_cache_zalloc(xen_blkif_cachep, GFP_KERNEL); | 113 | blkif = kmem_cache_zalloc(xen_blkif_cachep, GFP_KERNEL); |
109 | if (!blkif) | 114 | if (!blkif) |
@@ -118,8 +123,57 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid) | |||
118 | blkif->st_print = jiffies; | 123 | blkif->st_print = jiffies; |
119 | init_waitqueue_head(&blkif->waiting_to_free); | 124 | init_waitqueue_head(&blkif->waiting_to_free); |
120 | blkif->persistent_gnts.rb_node = NULL; | 125 | blkif->persistent_gnts.rb_node = NULL; |
126 | spin_lock_init(&blkif->free_pages_lock); | ||
127 | INIT_LIST_HEAD(&blkif->free_pages); | ||
128 | blkif->free_pages_num = 0; | ||
129 | atomic_set(&blkif->persistent_gnt_in_use, 0); | ||
130 | |||
131 | INIT_LIST_HEAD(&blkif->pending_free); | ||
132 | |||
133 | for (i = 0; i < XEN_BLKIF_REQS; i++) { | ||
134 | req = kzalloc(sizeof(*req), GFP_KERNEL); | ||
135 | if (!req) | ||
136 | goto fail; | ||
137 | list_add_tail(&req->free_list, | ||
138 | &blkif->pending_free); | ||
139 | for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { | ||
140 | req->segments[j] = kzalloc(sizeof(*req->segments[0]), | ||
141 | GFP_KERNEL); | ||
142 | if (!req->segments[j]) | ||
143 | goto fail; | ||
144 | } | ||
145 | for (j = 0; j < MAX_INDIRECT_PAGES; j++) { | ||
146 | req->indirect_pages[j] = kzalloc(sizeof(*req->indirect_pages[0]), | ||
147 | GFP_KERNEL); | ||
148 | if (!req->indirect_pages[j]) | ||
149 | goto fail; | ||
150 | } | ||
151 | } | ||
152 | spin_lock_init(&blkif->pending_free_lock); | ||
153 | init_waitqueue_head(&blkif->pending_free_wq); | ||
154 | init_waitqueue_head(&blkif->shutdown_wq); | ||
121 | 155 | ||
122 | return blkif; | 156 | return blkif; |
157 | |||
158 | fail: | ||
159 | list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) { | ||
160 | list_del(&req->free_list); | ||
161 | for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { | ||
162 | if (!req->segments[j]) | ||
163 | break; | ||
164 | kfree(req->segments[j]); | ||
165 | } | ||
166 | for (j = 0; j < MAX_INDIRECT_PAGES; j++) { | ||
167 | if (!req->indirect_pages[j]) | ||
168 | break; | ||
169 | kfree(req->indirect_pages[j]); | ||
170 | } | ||
171 | kfree(req); | ||
172 | } | ||
173 | |||
174 | kmem_cache_free(xen_blkif_cachep, blkif); | ||
175 | |||
176 | return ERR_PTR(-ENOMEM); | ||
123 | } | 177 | } |
124 | 178 | ||
125 | static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page, | 179 | static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page, |
@@ -178,6 +232,7 @@ static void xen_blkif_disconnect(struct xen_blkif *blkif) | |||
178 | { | 232 | { |
179 | if (blkif->xenblkd) { | 233 | if (blkif->xenblkd) { |
180 | kthread_stop(blkif->xenblkd); | 234 | kthread_stop(blkif->xenblkd); |
235 | wake_up(&blkif->shutdown_wq); | ||
181 | blkif->xenblkd = NULL; | 236 | blkif->xenblkd = NULL; |
182 | } | 237 | } |
183 | 238 | ||
@@ -198,8 +253,28 @@ static void xen_blkif_disconnect(struct xen_blkif *blkif) | |||
198 | 253 | ||
199 | static void xen_blkif_free(struct xen_blkif *blkif) | 254 | static void xen_blkif_free(struct xen_blkif *blkif) |
200 | { | 255 | { |
256 | struct pending_req *req, *n; | ||
257 | int i = 0, j; | ||
258 | |||
201 | if (!atomic_dec_and_test(&blkif->refcnt)) | 259 | if (!atomic_dec_and_test(&blkif->refcnt)) |
202 | BUG(); | 260 | BUG(); |
261 | |||
262 | /* Check that there is no request in use */ | ||
263 | list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) { | ||
264 | list_del(&req->free_list); | ||
265 | |||
266 | for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) | ||
267 | kfree(req->segments[j]); | ||
268 | |||
269 | for (j = 0; j < MAX_INDIRECT_PAGES; j++) | ||
270 | kfree(req->indirect_pages[j]); | ||
271 | |||
272 | kfree(req); | ||
273 | i++; | ||
274 | } | ||
275 | |||
276 | WARN_ON(i != XEN_BLKIF_REQS); | ||
277 | |||
203 | kmem_cache_free(xen_blkif_cachep, blkif); | 278 | kmem_cache_free(xen_blkif_cachep, blkif); |
204 | } | 279 | } |
205 | 280 | ||
@@ -678,6 +753,11 @@ again: | |||
678 | dev->nodename); | 753 | dev->nodename); |
679 | goto abort; | 754 | goto abort; |
680 | } | 755 | } |
756 | err = xenbus_printf(xbt, dev->nodename, "feature-max-indirect-segments", "%u", | ||
757 | MAX_INDIRECT_SEGMENTS); | ||
758 | if (err) | ||
759 | dev_warn(&dev->dev, "writing %s/feature-max-indirect-segments (%d)", | ||
760 | dev->nodename, err); | ||
681 | 761 | ||
682 | err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", | 762 | err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", |
683 | (unsigned long long)vbd_sz(&be->blkif->vbd)); | 763 | (unsigned long long)vbd_sz(&be->blkif->vbd)); |
@@ -704,6 +784,11 @@ again: | |||
704 | dev->nodename); | 784 | dev->nodename); |
705 | goto abort; | 785 | goto abort; |
706 | } | 786 | } |
787 | err = xenbus_printf(xbt, dev->nodename, "physical-sector-size", "%u", | ||
788 | bdev_physical_block_size(be->blkif->vbd.bdev)); | ||
789 | if (err) | ||
790 | xenbus_dev_error(dev, err, "writing %s/physical-sector-size", | ||
791 | dev->nodename); | ||
707 | 792 | ||
708 | err = xenbus_transaction_end(xbt, 0); | 793 | err = xenbus_transaction_end(xbt, 0); |
709 | if (err == -EAGAIN) | 794 | if (err == -EAGAIN) |
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index d89ef86220f4..a4660bbee8a6 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c | |||
@@ -74,12 +74,30 @@ struct grant { | |||
74 | struct blk_shadow { | 74 | struct blk_shadow { |
75 | struct blkif_request req; | 75 | struct blkif_request req; |
76 | struct request *request; | 76 | struct request *request; |
77 | struct grant *grants_used[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | 77 | struct grant **grants_used; |
78 | struct grant **indirect_grants; | ||
79 | struct scatterlist *sg; | ||
80 | }; | ||
81 | |||
82 | struct split_bio { | ||
83 | struct bio *bio; | ||
84 | atomic_t pending; | ||
85 | int err; | ||
78 | }; | 86 | }; |
79 | 87 | ||
80 | static DEFINE_MUTEX(blkfront_mutex); | 88 | static DEFINE_MUTEX(blkfront_mutex); |
81 | static const struct block_device_operations xlvbd_block_fops; | 89 | static const struct block_device_operations xlvbd_block_fops; |
82 | 90 | ||
91 | /* | ||
92 | * Maximum number of segments in indirect requests; the actual value used by | ||
93 | * the frontend driver is the minimum of this value and the value provided | ||
94 | * by the backend driver. | ||
95 | */ | ||
96 | |||
97 | static unsigned int xen_blkif_max_segments = 32; | ||
98 | module_param_named(max, xen_blkif_max_segments, int, S_IRUGO); | ||
99 | MODULE_PARM_DESC(max, "Maximum number of segments in indirect requests (default is 32)"); | ||
100 | |||
83 | #define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE) | 101 | #define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE) |
84 | 102 | ||
85 | /* | 103 | /* |
@@ -98,7 +116,6 @@ struct blkfront_info | |||
98 | enum blkif_state connected; | 116 | enum blkif_state connected; |
99 | int ring_ref; | 117 | int ring_ref; |
100 | struct blkif_front_ring ring; | 118 | struct blkif_front_ring ring; |
101 | struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; | ||
102 | unsigned int evtchn, irq; | 119 | unsigned int evtchn, irq; |
103 | struct request_queue *rq; | 120 | struct request_queue *rq; |
104 | struct work_struct work; | 121 | struct work_struct work; |
@@ -114,6 +131,7 @@ struct blkfront_info | |||
114 | unsigned int discard_granularity; | 131 | unsigned int discard_granularity; |
115 | unsigned int discard_alignment; | 132 | unsigned int discard_alignment; |
116 | unsigned int feature_persistent:1; | 133 | unsigned int feature_persistent:1; |
134 | unsigned int max_indirect_segments; | ||
117 | int is_ready; | 135 | int is_ready; |
118 | }; | 136 | }; |
119 | 137 | ||
@@ -142,6 +160,13 @@ static DEFINE_SPINLOCK(minor_lock); | |||
142 | 160 | ||
143 | #define DEV_NAME "xvd" /* name in /dev */ | 161 | #define DEV_NAME "xvd" /* name in /dev */ |
144 | 162 | ||
163 | #define SEGS_PER_INDIRECT_FRAME \ | ||
164 | (PAGE_SIZE/sizeof(struct blkif_request_segment_aligned)) | ||
165 | #define INDIRECT_GREFS(_segs) \ | ||
166 | ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME) | ||
167 | |||
168 | static int blkfront_setup_indirect(struct blkfront_info *info); | ||
169 | |||
145 | static int get_id_from_freelist(struct blkfront_info *info) | 170 | static int get_id_from_freelist(struct blkfront_info *info) |
146 | { | 171 | { |
147 | unsigned long free = info->shadow_free; | 172 | unsigned long free = info->shadow_free; |
@@ -358,7 +383,8 @@ static int blkif_queue_request(struct request *req) | |||
358 | struct blkif_request *ring_req; | 383 | struct blkif_request *ring_req; |
359 | unsigned long id; | 384 | unsigned long id; |
360 | unsigned int fsect, lsect; | 385 | unsigned int fsect, lsect; |
361 | int i, ref; | 386 | int i, ref, n; |
387 | struct blkif_request_segment_aligned *segments = NULL; | ||
362 | 388 | ||
363 | /* | 389 | /* |
364 | * Used to store if we are able to queue the request by just using | 390 | * Used to store if we are able to queue the request by just using |
@@ -369,21 +395,27 @@ static int blkif_queue_request(struct request *req) | |||
369 | grant_ref_t gref_head; | 395 | grant_ref_t gref_head; |
370 | struct grant *gnt_list_entry = NULL; | 396 | struct grant *gnt_list_entry = NULL; |
371 | struct scatterlist *sg; | 397 | struct scatterlist *sg; |
398 | int nseg, max_grefs; | ||
372 | 399 | ||
373 | if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) | 400 | if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) |
374 | return 1; | 401 | return 1; |
375 | 402 | ||
376 | /* Check if we have enought grants to allocate a requests */ | 403 | max_grefs = info->max_indirect_segments ? |
377 | if (info->persistent_gnts_c < BLKIF_MAX_SEGMENTS_PER_REQUEST) { | 404 | info->max_indirect_segments + |
405 | INDIRECT_GREFS(info->max_indirect_segments) : | ||
406 | BLKIF_MAX_SEGMENTS_PER_REQUEST; | ||
407 | |||
408 | /* Check if we have enough grants to allocate a request */ | ||
409 | if (info->persistent_gnts_c < max_grefs) { | ||
378 | new_persistent_gnts = 1; | 410 | new_persistent_gnts = 1; |
379 | if (gnttab_alloc_grant_references( | 411 | if (gnttab_alloc_grant_references( |
380 | BLKIF_MAX_SEGMENTS_PER_REQUEST - info->persistent_gnts_c, | 412 | max_grefs - info->persistent_gnts_c, |
381 | &gref_head) < 0) { | 413 | &gref_head) < 0) { |
382 | gnttab_request_free_callback( | 414 | gnttab_request_free_callback( |
383 | &info->callback, | 415 | &info->callback, |
384 | blkif_restart_queue_callback, | 416 | blkif_restart_queue_callback, |
385 | info, | 417 | info, |
386 | BLKIF_MAX_SEGMENTS_PER_REQUEST); | 418 | max_grefs); |
387 | return 1; | 419 | return 1; |
388 | } | 420 | } |
389 | } else | 421 | } else |
@@ -394,42 +426,67 @@ static int blkif_queue_request(struct request *req) | |||
394 | id = get_id_from_freelist(info); | 426 | id = get_id_from_freelist(info); |
395 | info->shadow[id].request = req; | 427 | info->shadow[id].request = req; |
396 | 428 | ||
397 | ring_req->u.rw.id = id; | ||
398 | ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req); | ||
399 | ring_req->u.rw.handle = info->handle; | ||
400 | |||
401 | ring_req->operation = rq_data_dir(req) ? | ||
402 | BLKIF_OP_WRITE : BLKIF_OP_READ; | ||
403 | |||
404 | if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) { | ||
405 | /* | ||
406 | * Ideally we can do an unordered flush-to-disk. In case the | ||
407 | * backend onlysupports barriers, use that. A barrier request | ||
408 | * a superset of FUA, so we can implement it the same | ||
409 | * way. (It's also a FLUSH+FUA, since it is | ||
410 | * guaranteed ordered WRT previous writes.) | ||
411 | */ | ||
412 | ring_req->operation = info->flush_op; | ||
413 | } | ||
414 | |||
415 | if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) { | 429 | if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) { |
416 | /* id, sector_number and handle are set above. */ | ||
417 | ring_req->operation = BLKIF_OP_DISCARD; | 430 | ring_req->operation = BLKIF_OP_DISCARD; |
418 | ring_req->u.discard.nr_sectors = blk_rq_sectors(req); | 431 | ring_req->u.discard.nr_sectors = blk_rq_sectors(req); |
432 | ring_req->u.discard.id = id; | ||
433 | ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req); | ||
419 | if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard) | 434 | if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard) |
420 | ring_req->u.discard.flag = BLKIF_DISCARD_SECURE; | 435 | ring_req->u.discard.flag = BLKIF_DISCARD_SECURE; |
421 | else | 436 | else |
422 | ring_req->u.discard.flag = 0; | 437 | ring_req->u.discard.flag = 0; |
423 | } else { | 438 | } else { |
424 | ring_req->u.rw.nr_segments = blk_rq_map_sg(req->q, req, | 439 | BUG_ON(info->max_indirect_segments == 0 && |
425 | info->sg); | 440 | req->nr_phys_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST); |
426 | BUG_ON(ring_req->u.rw.nr_segments > | 441 | BUG_ON(info->max_indirect_segments && |
427 | BLKIF_MAX_SEGMENTS_PER_REQUEST); | 442 | req->nr_phys_segments > info->max_indirect_segments); |
428 | 443 | nseg = blk_rq_map_sg(req->q, req, info->shadow[id].sg); | |
429 | for_each_sg(info->sg, sg, ring_req->u.rw.nr_segments, i) { | 444 | ring_req->u.rw.id = id; |
445 | if (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) { | ||
446 | /* | ||
447 | * The indirect operation can only be a BLKIF_OP_READ or | ||
448 | * BLKIF_OP_WRITE | ||
449 | */ | ||
450 | BUG_ON(req->cmd_flags & (REQ_FLUSH | REQ_FUA)); | ||
451 | ring_req->operation = BLKIF_OP_INDIRECT; | ||
452 | ring_req->u.indirect.indirect_op = rq_data_dir(req) ? | ||
453 | BLKIF_OP_WRITE : BLKIF_OP_READ; | ||
454 | ring_req->u.indirect.sector_number = (blkif_sector_t)blk_rq_pos(req); | ||
455 | ring_req->u.indirect.handle = info->handle; | ||
456 | ring_req->u.indirect.nr_segments = nseg; | ||
457 | } else { | ||
458 | ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req); | ||
459 | ring_req->u.rw.handle = info->handle; | ||
460 | ring_req->operation = rq_data_dir(req) ? | ||
461 | BLKIF_OP_WRITE : BLKIF_OP_READ; | ||
462 | if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) { | ||
463 | /* | ||
464 | * Ideally we can do an unordered flush-to-disk. In case the | ||
465 | * backend only supports barriers, use that. A barrier request is | ||
466 | * a superset of FUA, so we can implement it the same | ||
467 | * way. (It's also a FLUSH+FUA, since it is | ||
468 | * guaranteed ordered WRT previous writes.) | ||
469 | */ | ||
470 | ring_req->operation = info->flush_op; | ||
471 | } | ||
472 | ring_req->u.rw.nr_segments = nseg; | ||
473 | } | ||
474 | for_each_sg(info->shadow[id].sg, sg, nseg, i) { | ||
430 | fsect = sg->offset >> 9; | 475 | fsect = sg->offset >> 9; |
431 | lsect = fsect + (sg->length >> 9) - 1; | 476 | lsect = fsect + (sg->length >> 9) - 1; |
432 | 477 | ||
478 | if ((ring_req->operation == BLKIF_OP_INDIRECT) && | ||
479 | (i % SEGS_PER_INDIRECT_FRAME == 0)) { | ||
480 | if (segments) | ||
481 | kunmap_atomic(segments); | ||
482 | |||
483 | n = i / SEGS_PER_INDIRECT_FRAME; | ||
484 | gnt_list_entry = get_grant(&gref_head, info); | ||
485 | info->shadow[id].indirect_grants[n] = gnt_list_entry; | ||
486 | segments = kmap_atomic(pfn_to_page(gnt_list_entry->pfn)); | ||
487 | ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref; | ||
488 | } | ||
489 | |||
433 | gnt_list_entry = get_grant(&gref_head, info); | 490 | gnt_list_entry = get_grant(&gref_head, info); |
434 | ref = gnt_list_entry->gref; | 491 | ref = gnt_list_entry->gref; |
435 | 492 | ||
@@ -441,8 +498,7 @@ static int blkif_queue_request(struct request *req) | |||
441 | 498 | ||
442 | BUG_ON(sg->offset + sg->length > PAGE_SIZE); | 499 | BUG_ON(sg->offset + sg->length > PAGE_SIZE); |
443 | 500 | ||
444 | shared_data = kmap_atomic( | 501 | shared_data = kmap_atomic(pfn_to_page(gnt_list_entry->pfn)); |
445 | pfn_to_page(gnt_list_entry->pfn)); | ||
446 | bvec_data = kmap_atomic(sg_page(sg)); | 502 | bvec_data = kmap_atomic(sg_page(sg)); |
447 | 503 | ||
448 | /* | 504 | /* |
@@ -461,13 +517,23 @@ static int blkif_queue_request(struct request *req) | |||
461 | kunmap_atomic(bvec_data); | 517 | kunmap_atomic(bvec_data); |
462 | kunmap_atomic(shared_data); | 518 | kunmap_atomic(shared_data); |
463 | } | 519 | } |
464 | 520 | if (ring_req->operation != BLKIF_OP_INDIRECT) { | |
465 | ring_req->u.rw.seg[i] = | 521 | ring_req->u.rw.seg[i] = |
466 | (struct blkif_request_segment) { | 522 | (struct blkif_request_segment) { |
467 | .gref = ref, | 523 | .gref = ref, |
468 | .first_sect = fsect, | 524 | .first_sect = fsect, |
469 | .last_sect = lsect }; | 525 | .last_sect = lsect }; |
526 | } else { | ||
527 | n = i % SEGS_PER_INDIRECT_FRAME; | ||
528 | segments[n] = | ||
529 | (struct blkif_request_segment_aligned) { | ||
530 | .gref = ref, | ||
531 | .first_sect = fsect, | ||
532 | .last_sect = lsect }; | ||
533 | } | ||
470 | } | 534 | } |
535 | if (segments) | ||
536 | kunmap_atomic(segments); | ||
471 | } | 537 | } |
472 | 538 | ||
473 | info->ring.req_prod_pvt++; | 539 | info->ring.req_prod_pvt++; |
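The queueing loop above places segment i into indirect frame i / SEGS_PER_INDIRECT_FRAME at slot i % SEGS_PER_INDIRECT_FRAME, mapping a fresh grant page each time it crosses a frame boundary. A compact sketch of just that div/mod packing, with the frame size assumed to be 512 and no actual grant mapping:

#include <stdio.h>

#define SEGS_PER_INDIRECT_FRAME 512u   /* assumed entries per indirect page */

int main(void)
{
	unsigned int nseg = 600;           /* hypothetical request size */

	for (unsigned int i = 0; i < nseg; i++) {
		unsigned int frame = i / SEGS_PER_INDIRECT_FRAME;
		unsigned int slot  = i % SEGS_PER_INDIRECT_FRAME;

		if (slot == 0)             /* first segment of a new frame */
			printf("map indirect frame %u\n", frame);
		/* segment i is described by entry 'slot' of frame 'frame' */
	}
	return 0;                          /* maps frames 0 and 1 */
}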
@@ -542,7 +608,9 @@ wait: | |||
542 | flush_requests(info); | 608 | flush_requests(info); |
543 | } | 609 | } |
544 | 610 | ||
545 | static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) | 611 | static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, |
612 | unsigned int physical_sector_size, | ||
613 | unsigned int segments) | ||
546 | { | 614 | { |
547 | struct request_queue *rq; | 615 | struct request_queue *rq; |
548 | struct blkfront_info *info = gd->private_data; | 616 | struct blkfront_info *info = gd->private_data; |
@@ -564,14 +632,15 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) | |||
564 | 632 | ||
565 | /* Hard sector size and max sectors impersonate the equiv. hardware. */ | 633 | /* Hard sector size and max sectors impersonate the equiv. hardware. */ |
566 | blk_queue_logical_block_size(rq, sector_size); | 634 | blk_queue_logical_block_size(rq, sector_size); |
567 | blk_queue_max_hw_sectors(rq, 512); | 635 | blk_queue_physical_block_size(rq, physical_sector_size); |
636 | blk_queue_max_hw_sectors(rq, (segments * PAGE_SIZE) / 512); | ||
568 | 637 | ||
569 | /* Each segment in a request is up to an aligned page in size. */ | 638 | /* Each segment in a request is up to an aligned page in size. */ |
570 | blk_queue_segment_boundary(rq, PAGE_SIZE - 1); | 639 | blk_queue_segment_boundary(rq, PAGE_SIZE - 1); |
571 | blk_queue_max_segment_size(rq, PAGE_SIZE); | 640 | blk_queue_max_segment_size(rq, PAGE_SIZE); |
572 | 641 | ||
573 | /* Ensure a merged request will fit in a single I/O ring slot. */ | 642 | /* Ensure a merged request will fit in a single I/O ring slot. */ |
574 | blk_queue_max_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); | 643 | blk_queue_max_segments(rq, segments); |
575 | 644 | ||
576 | /* Make sure buffer addresses are sector-aligned. */ | 645 | /* Make sure buffer addresses are sector-aligned. */ |
577 | blk_queue_dma_alignment(rq, 511); | 646 | blk_queue_dma_alignment(rq, 511); |
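blk_queue_max_hw_sectors(rq, (segments * PAGE_SIZE) / 512) above sizes the largest request the queue will build: with the ring-native 32 segments and 4 KiB pages that is 256 sectors (128 KiB), and with 256 indirect segments it grows to 2048 sectors (1 MiB). The arithmetic, spelled out under the assumption of 4 KiB pages:

#include <stdio.h>

#define PAGE_SIZE   4096u   /* assumed */
#define SECTOR_SIZE  512u

int main(void)
{
	unsigned int segs[] = { 32, 256 };

	for (unsigned int i = 0; i < 2; i++) {
		unsigned int sectors = segs[i] * PAGE_SIZE / SECTOR_SIZE;

		printf("%3u segments -> max_hw_sectors %4u (%u KiB per request)\n",
		       segs[i], sectors, sectors * SECTOR_SIZE / 1024);
	}
	return 0;   /* 32 -> 256 sectors (128 KiB), 256 -> 2048 sectors (1024 KiB) */
}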
@@ -588,13 +657,16 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) | |||
588 | static void xlvbd_flush(struct blkfront_info *info) | 657 | static void xlvbd_flush(struct blkfront_info *info) |
589 | { | 658 | { |
590 | blk_queue_flush(info->rq, info->feature_flush); | 659 | blk_queue_flush(info->rq, info->feature_flush); |
591 | printk(KERN_INFO "blkfront: %s: %s: %s %s\n", | 660 | printk(KERN_INFO "blkfront: %s: %s: %s %s %s %s %s\n", |
592 | info->gd->disk_name, | 661 | info->gd->disk_name, |
593 | info->flush_op == BLKIF_OP_WRITE_BARRIER ? | 662 | info->flush_op == BLKIF_OP_WRITE_BARRIER ? |
594 | "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ? | 663 | "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ? |
595 | "flush diskcache" : "barrier or flush"), | 664 | "flush diskcache" : "barrier or flush"), |
596 | info->feature_flush ? "enabled" : "disabled", | 665 | info->feature_flush ? "enabled;" : "disabled;", |
597 | info->feature_persistent ? "using persistent grants" : ""); | 666 | "persistent grants:", |
667 | info->feature_persistent ? "enabled;" : "disabled;", | ||
668 | "indirect descriptors:", | ||
669 | info->max_indirect_segments ? "enabled;" : "disabled;"); | ||
598 | } | 670 | } |
599 | 671 | ||
600 | static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset) | 672 | static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset) |
@@ -667,7 +739,8 @@ static char *encode_disk_name(char *ptr, unsigned int n) | |||
667 | 739 | ||
668 | static int xlvbd_alloc_gendisk(blkif_sector_t capacity, | 740 | static int xlvbd_alloc_gendisk(blkif_sector_t capacity, |
669 | struct blkfront_info *info, | 741 | struct blkfront_info *info, |
670 | u16 vdisk_info, u16 sector_size) | 742 | u16 vdisk_info, u16 sector_size, |
743 | unsigned int physical_sector_size) | ||
671 | { | 744 | { |
672 | struct gendisk *gd; | 745 | struct gendisk *gd; |
673 | int nr_minors = 1; | 746 | int nr_minors = 1; |
@@ -734,7 +807,9 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, | |||
734 | gd->driverfs_dev = &(info->xbdev->dev); | 807 | gd->driverfs_dev = &(info->xbdev->dev); |
735 | set_capacity(gd, capacity); | 808 | set_capacity(gd, capacity); |
736 | 809 | ||
737 | if (xlvbd_init_blk_queue(gd, sector_size)) { | 810 | if (xlvbd_init_blk_queue(gd, sector_size, physical_sector_size, |
811 | info->max_indirect_segments ? : | ||
812 | BLKIF_MAX_SEGMENTS_PER_REQUEST)) { | ||
738 | del_gendisk(gd); | 813 | del_gendisk(gd); |
739 | goto release; | 814 | goto release; |
740 | } | 815 | } |
@@ -818,6 +893,7 @@ static void blkif_free(struct blkfront_info *info, int suspend) | |||
818 | { | 893 | { |
819 | struct grant *persistent_gnt; | 894 | struct grant *persistent_gnt; |
820 | struct grant *n; | 895 | struct grant *n; |
896 | int i, j, segs; | ||
821 | 897 | ||
822 | /* Prevent new requests being issued until we fix things up. */ | 898 | /* Prevent new requests being issued until we fix things up. */ |
823 | spin_lock_irq(&info->io_lock); | 899 | spin_lock_irq(&info->io_lock); |
@@ -843,6 +919,47 @@ static void blkif_free(struct blkfront_info *info, int suspend) | |||
843 | } | 919 | } |
844 | BUG_ON(info->persistent_gnts_c != 0); | 920 | BUG_ON(info->persistent_gnts_c != 0); |
845 | 921 | ||
922 | for (i = 0; i < BLK_RING_SIZE; i++) { | ||
923 | /* | ||
924 | * Clear persistent grants present in requests already | ||
925 | * on the shared ring | ||
926 | */ | ||
927 | if (!info->shadow[i].request) | ||
928 | goto free_shadow; | ||
929 | |||
930 | segs = info->shadow[i].req.operation == BLKIF_OP_INDIRECT ? | ||
931 | info->shadow[i].req.u.indirect.nr_segments : | ||
932 | info->shadow[i].req.u.rw.nr_segments; | ||
933 | for (j = 0; j < segs; j++) { | ||
934 | persistent_gnt = info->shadow[i].grants_used[j]; | ||
935 | gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); | ||
936 | __free_page(pfn_to_page(persistent_gnt->pfn)); | ||
937 | kfree(persistent_gnt); | ||
938 | } | ||
939 | |||
940 | if (info->shadow[i].req.operation != BLKIF_OP_INDIRECT) | ||
941 | /* | ||
942 | * If this is not an indirect operation don't try to | ||
943 | * free indirect segments | ||
944 | */ | ||
945 | goto free_shadow; | ||
946 | |||
947 | for (j = 0; j < INDIRECT_GREFS(segs); j++) { | ||
948 | persistent_gnt = info->shadow[i].indirect_grants[j]; | ||
949 | gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); | ||
950 | __free_page(pfn_to_page(persistent_gnt->pfn)); | ||
951 | kfree(persistent_gnt); | ||
952 | } | ||
953 | |||
954 | free_shadow: | ||
955 | kfree(info->shadow[i].grants_used); | ||
956 | info->shadow[i].grants_used = NULL; | ||
957 | kfree(info->shadow[i].indirect_grants); | ||
958 | info->shadow[i].indirect_grants = NULL; | ||
959 | kfree(info->shadow[i].sg); | ||
960 | info->shadow[i].sg = NULL; | ||
961 | } | ||
962 | |||
846 | /* No more gnttab callback work. */ | 963 | /* No more gnttab callback work. */ |
847 | gnttab_cancel_free_callback(&info->callback); | 964 | gnttab_cancel_free_callback(&info->callback); |
848 | spin_unlock_irq(&info->io_lock); | 965 | spin_unlock_irq(&info->io_lock); |
@@ -867,12 +984,13 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, | |||
867 | struct blkif_response *bret) | 984 | struct blkif_response *bret) |
868 | { | 985 | { |
869 | int i = 0; | 986 | int i = 0; |
870 | struct bio_vec *bvec; | 987 | struct scatterlist *sg; |
871 | struct req_iterator iter; | ||
872 | unsigned long flags; | ||
873 | char *bvec_data; | 988 | char *bvec_data; |
874 | void *shared_data; | 989 | void *shared_data; |
875 | unsigned int offset = 0; | 990 | int nseg; |
991 | |||
992 | nseg = s->req.operation == BLKIF_OP_INDIRECT ? | ||
993 | s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments; | ||
876 | 994 | ||
877 | if (bret->operation == BLKIF_OP_READ) { | 995 | if (bret->operation == BLKIF_OP_READ) { |
878 | /* | 996 | /* |
@@ -881,26 +999,29 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, | |||
881 | * than PAGE_SIZE, we have to keep track of the current offset, | 999 | * than PAGE_SIZE, we have to keep track of the current offset, |
882 | * to be sure we are copying the data from the right shared page. | 1000 | * to be sure we are copying the data from the right shared page. |
883 | */ | 1001 | */ |
884 | rq_for_each_segment(bvec, s->request, iter) { | 1002 | for_each_sg(s->sg, sg, nseg, i) { |
885 | BUG_ON((bvec->bv_offset + bvec->bv_len) > PAGE_SIZE); | 1003 | BUG_ON(sg->offset + sg->length > PAGE_SIZE); |
886 | if (bvec->bv_offset < offset) | ||
887 | i++; | ||
888 | BUG_ON(i >= s->req.u.rw.nr_segments); | ||
889 | shared_data = kmap_atomic( | 1004 | shared_data = kmap_atomic( |
890 | pfn_to_page(s->grants_used[i]->pfn)); | 1005 | pfn_to_page(s->grants_used[i]->pfn)); |
891 | bvec_data = bvec_kmap_irq(bvec, &flags); | 1006 | bvec_data = kmap_atomic(sg_page(sg)); |
892 | memcpy(bvec_data, shared_data + bvec->bv_offset, | 1007 | memcpy(bvec_data + sg->offset, |
893 | bvec->bv_len); | 1008 | shared_data + sg->offset, |
894 | bvec_kunmap_irq(bvec_data, &flags); | 1009 | sg->length); |
1010 | kunmap_atomic(bvec_data); | ||
895 | kunmap_atomic(shared_data); | 1011 | kunmap_atomic(shared_data); |
896 | offset = bvec->bv_offset + bvec->bv_len; | ||
897 | } | 1012 | } |
898 | } | 1013 | } |
899 | /* Add the persistent grant into the list of free grants */ | 1014 | /* Add the persistent grant into the list of free grants */ |
900 | for (i = 0; i < s->req.u.rw.nr_segments; i++) { | 1015 | for (i = 0; i < nseg; i++) { |
901 | list_add(&s->grants_used[i]->node, &info->persistent_gnts); | 1016 | list_add(&s->grants_used[i]->node, &info->persistent_gnts); |
902 | info->persistent_gnts_c++; | 1017 | info->persistent_gnts_c++; |
903 | } | 1018 | } |
1019 | if (s->req.operation == BLKIF_OP_INDIRECT) { | ||
1020 | for (i = 0; i < INDIRECT_GREFS(nseg); i++) { | ||
1021 | list_add(&s->indirect_grants[i]->node, &info->persistent_gnts); | ||
1022 | info->persistent_gnts_c++; | ||
1023 | } | ||
1024 | } | ||
904 | } | 1025 | } |
905 | 1026 | ||
906 | static irqreturn_t blkif_interrupt(int irq, void *dev_id) | 1027 | static irqreturn_t blkif_interrupt(int irq, void *dev_id) |
@@ -1034,14 +1155,6 @@ static int setup_blkring(struct xenbus_device *dev, | |||
1034 | SHARED_RING_INIT(sring); | 1155 | SHARED_RING_INIT(sring); |
1035 | FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE); | 1156 | FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE); |
1036 | 1157 | ||
1037 | sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST); | ||
1038 | |||
1039 | /* Allocate memory for grants */ | ||
1040 | err = fill_grant_buffer(info, BLK_RING_SIZE * | ||
1041 | BLKIF_MAX_SEGMENTS_PER_REQUEST); | ||
1042 | if (err) | ||
1043 | goto fail; | ||
1044 | |||
1045 | err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring)); | 1158 | err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring)); |
1046 | if (err < 0) { | 1159 | if (err < 0) { |
1047 | free_page((unsigned long)sring); | 1160 | free_page((unsigned long)sring); |
@@ -1223,13 +1336,84 @@ static int blkfront_probe(struct xenbus_device *dev, | |||
1223 | return 0; | 1336 | return 0; |
1224 | } | 1337 | } |
1225 | 1338 | ||
1339 | /* | ||
1340 | * This is a clone of md_trim_bio, used to split a bio into smaller ones | ||
1341 | */ | ||
1342 | static void trim_bio(struct bio *bio, int offset, int size) | ||
1343 | { | ||
1344 | /* 'bio' is a cloned bio which we need to trim to match | ||
1345 | * the given offset and size. | ||
1346 | * This requires adjusting bi_sector, bi_size, and bi_io_vec | ||
1347 | */ | ||
1348 | int i; | ||
1349 | struct bio_vec *bvec; | ||
1350 | int sofar = 0; | ||
1351 | |||
1352 | size <<= 9; | ||
1353 | if (offset == 0 && size == bio->bi_size) | ||
1354 | return; | ||
1355 | |||
1356 | bio->bi_sector += offset; | ||
1357 | bio->bi_size = size; | ||
1358 | offset <<= 9; | ||
1359 | clear_bit(BIO_SEG_VALID, &bio->bi_flags); | ||
1360 | |||
1361 | while (bio->bi_idx < bio->bi_vcnt && | ||
1362 | bio->bi_io_vec[bio->bi_idx].bv_len <= offset) { | ||
1363 | /* remove this whole bio_vec */ | ||
1364 | offset -= bio->bi_io_vec[bio->bi_idx].bv_len; | ||
1365 | bio->bi_idx++; | ||
1366 | } | ||
1367 | if (bio->bi_idx < bio->bi_vcnt) { | ||
1368 | bio->bi_io_vec[bio->bi_idx].bv_offset += offset; | ||
1369 | bio->bi_io_vec[bio->bi_idx].bv_len -= offset; | ||
1370 | } | ||
1371 | /* avoid any complications with bi_idx being non-zero*/ | ||
1372 | if (bio->bi_idx) { | ||
1373 | memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx, | ||
1374 | (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec)); | ||
1375 | bio->bi_vcnt -= bio->bi_idx; | ||
1376 | bio->bi_idx = 0; | ||
1377 | } | ||
1378 | /* Make sure vcnt and last bv are not too big */ | ||
1379 | bio_for_each_segment(bvec, bio, i) { | ||
1380 | if (sofar + bvec->bv_len > size) | ||
1381 | bvec->bv_len = size - sofar; | ||
1382 | if (bvec->bv_len == 0) { | ||
1383 | bio->bi_vcnt = i; | ||
1384 | break; | ||
1385 | } | ||
1386 | sofar += bvec->bv_len; | ||
1387 | } | ||
1388 | } | ||
1389 | |||
1390 | static void split_bio_end(struct bio *bio, int error) | ||
1391 | { | ||
1392 | struct split_bio *split_bio = bio->bi_private; | ||
1393 | |||
1394 | if (error) | ||
1395 | split_bio->err = error; | ||
1396 | |||
1397 | if (atomic_dec_and_test(&split_bio->pending)) { | ||
1398 | split_bio->bio->bi_phys_segments = 0; | ||
1399 | bio_endio(split_bio->bio, split_bio->err); | ||
1400 | kfree(split_bio); | ||
1401 | } | ||
1402 | bio_put(bio); | ||
1403 | } | ||
1226 | 1404 | ||
1227 | static int blkif_recover(struct blkfront_info *info) | 1405 | static int blkif_recover(struct blkfront_info *info) |
1228 | { | 1406 | { |
1229 | int i; | 1407 | int i; |
1230 | struct blkif_request *req; | 1408 | struct request *req, *n; |
1231 | struct blk_shadow *copy; | 1409 | struct blk_shadow *copy; |
1232 | int j; | 1410 | int rc; |
1411 | struct bio *bio, *cloned_bio; | ||
1412 | struct bio_list bio_list, merge_bio; | ||
1413 | unsigned int segs, offset; | ||
1414 | int pending, size; | ||
1415 | struct split_bio *split_bio; | ||
1416 | struct list_head requests; | ||
1233 | 1417 | ||
1234 | /* Stage 1: Make a safe copy of the shadow state. */ | 1418 | /* Stage 1: Make a safe copy of the shadow state. */ |
1235 | copy = kmemdup(info->shadow, sizeof(info->shadow), | 1419 | copy = kmemdup(info->shadow, sizeof(info->shadow), |
@@ -1244,36 +1428,64 @@ static int blkif_recover(struct blkfront_info *info) | |||
1244 | info->shadow_free = info->ring.req_prod_pvt; | 1428 | info->shadow_free = info->ring.req_prod_pvt; |
1245 | info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff; | 1429 | info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff; |
1246 | 1430 | ||
1247 | /* Stage 3: Find pending requests and requeue them. */ | 1431 | rc = blkfront_setup_indirect(info); |
1432 | if (rc) { | ||
1433 | kfree(copy); | ||
1434 | return rc; | ||
1435 | } | ||
1436 | |||
1437 | segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST; | ||
1438 | blk_queue_max_segments(info->rq, segs); | ||
1439 | bio_list_init(&bio_list); | ||
1440 | INIT_LIST_HEAD(&requests); | ||
1248 | for (i = 0; i < BLK_RING_SIZE; i++) { | 1441 | for (i = 0; i < BLK_RING_SIZE; i++) { |
1249 | /* Not in use? */ | 1442 | /* Not in use? */ |
1250 | if (!copy[i].request) | 1443 | if (!copy[i].request) |
1251 | continue; | 1444 | continue; |
1252 | 1445 | ||
1253 | /* Grab a request slot and copy shadow state into it. */ | 1446 | /* |
1254 | req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); | 1447 | * Get the bios in the request so we can re-queue them. |
1255 | *req = copy[i].req; | 1448 | */ |
1256 | 1449 | if (copy[i].request->cmd_flags & | |
1257 | /* We get a new request id, and must reset the shadow state. */ | 1450 | (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) { |
1258 | req->u.rw.id = get_id_from_freelist(info); | 1451 | /* |
1259 | memcpy(&info->shadow[req->u.rw.id], ©[i], sizeof(copy[i])); | 1452 | * Flush operations don't contain bios, so |
1260 | 1453 | * we need to requeue the whole request | |
1261 | if (req->operation != BLKIF_OP_DISCARD) { | 1454 | */ |
1262 | /* Rewrite any grant references invalidated by susp/resume. */ | 1455 | list_add(©[i].request->queuelist, &requests); |
1263 | for (j = 0; j < req->u.rw.nr_segments; j++) | 1456 | continue; |
1264 | gnttab_grant_foreign_access_ref( | ||
1265 | req->u.rw.seg[j].gref, | ||
1266 | info->xbdev->otherend_id, | ||
1267 | pfn_to_mfn(copy[i].grants_used[j]->pfn), | ||
1268 | 0); | ||
1269 | } | 1457 | } |
1270 | info->shadow[req->u.rw.id].req = *req; | 1458 | merge_bio.head = copy[i].request->bio; |
1271 | 1459 | merge_bio.tail = copy[i].request->biotail; | |
1272 | info->ring.req_prod_pvt++; | 1460 | bio_list_merge(&bio_list, &merge_bio); |
1461 | copy[i].request->bio = NULL; | ||
1462 | blk_put_request(copy[i].request); | ||
1273 | } | 1463 | } |
1274 | 1464 | ||
1275 | kfree(copy); | 1465 | kfree(copy); |
1276 | 1466 | ||
1467 | /* | ||
1468 | * Empty the queue, this is important because we might have | ||
1469 | * requests in the queue with more segments than what we | ||
1470 | * can handle now. | ||
1471 | */ | ||
1472 | spin_lock_irq(&info->io_lock); | ||
1473 | while ((req = blk_fetch_request(info->rq)) != NULL) { | ||
1474 | if (req->cmd_flags & | ||
1475 | (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) { | ||
1476 | list_add(&req->queuelist, &requests); | ||
1477 | continue; | ||
1478 | } | ||
1479 | merge_bio.head = req->bio; | ||
1480 | merge_bio.tail = req->biotail; | ||
1481 | bio_list_merge(&bio_list, &merge_bio); | ||
1482 | req->bio = NULL; | ||
1483 | if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) | ||
1484 | pr_alert("diskcache flush request found!\n"); | ||
1485 | __blk_put_request(info->rq, req); | ||
1486 | } | ||
1487 | spin_unlock_irq(&info->io_lock); | ||
1488 | |||
1277 | xenbus_switch_state(info->xbdev, XenbusStateConnected); | 1489 | xenbus_switch_state(info->xbdev, XenbusStateConnected); |
1278 | 1490 | ||
1279 | spin_lock_irq(&info->io_lock); | 1491 | spin_lock_irq(&info->io_lock); |
@@ -1281,14 +1493,50 @@ static int blkif_recover(struct blkfront_info *info) | |||
1281 | /* Now safe for us to use the shared ring */ | 1493 | /* Now safe for us to use the shared ring */ |
1282 | info->connected = BLKIF_STATE_CONNECTED; | 1494 | info->connected = BLKIF_STATE_CONNECTED; |
1283 | 1495 | ||
1284 | /* Send off requeued requests */ | ||
1285 | flush_requests(info); | ||
1286 | |||
1287 | /* Kick any other new requests queued since we resumed */ | 1496 | /* Kick any other new requests queued since we resumed */ |
1288 | kick_pending_request_queues(info); | 1497 | kick_pending_request_queues(info); |
1289 | 1498 | ||
1499 | list_for_each_entry_safe(req, n, &requests, queuelist) { | ||
1500 | /* Requeue pending requests (flush or discard) */ | ||
1501 | list_del_init(&req->queuelist); | ||
1502 | BUG_ON(req->nr_phys_segments > segs); | ||
1503 | blk_requeue_request(info->rq, req); | ||
1504 | } | ||
1290 | spin_unlock_irq(&info->io_lock); | 1505 | spin_unlock_irq(&info->io_lock); |
1291 | 1506 | ||
1507 | while ((bio = bio_list_pop(&bio_list)) != NULL) { | ||
1508 | /* Traverse the list of pending bios and re-queue them */ | ||
1509 | if (bio_segments(bio) > segs) { | ||
1510 | /* | ||
1511 | * This bio has more segments than what we can | ||
1512 | * handle, we have to split it. | ||
1513 | */ | ||
1514 | pending = (bio_segments(bio) + segs - 1) / segs; | ||
1515 | split_bio = kzalloc(sizeof(*split_bio), GFP_NOIO); | ||
1516 | BUG_ON(split_bio == NULL); | ||
1517 | atomic_set(&split_bio->pending, pending); | ||
1518 | split_bio->bio = bio; | ||
1519 | for (i = 0; i < pending; i++) { | ||
1520 | offset = (i * segs * PAGE_SIZE) >> 9; | ||
1521 | size = min((unsigned int)(segs * PAGE_SIZE) >> 9, | ||
1522 | (unsigned int)(bio->bi_size >> 9) - offset); | ||
1523 | cloned_bio = bio_clone(bio, GFP_NOIO); | ||
1524 | BUG_ON(cloned_bio == NULL); | ||
1525 | trim_bio(cloned_bio, offset, size); | ||
1526 | cloned_bio->bi_private = split_bio; | ||
1527 | cloned_bio->bi_end_io = split_bio_end; | ||
1528 | submit_bio(cloned_bio->bi_rw, cloned_bio); | ||
1529 | } | ||
1530 | /* | ||
1531 | * Now we have to wait for all those smaller bios to | ||
1532 | * end, so we can also end the "parent" bio. | ||
1533 | */ | ||
1534 | continue; | ||
1535 | } | ||
1536 | /* We don't need to split this bio */ | ||
1537 | submit_bio(bio->bi_rw, bio); | ||
1538 | } | ||
1539 | |||
1292 | return 0; | 1540 | return 0; |
1293 | } | 1541 | } |
1294 | 1542 | ||
@@ -1308,8 +1556,12 @@ static int blkfront_resume(struct xenbus_device *dev) | |||
1308 | blkif_free(info, info->connected == BLKIF_STATE_CONNECTED); | 1556 | blkif_free(info, info->connected == BLKIF_STATE_CONNECTED); |
1309 | 1557 | ||
1310 | err = talk_to_blkback(dev, info); | 1558 | err = talk_to_blkback(dev, info); |
1311 | if (info->connected == BLKIF_STATE_SUSPENDED && !err) | 1559 | |
1312 | err = blkif_recover(info); | 1560 | /* |
1561 | * We have to wait for the backend to switch to | ||
1562 | * connected state, since we want to read which | ||
1563 | * features it supports. | ||
1564 | */ | ||
1313 | 1565 | ||
1314 | return err; | 1566 | return err; |
1315 | } | 1567 | } |
@@ -1387,6 +1639,60 @@ static void blkfront_setup_discard(struct blkfront_info *info) | |||
1387 | kfree(type); | 1639 | kfree(type); |
1388 | } | 1640 | } |
1389 | 1641 | ||
1642 | static int blkfront_setup_indirect(struct blkfront_info *info) | ||
1643 | { | ||
1644 | unsigned int indirect_segments, segs; | ||
1645 | int err, i; | ||
1646 | |||
1647 | err = xenbus_gather(XBT_NIL, info->xbdev->otherend, | ||
1648 | "feature-max-indirect-segments", "%u", &indirect_segments, | ||
1649 | NULL); | ||
1650 | if (err) { | ||
1651 | info->max_indirect_segments = 0; | ||
1652 | segs = BLKIF_MAX_SEGMENTS_PER_REQUEST; | ||
1653 | } else { | ||
1654 | info->max_indirect_segments = min(indirect_segments, | ||
1655 | xen_blkif_max_segments); | ||
1656 | segs = info->max_indirect_segments; | ||
1657 | } | ||
1658 | |||
1659 | err = fill_grant_buffer(info, (segs + INDIRECT_GREFS(segs)) * BLK_RING_SIZE); | ||
1660 | if (err) | ||
1661 | goto out_of_memory; | ||
1662 | |||
1663 | for (i = 0; i < BLK_RING_SIZE; i++) { | ||
1664 | info->shadow[i].grants_used = kzalloc( | ||
1665 | sizeof(info->shadow[i].grants_used[0]) * segs, | ||
1666 | GFP_NOIO); | ||
1667 | info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * segs, GFP_NOIO); | ||
1668 | if (info->max_indirect_segments) | ||
1669 | info->shadow[i].indirect_grants = kzalloc( | ||
1670 | sizeof(info->shadow[i].indirect_grants[0]) * | ||
1671 | INDIRECT_GREFS(segs), | ||
1672 | GFP_NOIO); | ||
1673 | if ((info->shadow[i].grants_used == NULL) || | ||
1674 | (info->shadow[i].sg == NULL) || | ||
1675 | (info->max_indirect_segments && | ||
1676 | (info->shadow[i].indirect_grants == NULL))) | ||
1677 | goto out_of_memory; | ||
1678 | sg_init_table(info->shadow[i].sg, segs); | ||
1679 | } | ||
1680 | |||
1681 | |||
1682 | return 0; | ||
1683 | |||
1684 | out_of_memory: | ||
1685 | for (i = 0; i < BLK_RING_SIZE; i++) { | ||
1686 | kfree(info->shadow[i].grants_used); | ||
1687 | info->shadow[i].grants_used = NULL; | ||
1688 | kfree(info->shadow[i].sg); | ||
1689 | info->shadow[i].sg = NULL; | ||
1690 | kfree(info->shadow[i].indirect_grants); | ||
1691 | info->shadow[i].indirect_grants = NULL; | ||
1692 | } | ||
1693 | return -ENOMEM; | ||
1694 | } | ||
1695 | |||
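blkfront_setup_indirect boils down to a three-way negotiation: if the backend does not export feature-max-indirect-segments, indirect descriptors stay disabled and the ring-native BLKIF_MAX_SEGMENTS_PER_REQUEST limit is used; otherwise the effective segment count is the minimum of the backend's advertisement and the frontend's max= module parameter. A sketch of just that decision, with the xenstore read mocked by a plain parameter and constants assumed as in the patch:

#include <stdio.h>

#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11u
static unsigned int xen_blkif_max_segments = 32;   /* module parameter default */

/* backend_max == 0 models a backend without the xenstore node. */
static unsigned int negotiate_segs(unsigned int backend_max,
				   unsigned int *max_indirect_segments)
{
	if (!backend_max) {
		*max_indirect_segments = 0;              /* indirect disabled */
		return BLKIF_MAX_SEGMENTS_PER_REQUEST;
	}
	*max_indirect_segments = backend_max < xen_blkif_max_segments ?
				 backend_max : xen_blkif_max_segments;
	return *max_indirect_segments;
}

int main(void)
{
	unsigned int m;

	printf("old backend:        segs=%u\n", negotiate_segs(0, &m));    /* 11 */
	printf("backend offers 256: segs=%u\n", negotiate_segs(256, &m));  /* 32 */
	printf("backend offers 16:  segs=%u\n", negotiate_segs(16, &m));   /* 16 */
	return 0;
}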
1390 | /* | 1696 | /* |
1391 | * Invoked when the backend is finally 'ready' (and has produced | 1697 |
1392 | * the details about the physical device - #sectors, size, etc). | 1698 | * the details about the physical device - #sectors, size, etc). |
@@ -1395,6 +1701,7 @@ static void blkfront_connect(struct blkfront_info *info) | |||
1395 | { | 1701 | { |
1396 | unsigned long long sectors; | 1702 | unsigned long long sectors; |
1397 | unsigned long sector_size; | 1703 | unsigned long sector_size; |
1704 | unsigned int physical_sector_size; | ||
1398 | unsigned int binfo; | 1705 | unsigned int binfo; |
1399 | int err; | 1706 | int err; |
1400 | int barrier, flush, discard, persistent; | 1707 | int barrier, flush, discard, persistent; |
@@ -1414,8 +1721,15 @@ static void blkfront_connect(struct blkfront_info *info) | |||
1414 | set_capacity(info->gd, sectors); | 1721 | set_capacity(info->gd, sectors); |
1415 | revalidate_disk(info->gd); | 1722 | revalidate_disk(info->gd); |
1416 | 1723 | ||
1417 | /* fall through */ | 1724 | return; |
1418 | case BLKIF_STATE_SUSPENDED: | 1725 | case BLKIF_STATE_SUSPENDED: |
1726 | /* | ||
1727 | * If we are recovering from suspension, we need to wait | ||
1728 | * for the backend to announce its features before | ||
1729 | * reconnecting; at the very least we need to know if the backend | ||
1730 | * supports indirect descriptors, and how many. | ||
1731 | */ | ||
1732 | blkif_recover(info); | ||
1419 | return; | 1733 | return; |
1420 | 1734 | ||
1421 | default: | 1735 | default: |
@@ -1437,6 +1751,16 @@ static void blkfront_connect(struct blkfront_info *info) | |||
1437 | return; | 1751 | return; |
1438 | } | 1752 | } |
1439 | 1753 | ||
1754 | /* | ||
1755 | * physical-sector-size is a newer field, so old backends may not | ||
1756 | * provide this. Assume physical sector size to be the same as | ||
1757 | * sector_size in that case. | ||
1758 | */ | ||
1759 | err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, | ||
1760 | "physical-sector-size", "%u", &physical_sector_size); | ||
1761 | if (err != 1) | ||
1762 | physical_sector_size = sector_size; | ||
1763 | |||
1440 | info->feature_flush = 0; | 1764 | info->feature_flush = 0; |
1441 | info->flush_op = 0; | 1765 | info->flush_op = 0; |
1442 | 1766 | ||
@@ -1483,7 +1807,15 @@ static void blkfront_connect(struct blkfront_info *info) | |||
1483 | else | 1807 | else |
1484 | info->feature_persistent = persistent; | 1808 | info->feature_persistent = persistent; |
1485 | 1809 | ||
1486 | err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size); | 1810 | err = blkfront_setup_indirect(info); |
1811 | if (err) { | ||
1812 | xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s", | ||
1813 | info->xbdev->otherend); | ||
1814 | return; | ||
1815 | } | ||
1816 | |||
1817 | err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size, | ||
1818 | physical_sector_size); | ||
1487 | if (err) { | 1819 | if (err) { |
1488 | xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", | 1820 | xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", |
1489 | info->xbdev->otherend); | 1821 | info->xbdev->otherend); |
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 8bda1294c035..dac7738df7ff 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h | |||
@@ -272,6 +272,8 @@ enum { | |||
272 | * - memcg: use_hierarchy is on by default and the cgroup file for | 272 | * - memcg: use_hierarchy is on by default and the cgroup file for |
273 | * the flag is not created. | 273 | * the flag is not created. |
274 | * | 274 | * |
275 | * - blkcg: blk-throttle becomes properly hierarchical. | ||
276 | * | ||
275 | * The followings are planned changes. | 277 | * The followings are planned changes. |
276 | * | 278 | * |
277 | * - release_agent will be disallowed once replacement notification | 279 | * - release_agent will be disallowed once replacement notification |
diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 1b4d4ee1168f..de7d74ab3de6 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h | |||
@@ -177,7 +177,11 @@ enum drbd_ret_code { | |||
177 | ERR_NEED_APV_100 = 163, | 177 | ERR_NEED_APV_100 = 163, |
178 | ERR_NEED_ALLOW_TWO_PRI = 164, | 178 | ERR_NEED_ALLOW_TWO_PRI = 164, |
179 | ERR_MD_UNCLEAN = 165, | 179 | ERR_MD_UNCLEAN = 165, |
180 | 180 | ERR_MD_LAYOUT_CONNECTED = 166, | |
181 | ERR_MD_LAYOUT_TOO_BIG = 167, | ||
182 | ERR_MD_LAYOUT_TOO_SMALL = 168, | ||
183 | ERR_MD_LAYOUT_NO_FIT = 169, | ||
184 | ERR_IMPLICIT_SHRINK = 170, | ||
181 | /* insert new ones above this line */ | 185 | /* insert new ones above this line */ |
182 | AFTER_LAST_ERR_CODE | 186 | AFTER_LAST_ERR_CODE |
183 | }; | 187 | }; |
diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h index d0d8fac8a6e4..e8c44572b8cb 100644 --- a/include/linux/drbd_genl.h +++ b/include/linux/drbd_genl.h | |||
@@ -181,6 +181,8 @@ GENL_struct(DRBD_NLA_RESIZE_PARMS, 7, resize_parms, | |||
181 | __u64_field(1, DRBD_GENLA_F_MANDATORY, resize_size) | 181 | __u64_field(1, DRBD_GENLA_F_MANDATORY, resize_size) |
182 | __flg_field(2, DRBD_GENLA_F_MANDATORY, resize_force) | 182 | __flg_field(2, DRBD_GENLA_F_MANDATORY, resize_force) |
183 | __flg_field(3, DRBD_GENLA_F_MANDATORY, no_resync) | 183 | __flg_field(3, DRBD_GENLA_F_MANDATORY, no_resync) |
184 | __u32_field_def(4, 0 /* OPTIONAL */, al_stripes, DRBD_AL_STRIPES_DEF) | ||
185 | __u32_field_def(5, 0 /* OPTIONAL */, al_stripe_size, DRBD_AL_STRIPE_SIZE_DEF) | ||
184 | ) | 186 | ) |
185 | 187 | ||
186 | GENL_struct(DRBD_NLA_STATE_INFO, 8, state_info, | 188 | GENL_struct(DRBD_NLA_STATE_INFO, 8, state_info, |
diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h index 1fedf2b17cc8..17e50bb00521 100644 --- a/include/linux/drbd_limits.h +++ b/include/linux/drbd_limits.h | |||
@@ -215,4 +215,13 @@ | |||
215 | #define DRBD_ALWAYS_ASBP_DEF 0 | 215 | #define DRBD_ALWAYS_ASBP_DEF 0 |
216 | #define DRBD_USE_RLE_DEF 1 | 216 | #define DRBD_USE_RLE_DEF 1 |
217 | 217 | ||
218 | #define DRBD_AL_STRIPES_MIN 1 | ||
219 | #define DRBD_AL_STRIPES_MAX 1024 | ||
220 | #define DRBD_AL_STRIPES_DEF 1 | ||
221 | #define DRBD_AL_STRIPES_SCALE '1' | ||
222 | |||
223 | #define DRBD_AL_STRIPE_SIZE_MIN 4 | ||
224 | #define DRBD_AL_STRIPE_SIZE_MAX 16777216 | ||
225 | #define DRBD_AL_STRIPE_SIZE_DEF 32 | ||
226 | #define DRBD_AL_STRIPE_SIZE_SCALE 'k' /* kilobytes */ | ||
218 | #endif | 227 | #endif |
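Editorial note: for orientation, the new activity-log knobs combine as stripes times stripe size, with the 'k' scale marking the size in kilobytes. A minimal standalone sketch, under the assumption that the total on-disk activity-log area is simply the product of the two values, shows that the defaults keep the historical 32 KiB layout:

#include <stdio.h>

/* Values copied from the drbd_limits.h hunk above. */
#define DRBD_AL_STRIPES_DEF     1
#define DRBD_AL_STRIPE_SIZE_DEF 32 /* kilobytes */

int main(void)
{
	/* Assumption: total AL area = al_stripes * al_stripe_size. */
	printf("AL: %d stripe(s) x %d KiB = %d KiB\n",
	       DRBD_AL_STRIPES_DEF, DRBD_AL_STRIPE_SIZE_DEF,
	       DRBD_AL_STRIPES_DEF * DRBD_AL_STRIPE_SIZE_DEF);
	return 0;
}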
diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h index ffd4652de91c..65e12099ef89 100644 --- a/include/xen/interface/io/blkif.h +++ b/include/xen/interface/io/blkif.h | |||
@@ -103,12 +103,46 @@ typedef uint64_t blkif_sector_t; | |||
103 | #define BLKIF_OP_DISCARD 5 | 103 | #define BLKIF_OP_DISCARD 5 |
104 | 104 | ||
105 | /* | 105 | /* |
106 | * Recognized if "feature-max-indirect-segments" is present in the backend | ||
107 | * xenbus info. The "feature-max-indirect-segments" node contains the maximum | ||
108 | * number of segments allowed by the backend per request. If the node is | ||
109 | * present, the frontend might use blkif_request_indirect structs in order to | ||
110 | * issue requests with more than BLKIF_MAX_SEGMENTS_PER_REQUEST (11). The | ||
111 | * maximum number of indirect segments is fixed by the backend, but the | ||
112 | * frontend can issue requests with any number of indirect segments as long as | ||
113 | * it's less than the number provided by the backend. The indirect_grefs field | ||
114 | * in blkif_request_indirect should be filled by the frontend with the | ||
115 | * grant references of the pages that are holding the indirect segments. | ||
116 | * These pages are filled with an array of blkif_request_segment_aligned | ||
117 | * that hold the information about the segments. The number of indirect | ||
118 | * pages to use is determined by the maximum number of segments | ||
119 | * an indirect request contains. Every indirect page can contain a maximum | ||
120 | * of 512 segments (PAGE_SIZE/sizeof(blkif_request_segment_aligned)), | ||
121 | * so to calculate the number of indirect pages to use we have to do | ||
122 | * ceil(indirect_segments/512). | ||
123 | * | ||
124 | * If a backend does not recognize BLKIF_OP_INDIRECT, it should *not* | ||
125 | * create the "feature-max-indirect-segments" node! | ||
126 | */ | ||
127 | #define BLKIF_OP_INDIRECT 6 | ||
128 | |||
129 | /* | ||
106 | * Maximum scatter/gather segments per request. | 130 | * Maximum scatter/gather segments per request. |
107 | * This is carefully chosen so that sizeof(struct blkif_ring) <= PAGE_SIZE. | 131 | * This is carefully chosen so that sizeof(struct blkif_ring) <= PAGE_SIZE. |
108 | * NB. This could be 12 if the ring indexes weren't stored in the same page. | 132 | * NB. This could be 12 if the ring indexes weren't stored in the same page. |
109 | */ | 133 | */ |
110 | #define BLKIF_MAX_SEGMENTS_PER_REQUEST 11 | 134 | #define BLKIF_MAX_SEGMENTS_PER_REQUEST 11 |
111 | 135 | ||
136 | #define BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST 8 | ||
137 | |||
138 | struct blkif_request_segment_aligned { | ||
139 | grant_ref_t gref; /* reference to I/O buffer frame */ | ||
140 | /* @first_sect: first sector in frame to transfer (inclusive). */ | ||
141 | /* @last_sect: last sector in frame to transfer (inclusive). */ | ||
142 | uint8_t first_sect, last_sect; | ||
143 | uint16_t _pad; /* padding to make it 8 bytes, so it's cache-aligned */ | ||
144 | } __attribute__((__packed__)); | ||
145 | |||
112 | struct blkif_request_rw { | 146 | struct blkif_request_rw { |
113 | uint8_t nr_segments; /* number of segments */ | 147 | uint8_t nr_segments; /* number of segments */ |
114 | blkif_vdev_t handle; /* only for read/write requests */ | 148 | blkif_vdev_t handle; /* only for read/write requests */ |
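Editorial note: the ceil(indirect_segments/512) rule in the comment above is easy to get wrong by one. A standalone sketch, assuming a 4 KiB PAGE_SIZE so that 512 of the 8-byte segment entries fit per page (the helper name indirect_pages is illustrative), shows how the page count works out:

#include <stdio.h>

/* 4096 / sizeof(struct blkif_request_segment_aligned) = 4096 / 8 = 512,
 * assuming a 4 KiB PAGE_SIZE.
 */
#define SEGS_PER_INDIRECT_PAGE 512u

/* ceil(nseg / 512): indirect grant pages needed for nseg segments. */
static unsigned int indirect_pages(unsigned int nseg)
{
	return (nseg + SEGS_PER_INDIRECT_PAGE - 1) / SEGS_PER_INDIRECT_PAGE;
}

int main(void)
{
	printf("%u\n", indirect_pages(32));   /* 1: well within one page  */
	printf("%u\n", indirect_pages(512));  /* 1: exactly one full page */
	printf("%u\n", indirect_pages(513));  /* 2: spills into a second  */
	return 0;
}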
@@ -147,12 +181,31 @@ struct blkif_request_other { | |||
147 | uint64_t id; /* private guest value, echoed in resp */ | 181 | uint64_t id; /* private guest value, echoed in resp */ |
148 | } __attribute__((__packed__)); | 182 | } __attribute__((__packed__)); |
149 | 183 | ||
184 | struct blkif_request_indirect { | ||
185 | uint8_t indirect_op; | ||
186 | uint16_t nr_segments; | ||
187 | #ifdef CONFIG_X86_64 | ||
188 | uint32_t _pad1; /* offsetof(blkif_...,u.indirect.id) == 8 */ | ||
189 | #endif | ||
190 | uint64_t id; | ||
191 | blkif_sector_t sector_number; | ||
192 | blkif_vdev_t handle; | ||
193 | uint16_t _pad2; | ||
194 | grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST]; | ||
195 | #ifdef CONFIG_X86_64 | ||
196 | uint32_t _pad3; /* make it 64 byte aligned */ | ||
197 | #else | ||
198 | uint64_t _pad3; /* make it 64 byte aligned */ | ||
199 | #endif | ||
200 | } __attribute__((__packed__)); | ||
201 | |||
150 | struct blkif_request { | 202 | struct blkif_request { |
151 | uint8_t operation; /* BLKIF_OP_??? */ | 203 | uint8_t operation; /* BLKIF_OP_??? */ |
152 | union { | 204 | union { |
153 | struct blkif_request_rw rw; | 205 | struct blkif_request_rw rw; |
154 | struct blkif_request_discard discard; | 206 | struct blkif_request_discard discard; |
155 | struct blkif_request_other other; | 207 | struct blkif_request_other other; |
208 | struct blkif_request_indirect indirect; | ||
156 | } u; | 209 | } u; |
157 | } __attribute__((__packed__)); | 210 | } __attribute__((__packed__)); |
158 | 211 | ||
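Editorial note: to tie the new request variant to the union above, here is a hypothetical helper (not part of the patch; every name in it is illustrative) showing which fields a frontend would fill for an indirect write, after it has granted the data pages and written the blkif_request_segment_aligned arrays into the indirect pages:

#include <xen/interface/io/blkif.h>

/* Hypothetical sketch, not from xen-blkfront: fill an already-reserved
 * ring slot with an indirect write. indirect_gref[] holds the grant
 * references of the pages containing the segment arrays.
 */
static void fill_indirect_write(struct blkif_request *ring_req, uint64_t id,
				blkif_vdev_t handle, blkif_sector_t sector,
				uint16_t nr_segments,
				const grant_ref_t *indirect_gref,
				unsigned int n_indirect_pages)
{
	unsigned int i;

	ring_req->operation = BLKIF_OP_INDIRECT;
	ring_req->u.indirect.indirect_op = BLKIF_OP_WRITE;
	ring_req->u.indirect.nr_segments = nr_segments;
	ring_req->u.indirect.id = id;
	ring_req->u.indirect.sector_number = sector;
	ring_req->u.indirect.handle = handle;
	for (i = 0; i < n_indirect_pages; i++)
		ring_req->u.indirect.indirect_grefs[i] = indirect_gref[i];
}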
diff --git a/include/xen/interface/io/ring.h b/include/xen/interface/io/ring.h index 75271b9a8f61..7d28aff605c7 100644 --- a/include/xen/interface/io/ring.h +++ b/include/xen/interface/io/ring.h | |||
@@ -188,6 +188,11 @@ struct __name##_back_ring { \ | |||
188 | #define RING_REQUEST_CONS_OVERFLOW(_r, _cons) \ | 188 | #define RING_REQUEST_CONS_OVERFLOW(_r, _cons) \ |
189 | (((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r)) | 189 | (((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r)) |
190 | 190 | ||
191 | /* Ill-behaved frontend determination: Can there be this many requests? */ | ||
192 | #define RING_REQUEST_PROD_OVERFLOW(_r, _prod) \ | ||
193 | (((_prod) - (_r)->rsp_prod_pvt) > RING_SIZE(_r)) | ||
194 | |||
195 | |||
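Editorial note: the new RING_REQUEST_PROD_OVERFLOW macro is pure unsigned index arithmetic. A small standalone sketch (RING_SIZE fixed at 32 purely for illustration, and prod_overflow standing in for the macro) shows what it evaluates, including the index-wraparound case:

#include <stdio.h>

#define RING_SIZE 32u

/* Mirrors RING_REQUEST_PROD_OVERFLOW: true if the frontend claims more
 * outstanding requests than the ring can hold. Unsigned subtraction keeps
 * the comparison correct across index wraparound.
 */
static int prod_overflow(unsigned int prod, unsigned int rsp_prod_pvt)
{
	return (prod - rsp_prod_pvt) > RING_SIZE;
}

int main(void)
{
	printf("%d\n", prod_overflow(40, 10));        /* 0: 30 outstanding, fits */
	printf("%d\n", prod_overflow(50, 10));        /* 1: 40 > 32, ill-behaved */
	printf("%d\n", prod_overflow(5, 0xfffffff0)); /* 0: wraps to 21, fits    */
	return 0;
}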
191 | #define RING_PUSH_REQUESTS(_r) do { \ | 196 | #define RING_PUSH_REQUESTS(_r) do { \ |
192 | wmb(); /* back sees requests /before/ updated producer index */ \ | 197 | wmb(); /* back sees requests /before/ updated producer index */ \ |
193 | (_r)->sring->req_prod = (_r)->req_prod_pvt; \ | 198 | (_r)->sring->req_prod = (_r)->req_prod_pvt; \ |