Diffstat (limited to 'drivers')

 -rw-r--r--  drivers/block/xen-blkback/blkback.c  872
 -rw-r--r--  drivers/block/xen-blkback/common.h   147
 -rw-r--r--  drivers/block/xen-blkback/xenbus.c    85
 -rw-r--r--  drivers/block/xen-blkfront.c         532

 4 files changed, 1214 insertions(+), 422 deletions(-)
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index dd5b2fed97e9..bf4b9d282c04 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -50,110 +50,118 @@
50#include "common.h" 50#include "common.h"
51 51
52/* 52/*
53 * These are rather arbitrary. They are fairly large because adjacent requests 53 * Maximum number of unused free pages to keep in the internal buffer.
54 * pulled from a communication ring are quite likely to end up being part of 54 * Setting this to a value too low will reduce memory used in each backend,
55 * the same scatter/gather request at the disc. 55 * but can have a performance penalty.
56 * 56 *
57 * ** TRY INCREASING 'xen_blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW ** 57 * A sane value is xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST, but can
58 * 58 * be set to a lower value that might degrade performance on some intensive
59 * This will increase the chances of being able to write whole tracks. 59 * IO workloads.
60 * 64 should be enough to keep us competitive with Linux.
61 */ 60 */
62static int xen_blkif_reqs = 64;
63module_param_named(reqs, xen_blkif_reqs, int, 0);
64MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");
65 61
66/* Run-time switchable: /sys/module/blkback/parameters/ */ 62static int xen_blkif_max_buffer_pages = 1024;
67static unsigned int log_stats; 63module_param_named(max_buffer_pages, xen_blkif_max_buffer_pages, int, 0644);
68module_param(log_stats, int, 0644); 64MODULE_PARM_DESC(max_buffer_pages,
65"Maximum number of free pages to keep in each block backend buffer");
69 66
70/* 67/*
71 * Each outstanding request that we've passed to the lower device layers has a 68 * Maximum number of grants to map persistently in blkback. For maximum
72 * 'pending_req' allocated to it. Each buffer_head that completes decrements 69 * performance this should be the total numbers of grants that can be used
73 * the pendcnt towards zero. When it hits zero, the specified domain has a 70 * to fill the ring, but since this might become too high, specially with
74 * response queued for it, with the saved 'id' passed back. 71 * the use of indirect descriptors, we set it to a value that provides good
72 * performance without using too much memory.
73 *
74 * When the list of persistent grants is full we clean it up using a LRU
75 * algorithm.
75 */ 76 */
76struct pending_req {
77 struct xen_blkif *blkif;
78 u64 id;
79 int nr_pages;
80 atomic_t pendcnt;
81 unsigned short operation;
82 int status;
83 struct list_head free_list;
84 DECLARE_BITMAP(unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
85};
86 77
87#define BLKBACK_INVALID_HANDLE (~0) 78static int xen_blkif_max_pgrants = 1056;
79module_param_named(max_persistent_grants, xen_blkif_max_pgrants, int, 0644);
80MODULE_PARM_DESC(max_persistent_grants,
81 "Maximum number of grants to map persistently");
88 82
89struct xen_blkbk { 83/*
90 struct pending_req *pending_reqs; 84 * The LRU mechanism to clean the lists of persistent grants needs to
91 /* List of all 'pending_req' available */ 85 * be executed periodically. The time interval between consecutive executions
92 struct list_head pending_free; 86 * of the purge mechanism is set in ms.
93 /* And its spinlock. */ 87 */
94 spinlock_t pending_free_lock; 88#define LRU_INTERVAL 100
95 wait_queue_head_t pending_free_wq;
96 /* The list of all pages that are available. */
97 struct page **pending_pages;
98 /* And the grant handles that are available. */
99 grant_handle_t *pending_grant_handles;
100};
101
102static struct xen_blkbk *blkbk;
103 89
104/* 90/*
105 * Maximum number of grant pages that can be mapped in blkback. 91 * When the persistent grants list is full we will remove unused grants
106 * BLKIF_MAX_SEGMENTS_PER_REQUEST * RING_SIZE is the maximum number of 92 * from the list. The percent number of grants to be removed at each LRU
107 * pages that blkback will persistently map. 93 * execution.
108 * Currently, this is:
109 * RING_SIZE = 32 (for all known ring types)
110 * BLKIF_MAX_SEGMENTS_PER_REQUEST = 11
111 * sizeof(struct persistent_gnt) = 48
112 * So the maximum memory used to store the grants is:
113 * 32 * 11 * 48 = 16896 bytes
114 */ 94 */
115static inline unsigned int max_mapped_grant_pages(enum blkif_protocol protocol) 95#define LRU_PERCENT_CLEAN 5
96
97/* Run-time switchable: /sys/module/blkback/parameters/ */
98static unsigned int log_stats;
99module_param(log_stats, int, 0644);
100
101#define BLKBACK_INVALID_HANDLE (~0)
102
103/* Number of free pages to remove on each call to free_xenballooned_pages */
104#define NUM_BATCH_FREE_PAGES 10
105
106static inline int get_free_page(struct xen_blkif *blkif, struct page **page)
116{ 107{
117 switch (protocol) { 108 unsigned long flags;
118 case BLKIF_PROTOCOL_NATIVE: 109
119 return __CONST_RING_SIZE(blkif, PAGE_SIZE) * 110 spin_lock_irqsave(&blkif->free_pages_lock, flags);
120 BLKIF_MAX_SEGMENTS_PER_REQUEST; 111 if (list_empty(&blkif->free_pages)) {
121 case BLKIF_PROTOCOL_X86_32: 112 BUG_ON(blkif->free_pages_num != 0);
122 return __CONST_RING_SIZE(blkif_x86_32, PAGE_SIZE) * 113 spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
123 BLKIF_MAX_SEGMENTS_PER_REQUEST; 114 return alloc_xenballooned_pages(1, page, false);
124 case BLKIF_PROTOCOL_X86_64:
125 return __CONST_RING_SIZE(blkif_x86_64, PAGE_SIZE) *
126 BLKIF_MAX_SEGMENTS_PER_REQUEST;
127 default:
128 BUG();
129 } 115 }
116 BUG_ON(blkif->free_pages_num == 0);
117 page[0] = list_first_entry(&blkif->free_pages, struct page, lru);
118 list_del(&page[0]->lru);
119 blkif->free_pages_num--;
120 spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
121
130 return 0; 122 return 0;
131} 123}
132 124
133 125static inline void put_free_pages(struct xen_blkif *blkif, struct page **page,
134/* 126 int num)
135 * Little helpful macro to figure out the index and virtual address of the
136 * pending_pages[..]. For each 'pending_req' we have have up to
137 * BLKIF_MAX_SEGMENTS_PER_REQUEST (11) pages. The seg would be from 0 through
138 * 10 and would index in the pending_pages[..].
139 */
140static inline int vaddr_pagenr(struct pending_req *req, int seg)
141{ 127{
142 return (req - blkbk->pending_reqs) * 128 unsigned long flags;
143 BLKIF_MAX_SEGMENTS_PER_REQUEST + seg; 129 int i;
144}
145 130
146#define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)] 131 spin_lock_irqsave(&blkif->free_pages_lock, flags);
132 for (i = 0; i < num; i++)
133 list_add(&page[i]->lru, &blkif->free_pages);
134 blkif->free_pages_num += num;
135 spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
136}
147 137
148static inline unsigned long vaddr(struct pending_req *req, int seg) 138static inline void shrink_free_pagepool(struct xen_blkif *blkif, int num)
149{ 139{
150 unsigned long pfn = page_to_pfn(blkbk->pending_page(req, seg)); 140 /* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */
151 return (unsigned long)pfn_to_kaddr(pfn); 141 struct page *page[NUM_BATCH_FREE_PAGES];
152} 142 unsigned int num_pages = 0;
143 unsigned long flags;
153 144
154#define pending_handle(_req, _seg) \ 145 spin_lock_irqsave(&blkif->free_pages_lock, flags);
155 (blkbk->pending_grant_handles[vaddr_pagenr(_req, _seg)]) 146 while (blkif->free_pages_num > num) {
147 BUG_ON(list_empty(&blkif->free_pages));
148 page[num_pages] = list_first_entry(&blkif->free_pages,
149 struct page, lru);
150 list_del(&page[num_pages]->lru);
151 blkif->free_pages_num--;
152 if (++num_pages == NUM_BATCH_FREE_PAGES) {
153 spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
154 free_xenballooned_pages(num_pages, page);
155 spin_lock_irqsave(&blkif->free_pages_lock, flags);
156 num_pages = 0;
157 }
158 }
159 spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
160 if (num_pages != 0)
161 free_xenballooned_pages(num_pages, page);
162}
156 163
164#define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page)))
157 165
158static int do_block_io_op(struct xen_blkif *blkif); 166static int do_block_io_op(struct xen_blkif *blkif);
159static int dispatch_rw_block_io(struct xen_blkif *blkif, 167static int dispatch_rw_block_io(struct xen_blkif *blkif,
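
As a rough sense of scale for the two knobs introduced in the hunk above (illustrative arithmetic only, assuming 4 KiB pages; the tracking structures add a little on top):

	xen_blkif_max_buffer_pages = 1024 free pages * 4 KiB  ~ 4 MiB of ballooned pages kept per backend
	xen_blkif_max_pgrants      = 1056 grants * 4 KiB      ~ 4.1 MiB of persistently mapped grants per backend
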
@@ -170,13 +178,29 @@ static void make_response(struct xen_blkif *blkif, u64 id,
170 (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL) 178 (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL)
171 179
172 180
173static void add_persistent_gnt(struct rb_root *root, 181/*
182 * We don't need locking around the persistent grant helpers
183 * because blkback uses a single-thread for each backed, so we
184 * can be sure that this functions will never be called recursively.
185 *
186 * The only exception to that is put_persistent_grant, that can be called
187 * from interrupt context (by xen_blkbk_unmap), so we have to use atomic
188 * bit operations to modify the flags of a persistent grant and to count
189 * the number of used grants.
190 */
191static int add_persistent_gnt(struct xen_blkif *blkif,
174 struct persistent_gnt *persistent_gnt) 192 struct persistent_gnt *persistent_gnt)
175{ 193{
176 struct rb_node **new = &(root->rb_node), *parent = NULL; 194 struct rb_node **new = NULL, *parent = NULL;
177 struct persistent_gnt *this; 195 struct persistent_gnt *this;
178 196
197 if (blkif->persistent_gnt_c >= xen_blkif_max_pgrants) {
198 if (!blkif->vbd.overflow_max_grants)
199 blkif->vbd.overflow_max_grants = 1;
200 return -EBUSY;
201 }
179 /* Figure out where to put new node */ 202 /* Figure out where to put new node */
203 new = &blkif->persistent_gnts.rb_node;
180 while (*new) { 204 while (*new) {
181 this = container_of(*new, struct persistent_gnt, node); 205 this = container_of(*new, struct persistent_gnt, node);
182 206
@@ -186,22 +210,28 @@ static void add_persistent_gnt(struct rb_root *root,
186 else if (persistent_gnt->gnt > this->gnt) 210 else if (persistent_gnt->gnt > this->gnt)
187 new = &((*new)->rb_right); 211 new = &((*new)->rb_right);
188 else { 212 else {
189 pr_alert(DRV_PFX " trying to add a gref that's already in the tree\n"); 213 pr_alert_ratelimited(DRV_PFX " trying to add a gref that's already in the tree\n");
190 BUG(); 214 return -EINVAL;
191 } 215 }
192 } 216 }
193 217
218 bitmap_zero(persistent_gnt->flags, PERSISTENT_GNT_FLAGS_SIZE);
219 set_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
194 /* Add new node and rebalance tree. */ 220 /* Add new node and rebalance tree. */
195 rb_link_node(&(persistent_gnt->node), parent, new); 221 rb_link_node(&(persistent_gnt->node), parent, new);
196 rb_insert_color(&(persistent_gnt->node), root); 222 rb_insert_color(&(persistent_gnt->node), &blkif->persistent_gnts);
223 blkif->persistent_gnt_c++;
224 atomic_inc(&blkif->persistent_gnt_in_use);
225 return 0;
197} 226}
198 227
199static struct persistent_gnt *get_persistent_gnt(struct rb_root *root, 228static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif,
200 grant_ref_t gref) 229 grant_ref_t gref)
201{ 230{
202 struct persistent_gnt *data; 231 struct persistent_gnt *data;
203 struct rb_node *node = root->rb_node; 232 struct rb_node *node = NULL;
204 233
234 node = blkif->persistent_gnts.rb_node;
205 while (node) { 235 while (node) {
206 data = container_of(node, struct persistent_gnt, node); 236 data = container_of(node, struct persistent_gnt, node);
207 237
@@ -209,13 +239,31 @@ static struct persistent_gnt *get_persistent_gnt(struct rb_root *root,
209 node = node->rb_left; 239 node = node->rb_left;
210 else if (gref > data->gnt) 240 else if (gref > data->gnt)
211 node = node->rb_right; 241 node = node->rb_right;
212 else 242 else {
243 if(test_bit(PERSISTENT_GNT_ACTIVE, data->flags)) {
244 pr_alert_ratelimited(DRV_PFX " requesting a grant already in use\n");
245 return NULL;
246 }
247 set_bit(PERSISTENT_GNT_ACTIVE, data->flags);
248 atomic_inc(&blkif->persistent_gnt_in_use);
213 return data; 249 return data;
250 }
214 } 251 }
215 return NULL; 252 return NULL;
216} 253}
217 254
218static void free_persistent_gnts(struct rb_root *root, unsigned int num) 255static void put_persistent_gnt(struct xen_blkif *blkif,
256 struct persistent_gnt *persistent_gnt)
257{
258 if(!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags))
259 pr_alert_ratelimited(DRV_PFX " freeing a grant already unused");
260 set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags);
261 clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
262 atomic_dec(&blkif->persistent_gnt_in_use);
263}
264
265static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root,
266 unsigned int num)
219{ 267{
220 struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 268 struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
221 struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 269 struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
@@ -240,7 +288,7 @@ static void free_persistent_gnts(struct rb_root *root, unsigned int num)
240 ret = gnttab_unmap_refs(unmap, NULL, pages, 288 ret = gnttab_unmap_refs(unmap, NULL, pages,
241 segs_to_unmap); 289 segs_to_unmap);
242 BUG_ON(ret); 290 BUG_ON(ret);
243 free_xenballooned_pages(segs_to_unmap, pages); 291 put_free_pages(blkif, pages, segs_to_unmap);
244 segs_to_unmap = 0; 292 segs_to_unmap = 0;
245 } 293 }
246 294
@@ -251,21 +299,148 @@ static void free_persistent_gnts(struct rb_root *root, unsigned int num)
251 BUG_ON(num != 0); 299 BUG_ON(num != 0);
252} 300}
253 301
302static void unmap_purged_grants(struct work_struct *work)
303{
304 struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
305 struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
306 struct persistent_gnt *persistent_gnt;
307 int ret, segs_to_unmap = 0;
308 struct xen_blkif *blkif = container_of(work, typeof(*blkif), persistent_purge_work);
309
310 while(!list_empty(&blkif->persistent_purge_list)) {
311 persistent_gnt = list_first_entry(&blkif->persistent_purge_list,
312 struct persistent_gnt,
313 remove_node);
314 list_del(&persistent_gnt->remove_node);
315
316 gnttab_set_unmap_op(&unmap[segs_to_unmap],
317 vaddr(persistent_gnt->page),
318 GNTMAP_host_map,
319 persistent_gnt->handle);
320
321 pages[segs_to_unmap] = persistent_gnt->page;
322
323 if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
324 ret = gnttab_unmap_refs(unmap, NULL, pages,
325 segs_to_unmap);
326 BUG_ON(ret);
327 put_free_pages(blkif, pages, segs_to_unmap);
328 segs_to_unmap = 0;
329 }
330 kfree(persistent_gnt);
331 }
332 if (segs_to_unmap > 0) {
333 ret = gnttab_unmap_refs(unmap, NULL, pages, segs_to_unmap);
334 BUG_ON(ret);
335 put_free_pages(blkif, pages, segs_to_unmap);
336 }
337}
338
339static void purge_persistent_gnt(struct xen_blkif *blkif)
340{
341 struct persistent_gnt *persistent_gnt;
342 struct rb_node *n;
343 unsigned int num_clean, total;
344 bool scan_used = false, clean_used = false;
345 struct rb_root *root;
346
347 if (blkif->persistent_gnt_c < xen_blkif_max_pgrants ||
348 (blkif->persistent_gnt_c == xen_blkif_max_pgrants &&
349 !blkif->vbd.overflow_max_grants)) {
350 return;
351 }
352
353 if (work_pending(&blkif->persistent_purge_work)) {
354 pr_alert_ratelimited(DRV_PFX "Scheduled work from previous purge is still pending, cannot purge list\n");
355 return;
356 }
357
358 num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN;
359 num_clean = blkif->persistent_gnt_c - xen_blkif_max_pgrants + num_clean;
360 num_clean = min(blkif->persistent_gnt_c, num_clean);
361 if ((num_clean == 0) ||
362 (num_clean > (blkif->persistent_gnt_c - atomic_read(&blkif->persistent_gnt_in_use))))
363 return;
364
365 /*
366 * At this point, we can assure that there will be no calls
367 * to get_persistent_grant (because we are executing this code from
368 * xen_blkif_schedule), there can only be calls to put_persistent_gnt,
369 * which means that the number of currently used grants will go down,
370 * but never up, so we will always be able to remove the requested
371 * number of grants.
372 */
373
374 total = num_clean;
375
376 pr_debug(DRV_PFX "Going to purge %u persistent grants\n", num_clean);
377
378 INIT_LIST_HEAD(&blkif->persistent_purge_list);
379 root = &blkif->persistent_gnts;
380purge_list:
381 foreach_grant_safe(persistent_gnt, n, root, node) {
382 BUG_ON(persistent_gnt->handle ==
383 BLKBACK_INVALID_HANDLE);
384
385 if (clean_used) {
386 clear_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags);
387 continue;
388 }
389
390 if (test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags))
391 continue;
392 if (!scan_used &&
393 (test_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags)))
394 continue;
395
396 rb_erase(&persistent_gnt->node, root);
397 list_add(&persistent_gnt->remove_node,
398 &blkif->persistent_purge_list);
399 if (--num_clean == 0)
400 goto finished;
401 }
402 /*
403 * If we get here it means we also need to start cleaning
404 * grants that were used since last purge in order to cope
405 * with the requested num
406 */
407 if (!scan_used && !clean_used) {
408 pr_debug(DRV_PFX "Still missing %u purged frames\n", num_clean);
409 scan_used = true;
410 goto purge_list;
411 }
412finished:
413 if (!clean_used) {
414 pr_debug(DRV_PFX "Finished scanning for grants to clean, removing used flag\n");
415 clean_used = true;
416 goto purge_list;
417 }
418
419 blkif->persistent_gnt_c -= (total - num_clean);
420 blkif->vbd.overflow_max_grants = 0;
421
422 /* We can defer this work */
423 INIT_WORK(&blkif->persistent_purge_work, unmap_purged_grants);
424 schedule_work(&blkif->persistent_purge_work);
425 pr_debug(DRV_PFX "Purged %u/%u\n", (total - num_clean), total);
426 return;
427}
428
254/* 429/*
255 * Retrieve from the 'pending_reqs' a free pending_req structure to be used. 430 * Retrieve from the 'pending_reqs' a free pending_req structure to be used.
256 */ 431 */
257static struct pending_req *alloc_req(void) 432static struct pending_req *alloc_req(struct xen_blkif *blkif)
258{ 433{
259 struct pending_req *req = NULL; 434 struct pending_req *req = NULL;
260 unsigned long flags; 435 unsigned long flags;
261 436
262 spin_lock_irqsave(&blkbk->pending_free_lock, flags); 437 spin_lock_irqsave(&blkif->pending_free_lock, flags);
263 if (!list_empty(&blkbk->pending_free)) { 438 if (!list_empty(&blkif->pending_free)) {
264 req = list_entry(blkbk->pending_free.next, struct pending_req, 439 req = list_entry(blkif->pending_free.next, struct pending_req,
265 free_list); 440 free_list);
266 list_del(&req->free_list); 441 list_del(&req->free_list);
267 } 442 }
268 spin_unlock_irqrestore(&blkbk->pending_free_lock, flags); 443 spin_unlock_irqrestore(&blkif->pending_free_lock, flags);
269 return req; 444 return req;
270} 445}
271 446
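
To make the purge budget computed in purge_persistent_gnt() in the hunk above concrete, here is a minimal userspace sketch (not kernel code) that reproduces the arithmetic for the default module parameters, assuming the persistent grant tree is exactly full:

	#include <stdio.h>

	int main(void)
	{
		unsigned int max_pgrants = 1056;   /* xen_blkif_max_pgrants default */
		unsigned int percent = 5;          /* LRU_PERCENT_CLEAN */
		unsigned int gnt_c = 1056;         /* example: persistent_gnt_c == limit */
		unsigned int num_clean;

		num_clean = (max_pgrants / 100) * percent;  /* 10 * 5 = 50 */
		num_clean += gnt_c - max_pgrants;           /* nothing above the limit here */
		if (num_clean > gnt_c)                      /* min(persistent_gnt_c, num_clean) */
			num_clean = gnt_c;
		/* the kernel additionally skips the pass when fewer than num_clean
		 * grants are currently unused */
		printf("grants purged per LRU pass: %u\n", num_clean);  /* 50 */
		return 0;
	}

So with the defaults each pass reclaims at most 5% of the limit (50 grants), plus any excess over xen_blkif_max_pgrants.
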
@@ -273,17 +448,17 @@ static struct pending_req *alloc_req(void)
273 * Return the 'pending_req' structure back to the freepool. We also 448 * Return the 'pending_req' structure back to the freepool. We also
274 * wake up the thread if it was waiting for a free page. 449 * wake up the thread if it was waiting for a free page.
275 */ 450 */
276static void free_req(struct pending_req *req) 451static void free_req(struct xen_blkif *blkif, struct pending_req *req)
277{ 452{
278 unsigned long flags; 453 unsigned long flags;
279 int was_empty; 454 int was_empty;
280 455
281 spin_lock_irqsave(&blkbk->pending_free_lock, flags); 456 spin_lock_irqsave(&blkif->pending_free_lock, flags);
282 was_empty = list_empty(&blkbk->pending_free); 457 was_empty = list_empty(&blkif->pending_free);
283 list_add(&req->free_list, &blkbk->pending_free); 458 list_add(&req->free_list, &blkif->pending_free);
284 spin_unlock_irqrestore(&blkbk->pending_free_lock, flags); 459 spin_unlock_irqrestore(&blkif->pending_free_lock, flags);
285 if (was_empty) 460 if (was_empty)
286 wake_up(&blkbk->pending_free_wq); 461 wake_up(&blkif->pending_free_wq);
287} 462}
288 463
289/* 464/*
@@ -382,10 +557,12 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
382static void print_stats(struct xen_blkif *blkif) 557static void print_stats(struct xen_blkif *blkif)
383{ 558{
384 pr_info("xen-blkback (%s): oo %3llu | rd %4llu | wr %4llu | f %4llu" 559 pr_info("xen-blkback (%s): oo %3llu | rd %4llu | wr %4llu | f %4llu"
385 " | ds %4llu\n", 560 " | ds %4llu | pg: %4u/%4d\n",
386 current->comm, blkif->st_oo_req, 561 current->comm, blkif->st_oo_req,
387 blkif->st_rd_req, blkif->st_wr_req, 562 blkif->st_rd_req, blkif->st_wr_req,
388 blkif->st_f_req, blkif->st_ds_req); 563 blkif->st_f_req, blkif->st_ds_req,
564 blkif->persistent_gnt_c,
565 xen_blkif_max_pgrants);
389 blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); 566 blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
390 blkif->st_rd_req = 0; 567 blkif->st_rd_req = 0;
391 blkif->st_wr_req = 0; 568 blkif->st_wr_req = 0;
@@ -397,6 +574,8 @@ int xen_blkif_schedule(void *arg)
397{ 574{
398 struct xen_blkif *blkif = arg; 575 struct xen_blkif *blkif = arg;
399 struct xen_vbd *vbd = &blkif->vbd; 576 struct xen_vbd *vbd = &blkif->vbd;
577 unsigned long timeout;
578 int ret;
400 579
401 xen_blkif_get(blkif); 580 xen_blkif_get(blkif);
402 581
@@ -406,27 +585,52 @@ int xen_blkif_schedule(void *arg)
406 if (unlikely(vbd->size != vbd_sz(vbd))) 585 if (unlikely(vbd->size != vbd_sz(vbd)))
407 xen_vbd_resize(blkif); 586 xen_vbd_resize(blkif);
408 587
409 wait_event_interruptible( 588 timeout = msecs_to_jiffies(LRU_INTERVAL);
589
590 timeout = wait_event_interruptible_timeout(
410 blkif->wq, 591 blkif->wq,
411 blkif->waiting_reqs || kthread_should_stop()); 592 blkif->waiting_reqs || kthread_should_stop(),
412 wait_event_interruptible( 593 timeout);
413 blkbk->pending_free_wq, 594 if (timeout == 0)
414 !list_empty(&blkbk->pending_free) || 595 goto purge_gnt_list;
415 kthread_should_stop()); 596 timeout = wait_event_interruptible_timeout(
597 blkif->pending_free_wq,
598 !list_empty(&blkif->pending_free) ||
599 kthread_should_stop(),
600 timeout);
601 if (timeout == 0)
602 goto purge_gnt_list;
416 603
417 blkif->waiting_reqs = 0; 604 blkif->waiting_reqs = 0;
418 smp_mb(); /* clear flag *before* checking for work */ 605 smp_mb(); /* clear flag *before* checking for work */
419 606
420 if (do_block_io_op(blkif)) 607 ret = do_block_io_op(blkif);
608 if (ret > 0)
421 blkif->waiting_reqs = 1; 609 blkif->waiting_reqs = 1;
610 if (ret == -EACCES)
611 wait_event_interruptible(blkif->shutdown_wq,
612 kthread_should_stop());
613
614purge_gnt_list:
615 if (blkif->vbd.feature_gnt_persistent &&
616 time_after(jiffies, blkif->next_lru)) {
617 purge_persistent_gnt(blkif);
618 blkif->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL);
619 }
620
621 /* Shrink if we have more than xen_blkif_max_buffer_pages */
622 shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages);
422 623
423 if (log_stats && time_after(jiffies, blkif->st_print)) 624 if (log_stats && time_after(jiffies, blkif->st_print))
424 print_stats(blkif); 625 print_stats(blkif);
425 } 626 }
426 627
628 /* Since we are shutting down remove all pages from the buffer */
629 shrink_free_pagepool(blkif, 0 /* All */);
630
427 /* Free all persistent grant pages */ 631 /* Free all persistent grant pages */
428 if (!RB_EMPTY_ROOT(&blkif->persistent_gnts)) 632 if (!RB_EMPTY_ROOT(&blkif->persistent_gnts))
429 free_persistent_gnts(&blkif->persistent_gnts, 633 free_persistent_gnts(blkif, &blkif->persistent_gnts,
430 blkif->persistent_gnt_c); 634 blkif->persistent_gnt_c);
431 635
432 BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); 636 BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts));
@@ -441,148 +645,98 @@ int xen_blkif_schedule(void *arg)
441 return 0; 645 return 0;
442} 646}
443 647
444struct seg_buf {
445 unsigned int offset;
446 unsigned int nsec;
447};
448/* 648/*
449 * Unmap the grant references, and also remove the M2P over-rides 649 * Unmap the grant references, and also remove the M2P over-rides
450 * used in the 'pending_req'. 650 * used in the 'pending_req'.
451 */ 651 */
452static void xen_blkbk_unmap(struct pending_req *req) 652static void xen_blkbk_unmap(struct xen_blkif *blkif,
653 struct grant_page *pages[],
654 int num)
453{ 655{
454 struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 656 struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
455 struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 657 struct page *unmap_pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
456 unsigned int i, invcount = 0; 658 unsigned int i, invcount = 0;
457 grant_handle_t handle;
458 int ret; 659 int ret;
459 660
460 for (i = 0; i < req->nr_pages; i++) { 661 for (i = 0; i < num; i++) {
461 if (!test_bit(i, req->unmap_seg)) 662 if (pages[i]->persistent_gnt != NULL) {
663 put_persistent_gnt(blkif, pages[i]->persistent_gnt);
462 continue; 664 continue;
463 handle = pending_handle(req, i); 665 }
464 if (handle == BLKBACK_INVALID_HANDLE) 666 if (pages[i]->handle == BLKBACK_INVALID_HANDLE)
465 continue; 667 continue;
466 gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i), 668 unmap_pages[invcount] = pages[i]->page;
467 GNTMAP_host_map, handle); 669 gnttab_set_unmap_op(&unmap[invcount], vaddr(pages[i]->page),
468 pending_handle(req, i) = BLKBACK_INVALID_HANDLE; 670 GNTMAP_host_map, pages[i]->handle);
469 pages[invcount] = virt_to_page(vaddr(req, i)); 671 pages[i]->handle = BLKBACK_INVALID_HANDLE;
470 invcount++; 672 if (++invcount == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
673 ret = gnttab_unmap_refs(unmap, NULL, unmap_pages,
674 invcount);
675 BUG_ON(ret);
676 put_free_pages(blkif, unmap_pages, invcount);
677 invcount = 0;
678 }
679 }
680 if (invcount) {
681 ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount);
682 BUG_ON(ret);
683 put_free_pages(blkif, unmap_pages, invcount);
471 } 684 }
472
473 ret = gnttab_unmap_refs(unmap, NULL, pages, invcount);
474 BUG_ON(ret);
475} 685}
476 686
477static int xen_blkbk_map(struct blkif_request *req, 687static int xen_blkbk_map(struct xen_blkif *blkif,
478 struct pending_req *pending_req, 688 struct grant_page *pages[],
479 struct seg_buf seg[], 689 int num, bool ro)
480 struct page *pages[])
481{ 690{
482 struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 691 struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
483 struct persistent_gnt *persistent_gnts[BLKIF_MAX_SEGMENTS_PER_REQUEST];
484 struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 692 struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST];
485 struct persistent_gnt *persistent_gnt = NULL; 693 struct persistent_gnt *persistent_gnt = NULL;
486 struct xen_blkif *blkif = pending_req->blkif;
487 phys_addr_t addr = 0; 694 phys_addr_t addr = 0;
488 int i, j; 695 int i, seg_idx, new_map_idx;
489 bool new_map;
490 int nseg = req->u.rw.nr_segments;
491 int segs_to_map = 0; 696 int segs_to_map = 0;
492 int ret = 0; 697 int ret = 0;
698 int last_map = 0, map_until = 0;
493 int use_persistent_gnts; 699 int use_persistent_gnts;
494 700
495 use_persistent_gnts = (blkif->vbd.feature_gnt_persistent); 701 use_persistent_gnts = (blkif->vbd.feature_gnt_persistent);
496 702
497 BUG_ON(blkif->persistent_gnt_c >
498 max_mapped_grant_pages(pending_req->blkif->blk_protocol));
499
500 /* 703 /*
501 * Fill out preq.nr_sects with proper amount of sectors, and setup 704 * Fill out preq.nr_sects with proper amount of sectors, and setup
502 * assign map[..] with the PFN of the page in our domain with the 705 * assign map[..] with the PFN of the page in our domain with the
503 * corresponding grant reference for each page. 706 * corresponding grant reference for each page.
504 */ 707 */
505 for (i = 0; i < nseg; i++) { 708again:
709 for (i = map_until; i < num; i++) {
506 uint32_t flags; 710 uint32_t flags;
507 711
508 if (use_persistent_gnts) 712 if (use_persistent_gnts)
509 persistent_gnt = get_persistent_gnt( 713 persistent_gnt = get_persistent_gnt(
510 &blkif->persistent_gnts, 714 blkif,
511 req->u.rw.seg[i].gref); 715 pages[i]->gref);
512 716
513 if (persistent_gnt) { 717 if (persistent_gnt) {
514 /* 718 /*
515 * We are using persistent grants and 719 * We are using persistent grants and
516 * the grant is already mapped 720 * the grant is already mapped
517 */ 721 */
518 new_map = false; 722 pages[i]->page = persistent_gnt->page;
519 } else if (use_persistent_gnts && 723 pages[i]->persistent_gnt = persistent_gnt;
520 blkif->persistent_gnt_c <
521 max_mapped_grant_pages(blkif->blk_protocol)) {
522 /*
523 * We are using persistent grants, the grant is
524 * not mapped but we have room for it
525 */
526 new_map = true;
527 persistent_gnt = kmalloc(
528 sizeof(struct persistent_gnt),
529 GFP_KERNEL);
530 if (!persistent_gnt)
531 return -ENOMEM;
532 if (alloc_xenballooned_pages(1, &persistent_gnt->page,
533 false)) {
534 kfree(persistent_gnt);
535 return -ENOMEM;
536 }
537 persistent_gnt->gnt = req->u.rw.seg[i].gref;
538 persistent_gnt->handle = BLKBACK_INVALID_HANDLE;
539
540 pages_to_gnt[segs_to_map] =
541 persistent_gnt->page;
542 addr = (unsigned long) pfn_to_kaddr(
543 page_to_pfn(persistent_gnt->page));
544
545 add_persistent_gnt(&blkif->persistent_gnts,
546 persistent_gnt);
547 blkif->persistent_gnt_c++;
548 pr_debug(DRV_PFX " grant %u added to the tree of persistent grants, using %u/%u\n",
549 persistent_gnt->gnt, blkif->persistent_gnt_c,
550 max_mapped_grant_pages(blkif->blk_protocol));
551 } else { 724 } else {
552 /* 725 if (get_free_page(blkif, &pages[i]->page))
553 * We are either using persistent grants and 726 goto out_of_memory;
554 * hit the maximum limit of grants mapped, 727 addr = vaddr(pages[i]->page);
555 * or we are not using persistent grants. 728 pages_to_gnt[segs_to_map] = pages[i]->page;
556 */ 729 pages[i]->persistent_gnt = NULL;
557 if (use_persistent_gnts &&
558 !blkif->vbd.overflow_max_grants) {
559 blkif->vbd.overflow_max_grants = 1;
560 pr_alert(DRV_PFX " domain %u, device %#x is using maximum number of persistent grants\n",
561 blkif->domid, blkif->vbd.handle);
562 }
563 new_map = true;
564 pages[i] = blkbk->pending_page(pending_req, i);
565 addr = vaddr(pending_req, i);
566 pages_to_gnt[segs_to_map] =
567 blkbk->pending_page(pending_req, i);
568 }
569
570 if (persistent_gnt) {
571 pages[i] = persistent_gnt->page;
572 persistent_gnts[i] = persistent_gnt;
573 } else {
574 persistent_gnts[i] = NULL;
575 }
576
577 if (new_map) {
578 flags = GNTMAP_host_map; 730 flags = GNTMAP_host_map;
579 if (!persistent_gnt && 731 if (!use_persistent_gnts && ro)
580 (pending_req->operation != BLKIF_OP_READ))
581 flags |= GNTMAP_readonly; 732 flags |= GNTMAP_readonly;
582 gnttab_set_map_op(&map[segs_to_map++], addr, 733 gnttab_set_map_op(&map[segs_to_map++], addr,
583 flags, req->u.rw.seg[i].gref, 734 flags, pages[i]->gref,
584 blkif->domid); 735 blkif->domid);
585 } 736 }
737 map_until = i + 1;
738 if (segs_to_map == BLKIF_MAX_SEGMENTS_PER_REQUEST)
739 break;
586 } 740 }
587 741
588 if (segs_to_map) { 742 if (segs_to_map) {
@@ -595,49 +749,133 @@ static int xen_blkbk_map(struct blkif_request *req,
595 * so that when we access vaddr(pending_req,i) it has the contents of 749 * so that when we access vaddr(pending_req,i) it has the contents of
596 * the page from the other domain. 750 * the page from the other domain.
597 */ 751 */
598 bitmap_zero(pending_req->unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST); 752 for (seg_idx = last_map, new_map_idx = 0; seg_idx < map_until; seg_idx++) {
599 for (i = 0, j = 0; i < nseg; i++) { 753 if (!pages[seg_idx]->persistent_gnt) {
600 if (!persistent_gnts[i] ||
601 persistent_gnts[i]->handle == BLKBACK_INVALID_HANDLE) {
602 /* This is a newly mapped grant */ 754 /* This is a newly mapped grant */
603 BUG_ON(j >= segs_to_map); 755 BUG_ON(new_map_idx >= segs_to_map);
604 if (unlikely(map[j].status != 0)) { 756 if (unlikely(map[new_map_idx].status != 0)) {
605 pr_debug(DRV_PFX "invalid buffer -- could not remap it\n"); 757 pr_debug(DRV_PFX "invalid buffer -- could not remap it\n");
606 map[j].handle = BLKBACK_INVALID_HANDLE; 758 pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE;
607 ret |= 1; 759 ret |= 1;
608 if (persistent_gnts[i]) { 760 goto next;
609 rb_erase(&persistent_gnts[i]->node,
610 &blkif->persistent_gnts);
611 blkif->persistent_gnt_c--;
612 kfree(persistent_gnts[i]);
613 persistent_gnts[i] = NULL;
614 }
615 } 761 }
762 pages[seg_idx]->handle = map[new_map_idx].handle;
763 } else {
764 continue;
616 } 765 }
617 if (persistent_gnts[i]) { 766 if (use_persistent_gnts &&
618 if (persistent_gnts[i]->handle == 767 blkif->persistent_gnt_c < xen_blkif_max_pgrants) {
619 BLKBACK_INVALID_HANDLE) { 768 /*
769 * We are using persistent grants, the grant is
770 * not mapped but we might have room for it.
771 */
772 persistent_gnt = kmalloc(sizeof(struct persistent_gnt),
773 GFP_KERNEL);
774 if (!persistent_gnt) {
620 /* 775 /*
621 * If this is a new persistent grant 776 * If we don't have enough memory to
622 * save the handler 777 * allocate the persistent_gnt struct
778 * map this grant non-persistenly
623 */ 779 */
624 persistent_gnts[i]->handle = map[j++].handle; 780 goto next;
625 } 781 }
626 pending_handle(pending_req, i) = 782 persistent_gnt->gnt = map[new_map_idx].ref;
627 persistent_gnts[i]->handle; 783 persistent_gnt->handle = map[new_map_idx].handle;
784 persistent_gnt->page = pages[seg_idx]->page;
785 if (add_persistent_gnt(blkif,
786 persistent_gnt)) {
787 kfree(persistent_gnt);
788 persistent_gnt = NULL;
789 goto next;
790 }
791 pages[seg_idx]->persistent_gnt = persistent_gnt;
792 pr_debug(DRV_PFX " grant %u added to the tree of persistent grants, using %u/%u\n",
793 persistent_gnt->gnt, blkif->persistent_gnt_c,
794 xen_blkif_max_pgrants);
795 goto next;
796 }
797 if (use_persistent_gnts && !blkif->vbd.overflow_max_grants) {
798 blkif->vbd.overflow_max_grants = 1;
799 pr_debug(DRV_PFX " domain %u, device %#x is using maximum number of persistent grants\n",
800 blkif->domid, blkif->vbd.handle);
801 }
802 /*
803 * We could not map this grant persistently, so use it as
804 * a non-persistent grant.
805 */
806next:
807 new_map_idx++;
808 }
809 segs_to_map = 0;
810 last_map = map_until;
811 if (map_until != num)
812 goto again;
628 813
629 if (ret) 814 return ret;
630 continue; 815
631 } else { 816out_of_memory:
632 pending_handle(pending_req, i) = map[j++].handle; 817 pr_alert(DRV_PFX "%s: out of memory\n", __func__);
633 bitmap_set(pending_req->unmap_seg, i, 1); 818 put_free_pages(blkif, pages_to_gnt, segs_to_map);
819 return -ENOMEM;
820}
821
822static int xen_blkbk_map_seg(struct pending_req *pending_req)
823{
824 int rc;
825
826 rc = xen_blkbk_map(pending_req->blkif, pending_req->segments,
827 pending_req->nr_pages,
828 (pending_req->operation != BLKIF_OP_READ));
829
830 return rc;
831}
634 832
635 if (ret) 833static int xen_blkbk_parse_indirect(struct blkif_request *req,
636 continue; 834 struct pending_req *pending_req,
835 struct seg_buf seg[],
836 struct phys_req *preq)
837{
838 struct grant_page **pages = pending_req->indirect_pages;
839 struct xen_blkif *blkif = pending_req->blkif;
840 int indirect_grefs, rc, n, nseg, i;
841 struct blkif_request_segment_aligned *segments = NULL;
842
843 nseg = pending_req->nr_pages;
844 indirect_grefs = INDIRECT_PAGES(nseg);
845 BUG_ON(indirect_grefs > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);
846
847 for (i = 0; i < indirect_grefs; i++)
848 pages[i]->gref = req->u.indirect.indirect_grefs[i];
849
850 rc = xen_blkbk_map(blkif, pages, indirect_grefs, true);
851 if (rc)
852 goto unmap;
853
854 for (n = 0, i = 0; n < nseg; n++) {
855 if ((n % SEGS_PER_INDIRECT_FRAME) == 0) {
856 /* Map indirect segments */
857 if (segments)
858 kunmap_atomic(segments);
859 segments = kmap_atomic(pages[n/SEGS_PER_INDIRECT_FRAME]->page);
860 }
861 i = n % SEGS_PER_INDIRECT_FRAME;
862 pending_req->segments[n]->gref = segments[i].gref;
863 seg[n].nsec = segments[i].last_sect -
864 segments[i].first_sect + 1;
865 seg[n].offset = (segments[i].first_sect << 9);
866 if ((segments[i].last_sect >= (PAGE_SIZE >> 9)) ||
867 (segments[i].last_sect < segments[i].first_sect)) {
868 rc = -EINVAL;
869 goto unmap;
637 } 870 }
638 seg[i].offset = (req->u.rw.seg[i].first_sect << 9); 871 preq->nr_sects += seg[n].nsec;
639 } 872 }
640 return ret; 873
874unmap:
875 if (segments)
876 kunmap_atomic(segments);
877 xen_blkbk_unmap(blkif, pages, indirect_grefs);
878 return rc;
641} 879}
642 880
643static int dispatch_discard_io(struct xen_blkif *blkif, 881static int dispatch_discard_io(struct xen_blkif *blkif,
@@ -647,7 +885,18 @@ static int dispatch_discard_io(struct xen_blkif *blkif,
647 int status = BLKIF_RSP_OKAY; 885 int status = BLKIF_RSP_OKAY;
648 struct block_device *bdev = blkif->vbd.bdev; 886 struct block_device *bdev = blkif->vbd.bdev;
649 unsigned long secure; 887 unsigned long secure;
888 struct phys_req preq;
889
890 preq.sector_number = req->u.discard.sector_number;
891 preq.nr_sects = req->u.discard.nr_sectors;
650 892
893 err = xen_vbd_translate(&preq, blkif, WRITE);
894 if (err) {
895 pr_warn(DRV_PFX "access denied: DISCARD [%llu->%llu] on dev=%04x\n",
896 preq.sector_number,
897 preq.sector_number + preq.nr_sects, blkif->vbd.pdevice);
898 goto fail_response;
899 }
651 blkif->st_ds_req++; 900 blkif->st_ds_req++;
652 901
653 xen_blkif_get(blkif); 902 xen_blkif_get(blkif);
@@ -658,7 +907,7 @@ static int dispatch_discard_io(struct xen_blkif *blkif,
658 err = blkdev_issue_discard(bdev, req->u.discard.sector_number, 907 err = blkdev_issue_discard(bdev, req->u.discard.sector_number,
659 req->u.discard.nr_sectors, 908 req->u.discard.nr_sectors,
660 GFP_KERNEL, secure); 909 GFP_KERNEL, secure);
661 910fail_response:
662 if (err == -EOPNOTSUPP) { 911 if (err == -EOPNOTSUPP) {
663 pr_debug(DRV_PFX "discard op failed, not supported\n"); 912 pr_debug(DRV_PFX "discard op failed, not supported\n");
664 status = BLKIF_RSP_EOPNOTSUPP; 913 status = BLKIF_RSP_EOPNOTSUPP;
@@ -674,7 +923,7 @@ static int dispatch_other_io(struct xen_blkif *blkif,
674 struct blkif_request *req, 923 struct blkif_request *req,
675 struct pending_req *pending_req) 924 struct pending_req *pending_req)
676{ 925{
677 free_req(pending_req); 926 free_req(blkif, pending_req);
678 make_response(blkif, req->u.other.id, req->operation, 927 make_response(blkif, req->u.other.id, req->operation,
679 BLKIF_RSP_EOPNOTSUPP); 928 BLKIF_RSP_EOPNOTSUPP);
680 return -EIO; 929 return -EIO;
@@ -726,7 +975,9 @@ static void __end_block_io_op(struct pending_req *pending_req, int error)
726 * the proper response on the ring. 975 * the proper response on the ring.
727 */ 976 */
728 if (atomic_dec_and_test(&pending_req->pendcnt)) { 977 if (atomic_dec_and_test(&pending_req->pendcnt)) {
729 xen_blkbk_unmap(pending_req); 978 xen_blkbk_unmap(pending_req->blkif,
979 pending_req->segments,
980 pending_req->nr_pages);
730 make_response(pending_req->blkif, pending_req->id, 981 make_response(pending_req->blkif, pending_req->id,
731 pending_req->operation, pending_req->status); 982 pending_req->operation, pending_req->status);
732 xen_blkif_put(pending_req->blkif); 983 xen_blkif_put(pending_req->blkif);
@@ -734,7 +985,7 @@ static void __end_block_io_op(struct pending_req *pending_req, int error)
734 if (atomic_read(&pending_req->blkif->drain)) 985 if (atomic_read(&pending_req->blkif->drain))
735 complete(&pending_req->blkif->drain_complete); 986 complete(&pending_req->blkif->drain_complete);
736 } 987 }
737 free_req(pending_req); 988 free_req(pending_req->blkif, pending_req);
738 } 989 }
739} 990}
740 991
@@ -767,6 +1018,12 @@ __do_block_io_op(struct xen_blkif *blkif)
767 rp = blk_rings->common.sring->req_prod; 1018 rp = blk_rings->common.sring->req_prod;
768 rmb(); /* Ensure we see queued requests up to 'rp'. */ 1019 rmb(); /* Ensure we see queued requests up to 'rp'. */
769 1020
1021 if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) {
1022 rc = blk_rings->common.rsp_prod_pvt;
1023 pr_warn(DRV_PFX "Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n",
1024 rp, rc, rp - rc, blkif->vbd.pdevice);
1025 return -EACCES;
1026 }
770 while (rc != rp) { 1027 while (rc != rp) {
771 1028
772 if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc)) 1029 if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
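
The RING_REQUEST_PROD_OVERFLOW guard added above rejects a producer index that has run further ahead than the ring can hold. With the 32-entry ring mentioned in the comment removed earlier in this file, the check effectively becomes

	rp - rsp_prod_pvt > 32  ->  bogus frontend: return -EACCES and park on shutdown_wq
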
@@ -777,7 +1034,7 @@ __do_block_io_op(struct xen_blkif *blkif)
777 break; 1034 break;
778 } 1035 }
779 1036
780 pending_req = alloc_req(); 1037 pending_req = alloc_req(blkif);
781 if (NULL == pending_req) { 1038 if (NULL == pending_req) {
782 blkif->st_oo_req++; 1039 blkif->st_oo_req++;
783 more_to_do = 1; 1040 more_to_do = 1;
@@ -807,11 +1064,12 @@ __do_block_io_op(struct xen_blkif *blkif)
807 case BLKIF_OP_WRITE: 1064 case BLKIF_OP_WRITE:
808 case BLKIF_OP_WRITE_BARRIER: 1065 case BLKIF_OP_WRITE_BARRIER:
809 case BLKIF_OP_FLUSH_DISKCACHE: 1066 case BLKIF_OP_FLUSH_DISKCACHE:
1067 case BLKIF_OP_INDIRECT:
810 if (dispatch_rw_block_io(blkif, &req, pending_req)) 1068 if (dispatch_rw_block_io(blkif, &req, pending_req))
811 goto done; 1069 goto done;
812 break; 1070 break;
813 case BLKIF_OP_DISCARD: 1071 case BLKIF_OP_DISCARD:
814 free_req(pending_req); 1072 free_req(blkif, pending_req);
815 if (dispatch_discard_io(blkif, &req)) 1073 if (dispatch_discard_io(blkif, &req))
816 goto done; 1074 goto done;
817 break; 1075 break;
@@ -853,17 +1111,28 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
853 struct pending_req *pending_req) 1111 struct pending_req *pending_req)
854{ 1112{
855 struct phys_req preq; 1113 struct phys_req preq;
856 struct seg_buf seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 1114 struct seg_buf *seg = pending_req->seg;
857 unsigned int nseg; 1115 unsigned int nseg;
858 struct bio *bio = NULL; 1116 struct bio *bio = NULL;
859 struct bio *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 1117 struct bio **biolist = pending_req->biolist;
860 int i, nbio = 0; 1118 int i, nbio = 0;
861 int operation; 1119 int operation;
862 struct blk_plug plug; 1120 struct blk_plug plug;
863 bool drain = false; 1121 bool drain = false;
864 struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 1122 struct grant_page **pages = pending_req->segments;
1123 unsigned short req_operation;
1124
1125 req_operation = req->operation == BLKIF_OP_INDIRECT ?
1126 req->u.indirect.indirect_op : req->operation;
1127 if ((req->operation == BLKIF_OP_INDIRECT) &&
1128 (req_operation != BLKIF_OP_READ) &&
1129 (req_operation != BLKIF_OP_WRITE)) {
1130 pr_debug(DRV_PFX "Invalid indirect operation (%u)\n",
1131 req_operation);
1132 goto fail_response;
1133 }
865 1134
866 switch (req->operation) { 1135 switch (req_operation) {
867 case BLKIF_OP_READ: 1136 case BLKIF_OP_READ:
868 blkif->st_rd_req++; 1137 blkif->st_rd_req++;
869 operation = READ; 1138 operation = READ;
@@ -885,33 +1154,47 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
885 } 1154 }
886 1155
887 /* Check that the number of segments is sane. */ 1156 /* Check that the number of segments is sane. */
888 nseg = req->u.rw.nr_segments; 1157 nseg = req->operation == BLKIF_OP_INDIRECT ?
1158 req->u.indirect.nr_segments : req->u.rw.nr_segments;
889 1159
890 if (unlikely(nseg == 0 && operation != WRITE_FLUSH) || 1160 if (unlikely(nseg == 0 && operation != WRITE_FLUSH) ||
891 unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) { 1161 unlikely((req->operation != BLKIF_OP_INDIRECT) &&
1162 (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) ||
1163 unlikely((req->operation == BLKIF_OP_INDIRECT) &&
1164 (nseg > MAX_INDIRECT_SEGMENTS))) {
892 pr_debug(DRV_PFX "Bad number of segments in request (%d)\n", 1165 pr_debug(DRV_PFX "Bad number of segments in request (%d)\n",
893 nseg); 1166 nseg);
894 /* Haven't submitted any bio's yet. */ 1167 /* Haven't submitted any bio's yet. */
895 goto fail_response; 1168 goto fail_response;
896 } 1169 }
897 1170
898 preq.sector_number = req->u.rw.sector_number;
899 preq.nr_sects = 0; 1171 preq.nr_sects = 0;
900 1172
901 pending_req->blkif = blkif; 1173 pending_req->blkif = blkif;
902 pending_req->id = req->u.rw.id; 1174 pending_req->id = req->u.rw.id;
903 pending_req->operation = req->operation; 1175 pending_req->operation = req_operation;
904 pending_req->status = BLKIF_RSP_OKAY; 1176 pending_req->status = BLKIF_RSP_OKAY;
905 pending_req->nr_pages = nseg; 1177 pending_req->nr_pages = nseg;
906 1178
907 for (i = 0; i < nseg; i++) { 1179 if (req->operation != BLKIF_OP_INDIRECT) {
908 seg[i].nsec = req->u.rw.seg[i].last_sect - 1180 preq.dev = req->u.rw.handle;
909 req->u.rw.seg[i].first_sect + 1; 1181 preq.sector_number = req->u.rw.sector_number;
910 if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) || 1182 for (i = 0; i < nseg; i++) {
911 (req->u.rw.seg[i].last_sect < req->u.rw.seg[i].first_sect)) 1183 pages[i]->gref = req->u.rw.seg[i].gref;
1184 seg[i].nsec = req->u.rw.seg[i].last_sect -
1185 req->u.rw.seg[i].first_sect + 1;
1186 seg[i].offset = (req->u.rw.seg[i].first_sect << 9);
1187 if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
1188 (req->u.rw.seg[i].last_sect <
1189 req->u.rw.seg[i].first_sect))
1190 goto fail_response;
1191 preq.nr_sects += seg[i].nsec;
1192 }
1193 } else {
1194 preq.dev = req->u.indirect.handle;
1195 preq.sector_number = req->u.indirect.sector_number;
1196 if (xen_blkbk_parse_indirect(req, pending_req, seg, &preq))
912 goto fail_response; 1197 goto fail_response;
913 preq.nr_sects += seg[i].nsec;
914
915 } 1198 }
916 1199
917 if (xen_vbd_translate(&preq, blkif, operation) != 0) { 1200 if (xen_vbd_translate(&preq, blkif, operation) != 0) {
@@ -948,7 +1231,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
948 * the hypercall to unmap the grants - that is all done in 1231 * the hypercall to unmap the grants - that is all done in
949 * xen_blkbk_unmap. 1232 * xen_blkbk_unmap.
950 */ 1233 */
951 if (xen_blkbk_map(req, pending_req, seg, pages)) 1234 if (xen_blkbk_map_seg(pending_req))
952 goto fail_flush; 1235 goto fail_flush;
953 1236
954 /* 1237 /*
@@ -960,11 +1243,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
960 for (i = 0; i < nseg; i++) { 1243 for (i = 0; i < nseg; i++) {
961 while ((bio == NULL) || 1244 while ((bio == NULL) ||
962 (bio_add_page(bio, 1245 (bio_add_page(bio,
963 pages[i], 1246 pages[i]->page,
964 seg[i].nsec << 9, 1247 seg[i].nsec << 9,
965 seg[i].offset) == 0)) { 1248 seg[i].offset) == 0)) {
966 1249
967 bio = bio_alloc(GFP_KERNEL, nseg-i); 1250 int nr_iovecs = min_t(int, (nseg-i), BIO_MAX_PAGES);
1251 bio = bio_alloc(GFP_KERNEL, nr_iovecs);
968 if (unlikely(bio == NULL)) 1252 if (unlikely(bio == NULL))
969 goto fail_put_bio; 1253 goto fail_put_bio;
970 1254
@@ -1009,11 +1293,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
1009 return 0; 1293 return 0;
1010 1294
1011 fail_flush: 1295 fail_flush:
1012 xen_blkbk_unmap(pending_req); 1296 xen_blkbk_unmap(blkif, pending_req->segments,
1297 pending_req->nr_pages);
1013 fail_response: 1298 fail_response:
1014 /* Haven't submitted any bio's yet. */ 1299 /* Haven't submitted any bio's yet. */
1015 make_response(blkif, req->u.rw.id, req->operation, BLKIF_RSP_ERROR); 1300 make_response(blkif, req->u.rw.id, req_operation, BLKIF_RSP_ERROR);
1016 free_req(pending_req); 1301 free_req(blkif, pending_req);
1017 msleep(1); /* back off a bit */ 1302 msleep(1); /* back off a bit */
1018 return -EIO; 1303 return -EIO;
1019 1304
@@ -1070,73 +1355,20 @@ static void make_response(struct xen_blkif *blkif, u64 id,
1070 1355
1071static int __init xen_blkif_init(void) 1356static int __init xen_blkif_init(void)
1072{ 1357{
1073 int i, mmap_pages;
1074 int rc = 0; 1358 int rc = 0;
1075 1359
1076 if (!xen_domain()) 1360 if (!xen_domain())
1077 return -ENODEV; 1361 return -ENODEV;
1078 1362
1079 blkbk = kzalloc(sizeof(struct xen_blkbk), GFP_KERNEL);
1080 if (!blkbk) {
1081 pr_alert(DRV_PFX "%s: out of memory!\n", __func__);
1082 return -ENOMEM;
1083 }
1084
1085 mmap_pages = xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
1086
1087 blkbk->pending_reqs = kzalloc(sizeof(blkbk->pending_reqs[0]) *
1088 xen_blkif_reqs, GFP_KERNEL);
1089 blkbk->pending_grant_handles = kmalloc(sizeof(blkbk->pending_grant_handles[0]) *
1090 mmap_pages, GFP_KERNEL);
1091 blkbk->pending_pages = kzalloc(sizeof(blkbk->pending_pages[0]) *
1092 mmap_pages, GFP_KERNEL);
1093
1094 if (!blkbk->pending_reqs || !blkbk->pending_grant_handles ||
1095 !blkbk->pending_pages) {
1096 rc = -ENOMEM;
1097 goto out_of_memory;
1098 }
1099
1100 for (i = 0; i < mmap_pages; i++) {
1101 blkbk->pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
1102 blkbk->pending_pages[i] = alloc_page(GFP_KERNEL);
1103 if (blkbk->pending_pages[i] == NULL) {
1104 rc = -ENOMEM;
1105 goto out_of_memory;
1106 }
1107 }
1108 rc = xen_blkif_interface_init(); 1363 rc = xen_blkif_interface_init();
1109 if (rc) 1364 if (rc)
1110 goto failed_init; 1365 goto failed_init;
1111 1366
1112 INIT_LIST_HEAD(&blkbk->pending_free);
1113 spin_lock_init(&blkbk->pending_free_lock);
1114 init_waitqueue_head(&blkbk->pending_free_wq);
1115
1116 for (i = 0; i < xen_blkif_reqs; i++)
1117 list_add_tail(&blkbk->pending_reqs[i].free_list,
1118 &blkbk->pending_free);
1119
1120 rc = xen_blkif_xenbus_init(); 1367 rc = xen_blkif_xenbus_init();
1121 if (rc) 1368 if (rc)
1122 goto failed_init; 1369 goto failed_init;
1123 1370
1124 return 0;
1125
1126 out_of_memory:
1127 pr_alert(DRV_PFX "%s: out of memory\n", __func__);
1128 failed_init: 1371 failed_init:
1129 kfree(blkbk->pending_reqs);
1130 kfree(blkbk->pending_grant_handles);
1131 if (blkbk->pending_pages) {
1132 for (i = 0; i < mmap_pages; i++) {
1133 if (blkbk->pending_pages[i])
1134 __free_page(blkbk->pending_pages[i]);
1135 }
1136 kfree(blkbk->pending_pages);
1137 }
1138 kfree(blkbk);
1139 blkbk = NULL;
1140 return rc; 1372 return rc;
1141} 1373}
1142 1374
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index 60103e2517ba..8d8807563d99 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -50,6 +50,19 @@
50 __func__, __LINE__, ##args) 50 __func__, __LINE__, ##args)
51 51
52 52
53/*
54 * This is the maximum number of segments that would be allowed in indirect
55 * requests. This value will also be passed to the frontend.
56 */
57#define MAX_INDIRECT_SEGMENTS 256
58
59#define SEGS_PER_INDIRECT_FRAME \
60 (PAGE_SIZE/sizeof(struct blkif_request_segment_aligned))
61#define MAX_INDIRECT_PAGES \
62 ((MAX_INDIRECT_SEGMENTS + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
63#define INDIRECT_PAGES(_segs) \
64 ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
65
53/* Not a real protocol. Used to generate ring structs which contain 66/* Not a real protocol. Used to generate ring structs which contain
54 * the elements common to all protocols only. This way we get a 67 * the elements common to all protocols only. This way we get a
55 * compiler-checkable way to use common struct elements, so we can 68 * compiler-checkable way to use common struct elements, so we can
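
Plugging typical sizes into the macros added above (an assumption for illustration: 4 KiB pages and an 8-byte struct blkif_request_segment_aligned):

	SEGS_PER_INDIRECT_FRAME = 4096 / 8 = 512 segments per indirect page
	MAX_INDIRECT_PAGES      = (256 + 512 - 1) / 512 = 1
	INDIRECT_PAGES(256)     = 1

so the default MAX_INDIRECT_SEGMENTS of 256 fits in a single indirect grant page.
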
@@ -83,12 +96,31 @@ struct blkif_x86_32_request_other {
83 uint64_t id; /* private guest value, echoed in resp */ 96 uint64_t id; /* private guest value, echoed in resp */
84} __attribute__((__packed__)); 97} __attribute__((__packed__));
85 98
99struct blkif_x86_32_request_indirect {
100 uint8_t indirect_op;
101 uint16_t nr_segments;
102 uint64_t id;
103 blkif_sector_t sector_number;
104 blkif_vdev_t handle;
105 uint16_t _pad1;
106 grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
107 /*
108 * The maximum number of indirect segments (and pages) that will
109 * be used is determined by MAX_INDIRECT_SEGMENTS, this value
110 * is also exported to the guest (via xenstore
111 * feature-max-indirect-segments entry), so the frontend knows how
112 * many indirect segments the backend supports.
113 */
114 uint64_t _pad2; /* make it 64 byte aligned */
115} __attribute__((__packed__));
116
86struct blkif_x86_32_request { 117struct blkif_x86_32_request {
87 uint8_t operation; /* BLKIF_OP_??? */ 118 uint8_t operation; /* BLKIF_OP_??? */
88 union { 119 union {
89 struct blkif_x86_32_request_rw rw; 120 struct blkif_x86_32_request_rw rw;
90 struct blkif_x86_32_request_discard discard; 121 struct blkif_x86_32_request_discard discard;
91 struct blkif_x86_32_request_other other; 122 struct blkif_x86_32_request_other other;
123 struct blkif_x86_32_request_indirect indirect;
92 } u; 124 } u;
93} __attribute__((__packed__)); 125} __attribute__((__packed__));
94 126
@@ -127,12 +159,32 @@ struct blkif_x86_64_request_other {
127 uint64_t id; /* private guest value, echoed in resp */ 159 uint64_t id; /* private guest value, echoed in resp */
128} __attribute__((__packed__)); 160} __attribute__((__packed__));
129 161
162struct blkif_x86_64_request_indirect {
163 uint8_t indirect_op;
164 uint16_t nr_segments;
165 uint32_t _pad1; /* offsetof(blkif_..,u.indirect.id)==8 */
166 uint64_t id;
167 blkif_sector_t sector_number;
168 blkif_vdev_t handle;
169 uint16_t _pad2;
170 grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
171 /*
172 * The maximum number of indirect segments (and pages) that will
173 * be used is determined by MAX_INDIRECT_SEGMENTS, this value
174 * is also exported to the guest (via xenstore
175 * feature-max-indirect-segments entry), so the frontend knows how
176 * many indirect segments the backend supports.
177 */
178 uint32_t _pad3; /* make it 64 byte aligned */
179} __attribute__((__packed__));
180
130struct blkif_x86_64_request { 181struct blkif_x86_64_request {
131 uint8_t operation; /* BLKIF_OP_??? */ 182 uint8_t operation; /* BLKIF_OP_??? */
132 union { 183 union {
133 struct blkif_x86_64_request_rw rw; 184 struct blkif_x86_64_request_rw rw;
134 struct blkif_x86_64_request_discard discard; 185 struct blkif_x86_64_request_discard discard;
135 struct blkif_x86_64_request_other other; 186 struct blkif_x86_64_request_other other;
187 struct blkif_x86_64_request_indirect indirect;
136 } u; 188 } u;
137} __attribute__((__packed__)); 189} __attribute__((__packed__));
138 190
@@ -182,12 +234,26 @@ struct xen_vbd {
182 234
183struct backend_info; 235struct backend_info;
184 236
237/* Number of available flags */
238#define PERSISTENT_GNT_FLAGS_SIZE 2
239/* This persistent grant is currently in use */
240#define PERSISTENT_GNT_ACTIVE 0
241/*
242 * This persistent grant has been used, this flag is set when we remove the
243 * PERSISTENT_GNT_ACTIVE, to know that this grant has been used recently.
244 */
245#define PERSISTENT_GNT_WAS_ACTIVE 1
246
247/* Number of requests that we can fit in a ring */
248#define XEN_BLKIF_REQS 32
185 249
186struct persistent_gnt { 250struct persistent_gnt {
187 struct page *page; 251 struct page *page;
188 grant_ref_t gnt; 252 grant_ref_t gnt;
189 grant_handle_t handle; 253 grant_handle_t handle;
254 DECLARE_BITMAP(flags, PERSISTENT_GNT_FLAGS_SIZE);
190 struct rb_node node; 255 struct rb_node node;
256 struct list_head remove_node;
191}; 257};
192 258
193struct xen_blkif { 259struct xen_blkif {
@@ -219,6 +285,23 @@ struct xen_blkif {
219 /* tree to store persistent grants */ 285 /* tree to store persistent grants */
220 struct rb_root persistent_gnts; 286 struct rb_root persistent_gnts;
221 unsigned int persistent_gnt_c; 287 unsigned int persistent_gnt_c;
288 atomic_t persistent_gnt_in_use;
289 unsigned long next_lru;
290
291 /* used by the kworker that offload work from the persistent purge */
292 struct list_head persistent_purge_list;
293 struct work_struct persistent_purge_work;
294
295 /* buffer of free pages to map grant refs */
296 spinlock_t free_pages_lock;
297 int free_pages_num;
298 struct list_head free_pages;
299
300 /* List of all 'pending_req' available */
301 struct list_head pending_free;
302 /* And its spinlock. */
303 spinlock_t pending_free_lock;
304 wait_queue_head_t pending_free_wq;
222 305
223 /* statistics */ 306 /* statistics */
224 unsigned long st_print; 307 unsigned long st_print;
@@ -231,6 +314,41 @@ struct xen_blkif {
231 unsigned long long st_wr_sect; 314 unsigned long long st_wr_sect;
232 315
233 wait_queue_head_t waiting_to_free; 316 wait_queue_head_t waiting_to_free;
317 /* Thread shutdown wait queue. */
318 wait_queue_head_t shutdown_wq;
319};
320
321struct seg_buf {
322 unsigned long offset;
323 unsigned int nsec;
324};
325
326struct grant_page {
327 struct page *page;
328 struct persistent_gnt *persistent_gnt;
329 grant_handle_t handle;
330 grant_ref_t gref;
331};
332
333/*
334 * Each outstanding request that we've passed to the lower device layers has a
335 * 'pending_req' allocated to it. Each buffer_head that completes decrements
336 * the pendcnt towards zero. When it hits zero, the specified domain has a
337 * response queued for it, with the saved 'id' passed back.
338 */
339struct pending_req {
340 struct xen_blkif *blkif;
341 u64 id;
342 int nr_pages;
343 atomic_t pendcnt;
344 unsigned short operation;
345 int status;
346 struct list_head free_list;
347 struct grant_page *segments[MAX_INDIRECT_SEGMENTS];
348 /* Indirect descriptors */
349 struct grant_page *indirect_pages[MAX_INDIRECT_PAGES];
350 struct seg_buf seg[MAX_INDIRECT_SEGMENTS];
351 struct bio *biolist[MAX_INDIRECT_SEGMENTS];
234}; 352};
235 353
236 354
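The pending_free list declared above is the pool these requests sit on; a minimal sketch of the get/put pattern it implies (hypothetical helper names, assumes <linux/list.h>, <linux/spinlock.h> and <linux/wait.h>), with callers taking requests under pending_free_lock and waiters on pending_free_wq woken when the pool refills:

/* Sketch: take a preallocated request off the free list, or NULL if empty. */
static struct pending_req *sketch_alloc_req(struct xen_blkif *blkif)
{
        struct pending_req *req = NULL;
        unsigned long flags;

        spin_lock_irqsave(&blkif->pending_free_lock, flags);
        if (!list_empty(&blkif->pending_free)) {
                req = list_entry(blkif->pending_free.next, struct pending_req,
                                 free_list);
                list_del(&req->free_list);
        }
        spin_unlock_irqrestore(&blkif->pending_free_lock, flags);
        return req;
}

/* Sketch: return a request to the pool and wake anyone waiting for one. */
static void sketch_free_req(struct xen_blkif *blkif, struct pending_req *req)
{
        unsigned long flags;
        int was_empty;

        spin_lock_irqsave(&blkif->pending_free_lock, flags);
        was_empty = list_empty(&blkif->pending_free);
        list_add(&req->free_list, &blkif->pending_free);
        spin_unlock_irqrestore(&blkif->pending_free_lock, flags);
        if (was_empty)
                wake_up(&blkif->pending_free_wq);
}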
@@ -257,6 +375,7 @@ int xen_blkif_xenbus_init(void);
257 375
258irqreturn_t xen_blkif_be_int(int irq, void *dev_id); 376irqreturn_t xen_blkif_be_int(int irq, void *dev_id);
259int xen_blkif_schedule(void *arg); 377int xen_blkif_schedule(void *arg);
378int xen_blkif_purge_persistent(void *arg);
260 379
261int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, 380int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
262 struct backend_info *be, int state); 381 struct backend_info *be, int state);
@@ -268,7 +387,7 @@ struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be);
268static inline void blkif_get_x86_32_req(struct blkif_request *dst, 387static inline void blkif_get_x86_32_req(struct blkif_request *dst,
269 struct blkif_x86_32_request *src) 388 struct blkif_x86_32_request *src)
270{ 389{
271 int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; 390 int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j;
272 dst->operation = src->operation; 391 dst->operation = src->operation;
273 switch (src->operation) { 392 switch (src->operation) {
274 case BLKIF_OP_READ: 393 case BLKIF_OP_READ:
@@ -291,6 +410,18 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst,
291 dst->u.discard.sector_number = src->u.discard.sector_number; 410 dst->u.discard.sector_number = src->u.discard.sector_number;
292 dst->u.discard.nr_sectors = src->u.discard.nr_sectors; 411 dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
293 break; 412 break;
413 case BLKIF_OP_INDIRECT:
414 dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
415 dst->u.indirect.nr_segments = src->u.indirect.nr_segments;
416 dst->u.indirect.handle = src->u.indirect.handle;
417 dst->u.indirect.id = src->u.indirect.id;
418 dst->u.indirect.sector_number = src->u.indirect.sector_number;
419 barrier();
420 j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments));
421 for (i = 0; i < j; i++)
422 dst->u.indirect.indirect_grefs[i] =
423 src->u.indirect.indirect_grefs[i];
424 break;
294 default: 425 default:
295 /* 426 /*
296 * Don't know how to translate this op. Only get the 427 * Don't know how to translate this op. Only get the
@@ -304,7 +435,7 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst,
304static inline void blkif_get_x86_64_req(struct blkif_request *dst, 435static inline void blkif_get_x86_64_req(struct blkif_request *dst,
305 struct blkif_x86_64_request *src) 436 struct blkif_x86_64_request *src)
306{ 437{
307 int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; 438 int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j;
308 dst->operation = src->operation; 439 dst->operation = src->operation;
309 switch (src->operation) { 440 switch (src->operation) {
310 case BLKIF_OP_READ: 441 case BLKIF_OP_READ:
@@ -327,6 +458,18 @@ static inline void blkif_get_x86_64_req(struct blkif_request *dst,
327 dst->u.discard.sector_number = src->u.discard.sector_number; 458 dst->u.discard.sector_number = src->u.discard.sector_number;
328 dst->u.discard.nr_sectors = src->u.discard.nr_sectors; 459 dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
329 break; 460 break;
461 case BLKIF_OP_INDIRECT:
462 dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
463 dst->u.indirect.nr_segments = src->u.indirect.nr_segments;
464 dst->u.indirect.handle = src->u.indirect.handle;
465 dst->u.indirect.id = src->u.indirect.id;
466 dst->u.indirect.sector_number = src->u.indirect.sector_number;
467 barrier();
468 j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments));
469 for (i = 0; i < j; i++)
470 dst->u.indirect.indirect_grefs[i] =
471 src->u.indirect.indirect_grefs[i];
472 break;
330 default: 473 default:
331 /* 474 /*
332 * Don't know how to translate this op. Only get the 475 * Don't know how to translate this op. Only get the
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 8bfd1bcf95ec..2e5b69d612ac 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -98,12 +98,17 @@ static void xen_update_blkif_status(struct xen_blkif *blkif)
98 err = PTR_ERR(blkif->xenblkd); 98 err = PTR_ERR(blkif->xenblkd);
99 blkif->xenblkd = NULL; 99 blkif->xenblkd = NULL;
100 xenbus_dev_error(blkif->be->dev, err, "start xenblkd"); 100 xenbus_dev_error(blkif->be->dev, err, "start xenblkd");
101 return;
101 } 102 }
102} 103}
103 104
104static struct xen_blkif *xen_blkif_alloc(domid_t domid) 105static struct xen_blkif *xen_blkif_alloc(domid_t domid)
105{ 106{
106 struct xen_blkif *blkif; 107 struct xen_blkif *blkif;
108 struct pending_req *req, *n;
109 int i, j;
110
111 BUILD_BUG_ON(MAX_INDIRECT_PAGES > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);
107 112
108 blkif = kmem_cache_zalloc(xen_blkif_cachep, GFP_KERNEL); 113 blkif = kmem_cache_zalloc(xen_blkif_cachep, GFP_KERNEL);
109 if (!blkif) 114 if (!blkif)
@@ -118,8 +123,57 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
118 blkif->st_print = jiffies; 123 blkif->st_print = jiffies;
119 init_waitqueue_head(&blkif->waiting_to_free); 124 init_waitqueue_head(&blkif->waiting_to_free);
120 blkif->persistent_gnts.rb_node = NULL; 125 blkif->persistent_gnts.rb_node = NULL;
126 spin_lock_init(&blkif->free_pages_lock);
127 INIT_LIST_HEAD(&blkif->free_pages);
128 blkif->free_pages_num = 0;
129 atomic_set(&blkif->persistent_gnt_in_use, 0);
130
131 INIT_LIST_HEAD(&blkif->pending_free);
132
133 for (i = 0; i < XEN_BLKIF_REQS; i++) {
134 req = kzalloc(sizeof(*req), GFP_KERNEL);
135 if (!req)
136 goto fail;
137 list_add_tail(&req->free_list,
138 &blkif->pending_free);
139 for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
140 req->segments[j] = kzalloc(sizeof(*req->segments[0]),
141 GFP_KERNEL);
142 if (!req->segments[j])
143 goto fail;
144 }
145 for (j = 0; j < MAX_INDIRECT_PAGES; j++) {
146 req->indirect_pages[j] = kzalloc(sizeof(*req->indirect_pages[0]),
147 GFP_KERNEL);
148 if (!req->indirect_pages[j])
149 goto fail;
150 }
151 }
152 spin_lock_init(&blkif->pending_free_lock);
153 init_waitqueue_head(&blkif->pending_free_wq);
154 init_waitqueue_head(&blkif->shutdown_wq);
121 155
122 return blkif; 156 return blkif;
157
158fail:
159 list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) {
160 list_del(&req->free_list);
161 for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
162 if (!req->segments[j])
163 break;
164 kfree(req->segments[j]);
165 }
166 for (j = 0; j < MAX_INDIRECT_PAGES; j++) {
167 if (!req->indirect_pages[j])
168 break;
169 kfree(req->indirect_pages[j]);
170 }
171 kfree(req);
172 }
173
174 kmem_cache_free(xen_blkif_cachep, blkif);
175
176 return ERR_PTR(-ENOMEM);
123} 177}
124 178
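For a sense of scale: the loop above hands each backend XEN_BLKIF_REQS preallocated requests, and every request carries MAX_INDIRECT_SEGMENTS grant_page structures plus MAX_INDIRECT_PAGES more for the indirect frames themselves, so the per-backend preallocation is roughly XEN_BLKIF_REQS * (MAX_INDIRECT_SEGMENTS + MAX_INDIRECT_PAGES) * sizeof(struct grant_page) (the exact constants live in common.h and are not shown in this hunk).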
125static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page, 179static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page,
@@ -178,6 +232,7 @@ static void xen_blkif_disconnect(struct xen_blkif *blkif)
178{ 232{
179 if (blkif->xenblkd) { 233 if (blkif->xenblkd) {
180 kthread_stop(blkif->xenblkd); 234 kthread_stop(blkif->xenblkd);
235 wake_up(&blkif->shutdown_wq);
181 blkif->xenblkd = NULL; 236 blkif->xenblkd = NULL;
182 } 237 }
183 238
@@ -198,8 +253,28 @@ static void xen_blkif_disconnect(struct xen_blkif *blkif)
198 253
199static void xen_blkif_free(struct xen_blkif *blkif) 254static void xen_blkif_free(struct xen_blkif *blkif)
200{ 255{
256 struct pending_req *req, *n;
257 int i = 0, j;
258
201 if (!atomic_dec_and_test(&blkif->refcnt)) 259 if (!atomic_dec_and_test(&blkif->refcnt))
202 BUG(); 260 BUG();
261
262 /* Check that there is no request in use */
263 list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) {
264 list_del(&req->free_list);
265
266 for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++)
267 kfree(req->segments[j]);
268
269 for (j = 0; j < MAX_INDIRECT_PAGES; j++)
270 kfree(req->indirect_pages[j]);
271
272 kfree(req);
273 i++;
274 }
275
276 WARN_ON(i != XEN_BLKIF_REQS);
277
203 kmem_cache_free(xen_blkif_cachep, blkif); 278 kmem_cache_free(xen_blkif_cachep, blkif);
204} 279}
205 280
@@ -678,6 +753,11 @@ again:
678 dev->nodename); 753 dev->nodename);
679 goto abort; 754 goto abort;
680 } 755 }
756 err = xenbus_printf(xbt, dev->nodename, "feature-max-indirect-segments", "%u",
757 MAX_INDIRECT_SEGMENTS);
758 if (err)
759 dev_warn(&dev->dev, "writing %s/feature-max-indirect-segments (%d)",
760 dev->nodename, err);
681 761
682 err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", 762 err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
683 (unsigned long long)vbd_sz(&be->blkif->vbd)); 763 (unsigned long long)vbd_sz(&be->blkif->vbd));
@@ -704,6 +784,11 @@ again:
704 dev->nodename); 784 dev->nodename);
705 goto abort; 785 goto abort;
706 } 786 }
787 err = xenbus_printf(xbt, dev->nodename, "physical-sector-size", "%u",
788 bdev_physical_block_size(be->blkif->vbd.bdev));
789 if (err)
790 xenbus_dev_error(dev, err, "writing %s/physical-sector-size",
791 dev->nodename);
707 792
708 err = xenbus_transaction_end(xbt, 0); 793 err = xenbus_transaction_end(xbt, 0);
709 if (err == -EAGAIN) 794 if (err == -EAGAIN)
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index d89ef86220f4..a4660bbee8a6 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -74,12 +74,30 @@ struct grant {
74struct blk_shadow { 74struct blk_shadow {
75 struct blkif_request req; 75 struct blkif_request req;
76 struct request *request; 76 struct request *request;
77 struct grant *grants_used[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 77 struct grant **grants_used;
78 struct grant **indirect_grants;
79 struct scatterlist *sg;
80};
81
82struct split_bio {
83 struct bio *bio;
84 atomic_t pending;
85 int err;
78}; 86};
79 87
80static DEFINE_MUTEX(blkfront_mutex); 88static DEFINE_MUTEX(blkfront_mutex);
81static const struct block_device_operations xlvbd_block_fops; 89static const struct block_device_operations xlvbd_block_fops;
82 90
91/*
 92 * Maximum number of segments in indirect requests; the actual value used by
93 * the frontend driver is the minimum of this value and the value provided
94 * by the backend driver.
95 */
96
97static unsigned int xen_blkif_max_segments = 32;
98module_param_named(max, xen_blkif_max_segments, int, S_IRUGO);
 99MODULE_PARM_DESC(max, "Maximum number of segments in indirect requests (default is 32)");
100
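A usage note, not part of the patch: because the parameter is declared S_IRUGO it is read-only at runtime, so it is normally set at load time, e.g. xen_blkfront.max=64 on the guest kernel command line when the driver is built in, or max=64 when it is loaded as a module; the effective value should then be readable from /sys/module/xen_blkfront/parameters/max.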
83#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE) 101#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
84 102
85/* 103/*
@@ -98,7 +116,6 @@ struct blkfront_info
98 enum blkif_state connected; 116 enum blkif_state connected;
99 int ring_ref; 117 int ring_ref;
100 struct blkif_front_ring ring; 118 struct blkif_front_ring ring;
101 struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
102 unsigned int evtchn, irq; 119 unsigned int evtchn, irq;
103 struct request_queue *rq; 120 struct request_queue *rq;
104 struct work_struct work; 121 struct work_struct work;
@@ -114,6 +131,7 @@ struct blkfront_info
114 unsigned int discard_granularity; 131 unsigned int discard_granularity;
115 unsigned int discard_alignment; 132 unsigned int discard_alignment;
116 unsigned int feature_persistent:1; 133 unsigned int feature_persistent:1;
134 unsigned int max_indirect_segments;
117 int is_ready; 135 int is_ready;
118}; 136};
119 137
@@ -142,6 +160,13 @@ static DEFINE_SPINLOCK(minor_lock);
142 160
143#define DEV_NAME "xvd" /* name in /dev */ 161#define DEV_NAME "xvd" /* name in /dev */
144 162
163#define SEGS_PER_INDIRECT_FRAME \
164 (PAGE_SIZE/sizeof(struct blkif_request_segment_aligned))
165#define INDIRECT_GREFS(_segs) \
166 ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
167
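To make the macro arithmetic concrete, a worked example assuming 4 KiB pages and an 8-byte struct blkif_request_segment_aligned (which is what the aligned layout suggests):

/* Worked example, not driver code:                                          */
/*   SEGS_PER_INDIRECT_FRAME = 4096 / 8  = 512 segment entries per frame     */
/*   INDIRECT_GREFS(32)  = (32  + 511) / 512 = 1 indirect grant reference    */
/*   INDIRECT_GREFS(513) = (513 + 511) / 512 = 2 indirect grant references   */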
168static int blkfront_setup_indirect(struct blkfront_info *info);
169
145static int get_id_from_freelist(struct blkfront_info *info) 170static int get_id_from_freelist(struct blkfront_info *info)
146{ 171{
147 unsigned long free = info->shadow_free; 172 unsigned long free = info->shadow_free;
@@ -358,7 +383,8 @@ static int blkif_queue_request(struct request *req)
358 struct blkif_request *ring_req; 383 struct blkif_request *ring_req;
359 unsigned long id; 384 unsigned long id;
360 unsigned int fsect, lsect; 385 unsigned int fsect, lsect;
361 int i, ref; 386 int i, ref, n;
387 struct blkif_request_segment_aligned *segments = NULL;
362 388
363 /* 389 /*
364 * Used to store if we are able to queue the request by just using 390 * Used to store if we are able to queue the request by just using
@@ -369,21 +395,27 @@ static int blkif_queue_request(struct request *req)
369 grant_ref_t gref_head; 395 grant_ref_t gref_head;
370 struct grant *gnt_list_entry = NULL; 396 struct grant *gnt_list_entry = NULL;
371 struct scatterlist *sg; 397 struct scatterlist *sg;
398 int nseg, max_grefs;
372 399
373 if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) 400 if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
374 return 1; 401 return 1;
375 402
376 /* Check if we have enought grants to allocate a requests */ 403 max_grefs = info->max_indirect_segments ?
377 if (info->persistent_gnts_c < BLKIF_MAX_SEGMENTS_PER_REQUEST) { 404 info->max_indirect_segments +
405 INDIRECT_GREFS(info->max_indirect_segments) :
406 BLKIF_MAX_SEGMENTS_PER_REQUEST;
407
 408 /* Check if we have enough grants to allocate a request */
409 if (info->persistent_gnts_c < max_grefs) {
378 new_persistent_gnts = 1; 410 new_persistent_gnts = 1;
379 if (gnttab_alloc_grant_references( 411 if (gnttab_alloc_grant_references(
380 BLKIF_MAX_SEGMENTS_PER_REQUEST - info->persistent_gnts_c, 412 max_grefs - info->persistent_gnts_c,
381 &gref_head) < 0) { 413 &gref_head) < 0) {
382 gnttab_request_free_callback( 414 gnttab_request_free_callback(
383 &info->callback, 415 &info->callback,
384 blkif_restart_queue_callback, 416 blkif_restart_queue_callback,
385 info, 417 info,
386 BLKIF_MAX_SEGMENTS_PER_REQUEST); 418 max_grefs);
387 return 1; 419 return 1;
388 } 420 }
389 } else 421 } else
@@ -394,42 +426,67 @@ static int blkif_queue_request(struct request *req)
394 id = get_id_from_freelist(info); 426 id = get_id_from_freelist(info);
395 info->shadow[id].request = req; 427 info->shadow[id].request = req;
396 428
397 ring_req->u.rw.id = id;
398 ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
399 ring_req->u.rw.handle = info->handle;
400
401 ring_req->operation = rq_data_dir(req) ?
402 BLKIF_OP_WRITE : BLKIF_OP_READ;
403
404 if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
405 /*
406 * Ideally we can do an unordered flush-to-disk. In case the
407 * backend onlysupports barriers, use that. A barrier request
408 * a superset of FUA, so we can implement it the same
409 * way. (It's also a FLUSH+FUA, since it is
410 * guaranteed ordered WRT previous writes.)
411 */
412 ring_req->operation = info->flush_op;
413 }
414
415 if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) { 429 if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) {
416 /* id, sector_number and handle are set above. */
417 ring_req->operation = BLKIF_OP_DISCARD; 430 ring_req->operation = BLKIF_OP_DISCARD;
418 ring_req->u.discard.nr_sectors = blk_rq_sectors(req); 431 ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
432 ring_req->u.discard.id = id;
433 ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req);
419 if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard) 434 if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard)
420 ring_req->u.discard.flag = BLKIF_DISCARD_SECURE; 435 ring_req->u.discard.flag = BLKIF_DISCARD_SECURE;
421 else 436 else
422 ring_req->u.discard.flag = 0; 437 ring_req->u.discard.flag = 0;
423 } else { 438 } else {
424 ring_req->u.rw.nr_segments = blk_rq_map_sg(req->q, req, 439 BUG_ON(info->max_indirect_segments == 0 &&
425 info->sg); 440 req->nr_phys_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
426 BUG_ON(ring_req->u.rw.nr_segments > 441 BUG_ON(info->max_indirect_segments &&
427 BLKIF_MAX_SEGMENTS_PER_REQUEST); 442 req->nr_phys_segments > info->max_indirect_segments);
428 443 nseg = blk_rq_map_sg(req->q, req, info->shadow[id].sg);
429 for_each_sg(info->sg, sg, ring_req->u.rw.nr_segments, i) { 444 ring_req->u.rw.id = id;
445 if (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
446 /*
447 * The indirect operation can only be a BLKIF_OP_READ or
448 * BLKIF_OP_WRITE
449 */
450 BUG_ON(req->cmd_flags & (REQ_FLUSH | REQ_FUA));
451 ring_req->operation = BLKIF_OP_INDIRECT;
452 ring_req->u.indirect.indirect_op = rq_data_dir(req) ?
453 BLKIF_OP_WRITE : BLKIF_OP_READ;
454 ring_req->u.indirect.sector_number = (blkif_sector_t)blk_rq_pos(req);
455 ring_req->u.indirect.handle = info->handle;
456 ring_req->u.indirect.nr_segments = nseg;
457 } else {
458 ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
459 ring_req->u.rw.handle = info->handle;
460 ring_req->operation = rq_data_dir(req) ?
461 BLKIF_OP_WRITE : BLKIF_OP_READ;
462 if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
463 /*
464 * Ideally we can do an unordered flush-to-disk. In case the
 465 * backend only supports barriers, use that. A barrier request
 466 * is a superset of FUA, so we can implement it the same
467 * way. (It's also a FLUSH+FUA, since it is
468 * guaranteed ordered WRT previous writes.)
469 */
470 ring_req->operation = info->flush_op;
471 }
472 ring_req->u.rw.nr_segments = nseg;
473 }
474 for_each_sg(info->shadow[id].sg, sg, nseg, i) {
430 fsect = sg->offset >> 9; 475 fsect = sg->offset >> 9;
431 lsect = fsect + (sg->length >> 9) - 1; 476 lsect = fsect + (sg->length >> 9) - 1;
432 477
478 if ((ring_req->operation == BLKIF_OP_INDIRECT) &&
479 (i % SEGS_PER_INDIRECT_FRAME == 0)) {
480 if (segments)
481 kunmap_atomic(segments);
482
483 n = i / SEGS_PER_INDIRECT_FRAME;
484 gnt_list_entry = get_grant(&gref_head, info);
485 info->shadow[id].indirect_grants[n] = gnt_list_entry;
486 segments = kmap_atomic(pfn_to_page(gnt_list_entry->pfn));
487 ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref;
488 }
489
433 gnt_list_entry = get_grant(&gref_head, info); 490 gnt_list_entry = get_grant(&gref_head, info);
434 ref = gnt_list_entry->gref; 491 ref = gnt_list_entry->gref;
435 492
@@ -441,8 +498,7 @@ static int blkif_queue_request(struct request *req)
441 498
442 BUG_ON(sg->offset + sg->length > PAGE_SIZE); 499 BUG_ON(sg->offset + sg->length > PAGE_SIZE);
443 500
444 shared_data = kmap_atomic( 501 shared_data = kmap_atomic(pfn_to_page(gnt_list_entry->pfn));
445 pfn_to_page(gnt_list_entry->pfn));
446 bvec_data = kmap_atomic(sg_page(sg)); 502 bvec_data = kmap_atomic(sg_page(sg));
447 503
448 /* 504 /*
@@ -461,13 +517,23 @@ static int blkif_queue_request(struct request *req)
461 kunmap_atomic(bvec_data); 517 kunmap_atomic(bvec_data);
462 kunmap_atomic(shared_data); 518 kunmap_atomic(shared_data);
463 } 519 }
464 520 if (ring_req->operation != BLKIF_OP_INDIRECT) {
465 ring_req->u.rw.seg[i] = 521 ring_req->u.rw.seg[i] =
466 (struct blkif_request_segment) { 522 (struct blkif_request_segment) {
467 .gref = ref, 523 .gref = ref,
468 .first_sect = fsect, 524 .first_sect = fsect,
469 .last_sect = lsect }; 525 .last_sect = lsect };
526 } else {
527 n = i % SEGS_PER_INDIRECT_FRAME;
528 segments[n] =
529 (struct blkif_request_segment_aligned) {
530 .gref = ref,
531 .first_sect = fsect,
532 .last_sect = lsect };
533 }
470 } 534 }
535 if (segments)
536 kunmap_atomic(segments);
471 } 537 }
472 538
473 info->ring.req_prod_pvt++; 539 info->ring.req_prod_pvt++;
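The segment loop above places segment i of an indirect request into indirect frame i / SEGS_PER_INDIRECT_FRAME at slot i % SEGS_PER_INDIRECT_FRAME, mapping a fresh indirect page (and recording its grant in shadow[id].indirect_grants[]) each time a frame boundary is crossed. A minimal sketch of that addressing, separated from the grant and kmap plumbing (the helper is illustrative, not part of the driver):

/* Sketch: where segment i of an indirect request ends up. */
static void sketch_indirect_slot(unsigned int i,
                                 unsigned int *frame, unsigned int *slot)
{
        *frame = i / SEGS_PER_INDIRECT_FRAME;  /* which indirect page */
        *slot  = i % SEGS_PER_INDIRECT_FRAME;  /* entry within that page */
}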
@@ -542,7 +608,9 @@ wait:
542 flush_requests(info); 608 flush_requests(info);
543} 609}
544 610
545static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) 611static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
612 unsigned int physical_sector_size,
613 unsigned int segments)
546{ 614{
547 struct request_queue *rq; 615 struct request_queue *rq;
548 struct blkfront_info *info = gd->private_data; 616 struct blkfront_info *info = gd->private_data;
@@ -564,14 +632,15 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
564 632
565 /* Hard sector size and max sectors impersonate the equiv. hardware. */ 633 /* Hard sector size and max sectors impersonate the equiv. hardware. */
566 blk_queue_logical_block_size(rq, sector_size); 634 blk_queue_logical_block_size(rq, sector_size);
567 blk_queue_max_hw_sectors(rq, 512); 635 blk_queue_physical_block_size(rq, physical_sector_size);
636 blk_queue_max_hw_sectors(rq, (segments * PAGE_SIZE) / 512);
568 637
569 /* Each segment in a request is up to an aligned page in size. */ 638 /* Each segment in a request is up to an aligned page in size. */
570 blk_queue_segment_boundary(rq, PAGE_SIZE - 1); 639 blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
571 blk_queue_max_segment_size(rq, PAGE_SIZE); 640 blk_queue_max_segment_size(rq, PAGE_SIZE);
572 641
573 /* Ensure a merged request will fit in a single I/O ring slot. */ 642 /* Ensure a merged request will fit in a single I/O ring slot. */
574 blk_queue_max_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); 643 blk_queue_max_segments(rq, segments);
575 644
576 /* Make sure buffer addresses are sector-aligned. */ 645 /* Make sure buffer addresses are sector-aligned. */
577 blk_queue_dma_alignment(rq, 511); 646 blk_queue_dma_alignment(rq, 511);
@@ -588,13 +657,16 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
588static void xlvbd_flush(struct blkfront_info *info) 657static void xlvbd_flush(struct blkfront_info *info)
589{ 658{
590 blk_queue_flush(info->rq, info->feature_flush); 659 blk_queue_flush(info->rq, info->feature_flush);
591 printk(KERN_INFO "blkfront: %s: %s: %s %s\n", 660 printk(KERN_INFO "blkfront: %s: %s: %s %s %s %s %s\n",
592 info->gd->disk_name, 661 info->gd->disk_name,
593 info->flush_op == BLKIF_OP_WRITE_BARRIER ? 662 info->flush_op == BLKIF_OP_WRITE_BARRIER ?
594 "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ? 663 "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ?
595 "flush diskcache" : "barrier or flush"), 664 "flush diskcache" : "barrier or flush"),
596 info->feature_flush ? "enabled" : "disabled", 665 info->feature_flush ? "enabled;" : "disabled;",
597 info->feature_persistent ? "using persistent grants" : ""); 666 "persistent grants:",
667 info->feature_persistent ? "enabled;" : "disabled;",
668 "indirect descriptors:",
669 info->max_indirect_segments ? "enabled;" : "disabled;");
598} 670}
599 671
600static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset) 672static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
@@ -667,7 +739,8 @@ static char *encode_disk_name(char *ptr, unsigned int n)
667 739
668static int xlvbd_alloc_gendisk(blkif_sector_t capacity, 740static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
669 struct blkfront_info *info, 741 struct blkfront_info *info,
670 u16 vdisk_info, u16 sector_size) 742 u16 vdisk_info, u16 sector_size,
743 unsigned int physical_sector_size)
671{ 744{
672 struct gendisk *gd; 745 struct gendisk *gd;
673 int nr_minors = 1; 746 int nr_minors = 1;
@@ -734,7 +807,9 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
734 gd->driverfs_dev = &(info->xbdev->dev); 807 gd->driverfs_dev = &(info->xbdev->dev);
735 set_capacity(gd, capacity); 808 set_capacity(gd, capacity);
736 809
737 if (xlvbd_init_blk_queue(gd, sector_size)) { 810 if (xlvbd_init_blk_queue(gd, sector_size, physical_sector_size,
811 info->max_indirect_segments ? :
812 BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
738 del_gendisk(gd); 813 del_gendisk(gd);
739 goto release; 814 goto release;
740 } 815 }
@@ -818,6 +893,7 @@ static void blkif_free(struct blkfront_info *info, int suspend)
818{ 893{
819 struct grant *persistent_gnt; 894 struct grant *persistent_gnt;
820 struct grant *n; 895 struct grant *n;
896 int i, j, segs;
821 897
822 /* Prevent new requests being issued until we fix things up. */ 898 /* Prevent new requests being issued until we fix things up. */
823 spin_lock_irq(&info->io_lock); 899 spin_lock_irq(&info->io_lock);
@@ -843,6 +919,47 @@ static void blkif_free(struct blkfront_info *info, int suspend)
843 } 919 }
844 BUG_ON(info->persistent_gnts_c != 0); 920 BUG_ON(info->persistent_gnts_c != 0);
845 921
922 for (i = 0; i < BLK_RING_SIZE; i++) {
923 /*
924 * Clear persistent grants present in requests already
925 * on the shared ring
926 */
927 if (!info->shadow[i].request)
928 goto free_shadow;
929
930 segs = info->shadow[i].req.operation == BLKIF_OP_INDIRECT ?
931 info->shadow[i].req.u.indirect.nr_segments :
932 info->shadow[i].req.u.rw.nr_segments;
933 for (j = 0; j < segs; j++) {
934 persistent_gnt = info->shadow[i].grants_used[j];
935 gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
936 __free_page(pfn_to_page(persistent_gnt->pfn));
937 kfree(persistent_gnt);
938 }
939
940 if (info->shadow[i].req.operation != BLKIF_OP_INDIRECT)
941 /*
942 * If this is not an indirect operation don't try to
943 * free indirect segments
944 */
945 goto free_shadow;
946
947 for (j = 0; j < INDIRECT_GREFS(segs); j++) {
948 persistent_gnt = info->shadow[i].indirect_grants[j];
949 gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
950 __free_page(pfn_to_page(persistent_gnt->pfn));
951 kfree(persistent_gnt);
952 }
953
954free_shadow:
955 kfree(info->shadow[i].grants_used);
956 info->shadow[i].grants_used = NULL;
957 kfree(info->shadow[i].indirect_grants);
958 info->shadow[i].indirect_grants = NULL;
959 kfree(info->shadow[i].sg);
960 info->shadow[i].sg = NULL;
961 }
962
846 /* No more gnttab callback work. */ 963 /* No more gnttab callback work. */
847 gnttab_cancel_free_callback(&info->callback); 964 gnttab_cancel_free_callback(&info->callback);
848 spin_unlock_irq(&info->io_lock); 965 spin_unlock_irq(&info->io_lock);
@@ -867,12 +984,13 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
867 struct blkif_response *bret) 984 struct blkif_response *bret)
868{ 985{
869 int i = 0; 986 int i = 0;
870 struct bio_vec *bvec; 987 struct scatterlist *sg;
871 struct req_iterator iter;
872 unsigned long flags;
873 char *bvec_data; 988 char *bvec_data;
874 void *shared_data; 989 void *shared_data;
875 unsigned int offset = 0; 990 int nseg;
991
992 nseg = s->req.operation == BLKIF_OP_INDIRECT ?
993 s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments;
876 994
877 if (bret->operation == BLKIF_OP_READ) { 995 if (bret->operation == BLKIF_OP_READ) {
878 /* 996 /*
@@ -881,26 +999,29 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
881 * than PAGE_SIZE, we have to keep track of the current offset, 999 * than PAGE_SIZE, we have to keep track of the current offset,
882 * to be sure we are copying the data from the right shared page. 1000 * to be sure we are copying the data from the right shared page.
883 */ 1001 */
884 rq_for_each_segment(bvec, s->request, iter) { 1002 for_each_sg(s->sg, sg, nseg, i) {
885 BUG_ON((bvec->bv_offset + bvec->bv_len) > PAGE_SIZE); 1003 BUG_ON(sg->offset + sg->length > PAGE_SIZE);
886 if (bvec->bv_offset < offset)
887 i++;
888 BUG_ON(i >= s->req.u.rw.nr_segments);
889 shared_data = kmap_atomic( 1004 shared_data = kmap_atomic(
890 pfn_to_page(s->grants_used[i]->pfn)); 1005 pfn_to_page(s->grants_used[i]->pfn));
891 bvec_data = bvec_kmap_irq(bvec, &flags); 1006 bvec_data = kmap_atomic(sg_page(sg));
892 memcpy(bvec_data, shared_data + bvec->bv_offset, 1007 memcpy(bvec_data + sg->offset,
893 bvec->bv_len); 1008 shared_data + sg->offset,
894 bvec_kunmap_irq(bvec_data, &flags); 1009 sg->length);
1010 kunmap_atomic(bvec_data);
895 kunmap_atomic(shared_data); 1011 kunmap_atomic(shared_data);
896 offset = bvec->bv_offset + bvec->bv_len;
897 } 1012 }
898 } 1013 }
899 /* Add the persistent grant into the list of free grants */ 1014 /* Add the persistent grant into the list of free grants */
900 for (i = 0; i < s->req.u.rw.nr_segments; i++) { 1015 for (i = 0; i < nseg; i++) {
901 list_add(&s->grants_used[i]->node, &info->persistent_gnts); 1016 list_add(&s->grants_used[i]->node, &info->persistent_gnts);
902 info->persistent_gnts_c++; 1017 info->persistent_gnts_c++;
903 } 1018 }
1019 if (s->req.operation == BLKIF_OP_INDIRECT) {
1020 for (i = 0; i < INDIRECT_GREFS(nseg); i++) {
1021 list_add(&s->indirect_grants[i]->node, &info->persistent_gnts);
1022 info->persistent_gnts_c++;
1023 }
1024 }
904} 1025}
905 1026
906static irqreturn_t blkif_interrupt(int irq, void *dev_id) 1027static irqreturn_t blkif_interrupt(int irq, void *dev_id)
@@ -1034,14 +1155,6 @@ static int setup_blkring(struct xenbus_device *dev,
1034 SHARED_RING_INIT(sring); 1155 SHARED_RING_INIT(sring);
1035 FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE); 1156 FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
1036 1157
1037 sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
1038
1039 /* Allocate memory for grants */
1040 err = fill_grant_buffer(info, BLK_RING_SIZE *
1041 BLKIF_MAX_SEGMENTS_PER_REQUEST);
1042 if (err)
1043 goto fail;
1044
1045 err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring)); 1158 err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
1046 if (err < 0) { 1159 if (err < 0) {
1047 free_page((unsigned long)sring); 1160 free_page((unsigned long)sring);
@@ -1223,13 +1336,84 @@ static int blkfront_probe(struct xenbus_device *dev,
1223 return 0; 1336 return 0;
1224} 1337}
1225 1338
1339/*
1340 * This is a clone of md_trim_bio, used to split a bio into smaller ones
1341 */
1342static void trim_bio(struct bio *bio, int offset, int size)
1343{
1344 /* 'bio' is a cloned bio which we need to trim to match
1345 * the given offset and size.
1346 * This requires adjusting bi_sector, bi_size, and bi_io_vec
1347 */
1348 int i;
1349 struct bio_vec *bvec;
1350 int sofar = 0;
1351
1352 size <<= 9;
1353 if (offset == 0 && size == bio->bi_size)
1354 return;
1355
1356 bio->bi_sector += offset;
1357 bio->bi_size = size;
1358 offset <<= 9;
1359 clear_bit(BIO_SEG_VALID, &bio->bi_flags);
1360
1361 while (bio->bi_idx < bio->bi_vcnt &&
1362 bio->bi_io_vec[bio->bi_idx].bv_len <= offset) {
1363 /* remove this whole bio_vec */
1364 offset -= bio->bi_io_vec[bio->bi_idx].bv_len;
1365 bio->bi_idx++;
1366 }
1367 if (bio->bi_idx < bio->bi_vcnt) {
1368 bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
1369 bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
1370 }
 1371 /* avoid any complications with bi_idx being non-zero */
1372 if (bio->bi_idx) {
1373 memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
1374 (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec));
1375 bio->bi_vcnt -= bio->bi_idx;
1376 bio->bi_idx = 0;
1377 }
1378 /* Make sure vcnt and last bv are not too big */
1379 bio_for_each_segment(bvec, bio, i) {
1380 if (sofar + bvec->bv_len > size)
1381 bvec->bv_len = size - sofar;
1382 if (bvec->bv_len == 0) {
1383 bio->bi_vcnt = i;
1384 break;
1385 }
1386 sofar += bvec->bv_len;
1387 }
1388}
1389
1390static void split_bio_end(struct bio *bio, int error)
1391{
1392 struct split_bio *split_bio = bio->bi_private;
1393
1394 if (error)
1395 split_bio->err = error;
1396
1397 if (atomic_dec_and_test(&split_bio->pending)) {
1398 split_bio->bio->bi_phys_segments = 0;
1399 bio_endio(split_bio->bio, split_bio->err);
1400 kfree(split_bio);
1401 }
1402 bio_put(bio);
1403}
1226 1404
1227static int blkif_recover(struct blkfront_info *info) 1405static int blkif_recover(struct blkfront_info *info)
1228{ 1406{
1229 int i; 1407 int i;
1230 struct blkif_request *req; 1408 struct request *req, *n;
1231 struct blk_shadow *copy; 1409 struct blk_shadow *copy;
1232 int j; 1410 int rc;
1411 struct bio *bio, *cloned_bio;
1412 struct bio_list bio_list, merge_bio;
1413 unsigned int segs, offset;
1414 int pending, size;
1415 struct split_bio *split_bio;
1416 struct list_head requests;
1233 1417
1234 /* Stage 1: Make a safe copy of the shadow state. */ 1418 /* Stage 1: Make a safe copy of the shadow state. */
1235 copy = kmemdup(info->shadow, sizeof(info->shadow), 1419 copy = kmemdup(info->shadow, sizeof(info->shadow),
@@ -1244,36 +1428,64 @@ static int blkif_recover(struct blkfront_info *info)
1244 info->shadow_free = info->ring.req_prod_pvt; 1428 info->shadow_free = info->ring.req_prod_pvt;
1245 info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff; 1429 info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
1246 1430
1247 /* Stage 3: Find pending requests and requeue them. */ 1431 rc = blkfront_setup_indirect(info);
1432 if (rc) {
1433 kfree(copy);
1434 return rc;
1435 }
1436
1437 segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST;
1438 blk_queue_max_segments(info->rq, segs);
1439 bio_list_init(&bio_list);
1440 INIT_LIST_HEAD(&requests);
1248 for (i = 0; i < BLK_RING_SIZE; i++) { 1441 for (i = 0; i < BLK_RING_SIZE; i++) {
1249 /* Not in use? */ 1442 /* Not in use? */
1250 if (!copy[i].request) 1443 if (!copy[i].request)
1251 continue; 1444 continue;
1252 1445
1253 /* Grab a request slot and copy shadow state into it. */ 1446 /*
1254 req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); 1447 * Get the bios in the request so we can re-queue them.
1255 *req = copy[i].req; 1448 */
1256 1449 if (copy[i].request->cmd_flags &
1257 /* We get a new request id, and must reset the shadow state. */ 1450 (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
1258 req->u.rw.id = get_id_from_freelist(info); 1451 /*
1259 memcpy(&info->shadow[req->u.rw.id], &copy[i], sizeof(copy[i])); 1452 * Flush operations don't contain bios, so
1260 1453 * we need to requeue the whole request
1261 if (req->operation != BLKIF_OP_DISCARD) { 1454 */
1262 /* Rewrite any grant references invalidated by susp/resume. */ 1455 list_add(&copy[i].request->queuelist, &requests);
1263 for (j = 0; j < req->u.rw.nr_segments; j++) 1456 continue;
1264 gnttab_grant_foreign_access_ref(
1265 req->u.rw.seg[j].gref,
1266 info->xbdev->otherend_id,
1267 pfn_to_mfn(copy[i].grants_used[j]->pfn),
1268 0);
1269 } 1457 }
1270 info->shadow[req->u.rw.id].req = *req; 1458 merge_bio.head = copy[i].request->bio;
1271 1459 merge_bio.tail = copy[i].request->biotail;
1272 info->ring.req_prod_pvt++; 1460 bio_list_merge(&bio_list, &merge_bio);
1461 copy[i].request->bio = NULL;
1462 blk_put_request(copy[i].request);
1273 } 1463 }
1274 1464
1275 kfree(copy); 1465 kfree(copy);
1276 1466
1467 /*
 1468 * Empty the queue; this is important because we might have
1469 * requests in the queue with more segments than what we
1470 * can handle now.
1471 */
1472 spin_lock_irq(&info->io_lock);
1473 while ((req = blk_fetch_request(info->rq)) != NULL) {
1474 if (req->cmd_flags &
1475 (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
1476 list_add(&req->queuelist, &requests);
1477 continue;
1478 }
1479 merge_bio.head = req->bio;
1480 merge_bio.tail = req->biotail;
1481 bio_list_merge(&bio_list, &merge_bio);
1482 req->bio = NULL;
1483 if (req->cmd_flags & (REQ_FLUSH | REQ_FUA))
1484 pr_alert("diskcache flush request found!\n");
1485 __blk_put_request(info->rq, req);
1486 }
1487 spin_unlock_irq(&info->io_lock);
1488
1277 xenbus_switch_state(info->xbdev, XenbusStateConnected); 1489 xenbus_switch_state(info->xbdev, XenbusStateConnected);
1278 1490
1279 spin_lock_irq(&info->io_lock); 1491 spin_lock_irq(&info->io_lock);
@@ -1281,14 +1493,50 @@ static int blkif_recover(struct blkfront_info *info)
1281 /* Now safe for us to use the shared ring */ 1493 /* Now safe for us to use the shared ring */
1282 info->connected = BLKIF_STATE_CONNECTED; 1494 info->connected = BLKIF_STATE_CONNECTED;
1283 1495
1284 /* Send off requeued requests */
1285 flush_requests(info);
1286
1287 /* Kick any other new requests queued since we resumed */ 1496 /* Kick any other new requests queued since we resumed */
1288 kick_pending_request_queues(info); 1497 kick_pending_request_queues(info);
1289 1498
1499 list_for_each_entry_safe(req, n, &requests, queuelist) {
1500 /* Requeue pending requests (flush or discard) */
1501 list_del_init(&req->queuelist);
1502 BUG_ON(req->nr_phys_segments > segs);
1503 blk_requeue_request(info->rq, req);
1504 }
1290 spin_unlock_irq(&info->io_lock); 1505 spin_unlock_irq(&info->io_lock);
1291 1506
1507 while ((bio = bio_list_pop(&bio_list)) != NULL) {
1508 /* Traverse the list of pending bios and re-queue them */
1509 if (bio_segments(bio) > segs) {
1510 /*
1511 * This bio has more segments than what we can
1512 * handle, we have to split it.
1513 */
1514 pending = (bio_segments(bio) + segs - 1) / segs;
1515 split_bio = kzalloc(sizeof(*split_bio), GFP_NOIO);
1516 BUG_ON(split_bio == NULL);
1517 atomic_set(&split_bio->pending, pending);
1518 split_bio->bio = bio;
1519 for (i = 0; i < pending; i++) {
1520 offset = (i * segs * PAGE_SIZE) >> 9;
1521 size = min((unsigned int)(segs * PAGE_SIZE) >> 9,
1522 (unsigned int)(bio->bi_size >> 9) - offset);
1523 cloned_bio = bio_clone(bio, GFP_NOIO);
1524 BUG_ON(cloned_bio == NULL);
1525 trim_bio(cloned_bio, offset, size);
1526 cloned_bio->bi_private = split_bio;
1527 cloned_bio->bi_end_io = split_bio_end;
1528 submit_bio(cloned_bio->bi_rw, cloned_bio);
1529 }
1530 /*
1531 * Now we have to wait for all those smaller bios to
1532 * end, so we can also end the "parent" bio.
1533 */
1534 continue;
1535 }
1536 /* We don't need to split this bio */
1537 submit_bio(bio->bi_rw, bio);
1538 }
1539
1292 return 0; 1540 return 0;
1293} 1541}
1294 1542
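The recovery path above splits any bio that carries more segments than the renegotiated limit into ceil(bio_segments / segs) clones, each covering at most segs pages worth of sectors. A compact restatement of the per-chunk offset/size computation (an illustrative helper, not a function in the driver; bi_size is in bytes, the offsets and sizes below are in 512-byte sectors):

/* Sketch: chunk i of a bio split into pieces of at most 'segs' segments. */
static void sketch_split_chunk(struct bio *bio, unsigned int segs,
                               unsigned int i,
                               unsigned int *offset_sects,
                               unsigned int *size_sects)
{
        unsigned int chunk_sects = (segs * PAGE_SIZE) >> 9;

        *offset_sects = i * chunk_sects;
        *size_sects = min(chunk_sects,
                          (unsigned int)(bio->bi_size >> 9) - *offset_sects);
}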
@@ -1308,8 +1556,12 @@ static int blkfront_resume(struct xenbus_device *dev)
1308 blkif_free(info, info->connected == BLKIF_STATE_CONNECTED); 1556 blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
1309 1557
1310 err = talk_to_blkback(dev, info); 1558 err = talk_to_blkback(dev, info);
1311 if (info->connected == BLKIF_STATE_SUSPENDED && !err) 1559
1312 err = blkif_recover(info); 1560 /*
1561 * We have to wait for the backend to switch to
1562 * connected state, since we want to read which
1563 * features it supports.
1564 */
1313 1565
1314 return err; 1566 return err;
1315} 1567}
@@ -1387,6 +1639,60 @@ static void blkfront_setup_discard(struct blkfront_info *info)
1387 kfree(type); 1639 kfree(type);
1388} 1640}
1389 1641
1642static int blkfront_setup_indirect(struct blkfront_info *info)
1643{
1644 unsigned int indirect_segments, segs;
1645 int err, i;
1646
1647 err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
1648 "feature-max-indirect-segments", "%u", &indirect_segments,
1649 NULL);
1650 if (err) {
1651 info->max_indirect_segments = 0;
1652 segs = BLKIF_MAX_SEGMENTS_PER_REQUEST;
1653 } else {
1654 info->max_indirect_segments = min(indirect_segments,
1655 xen_blkif_max_segments);
1656 segs = info->max_indirect_segments;
1657 }
1658
1659 err = fill_grant_buffer(info, (segs + INDIRECT_GREFS(segs)) * BLK_RING_SIZE);
1660 if (err)
1661 goto out_of_memory;
1662
1663 for (i = 0; i < BLK_RING_SIZE; i++) {
1664 info->shadow[i].grants_used = kzalloc(
1665 sizeof(info->shadow[i].grants_used[0]) * segs,
1666 GFP_NOIO);
1667 info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * segs, GFP_NOIO);
1668 if (info->max_indirect_segments)
1669 info->shadow[i].indirect_grants = kzalloc(
1670 sizeof(info->shadow[i].indirect_grants[0]) *
1671 INDIRECT_GREFS(segs),
1672 GFP_NOIO);
1673 if ((info->shadow[i].grants_used == NULL) ||
1674 (info->shadow[i].sg == NULL) ||
1675 (info->max_indirect_segments &&
1676 (info->shadow[i].indirect_grants == NULL)))
1677 goto out_of_memory;
1678 sg_init_table(info->shadow[i].sg, segs);
1679 }
1680
1681
1682 return 0;
1683
1684out_of_memory:
1685 for (i = 0; i < BLK_RING_SIZE; i++) {
1686 kfree(info->shadow[i].grants_used);
1687 info->shadow[i].grants_used = NULL;
1688 kfree(info->shadow[i].sg);
1689 info->shadow[i].sg = NULL;
1690 kfree(info->shadow[i].indirect_grants);
1691 info->shadow[i].indirect_grants = NULL;
1692 }
1693 return -ENOMEM;
1694}
1695
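Putting the negotiation together (a restating sketch using the xenstore keys introduced by this patch, not a helper that exists in the driver): the backend advertises feature-max-indirect-segments, the frontend clamps that against its own max module parameter, and an absent or zero advertisement falls back to the plain ring limit:

/* Sketch: effective per-request segment limit after negotiation. */
static unsigned int sketch_effective_segs(unsigned int backend_max,
                                          unsigned int frontend_max)
{
        unsigned int indirect = min(backend_max, frontend_max);

        /* No indirect support advertised: stick to the ring limit. */
        if (indirect == 0)
                return BLKIF_MAX_SEGMENTS_PER_REQUEST;
        return indirect;
}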
1390/* 1696/*
1391 * Invoked when the backend is finally 'ready' (and has told us 1697
1392 * the details about the physical device - #sectors, size, etc). 1698 * the details about the physical device - #sectors, size, etc).
@@ -1395,6 +1701,7 @@ static void blkfront_connect(struct blkfront_info *info)
1395{ 1701{
1396 unsigned long long sectors; 1702 unsigned long long sectors;
1397 unsigned long sector_size; 1703 unsigned long sector_size;
1704 unsigned int physical_sector_size;
1398 unsigned int binfo; 1705 unsigned int binfo;
1399 int err; 1706 int err;
1400 int barrier, flush, discard, persistent; 1707 int barrier, flush, discard, persistent;
@@ -1414,8 +1721,15 @@ static void blkfront_connect(struct blkfront_info *info)
1414 set_capacity(info->gd, sectors); 1721 set_capacity(info->gd, sectors);
1415 revalidate_disk(info->gd); 1722 revalidate_disk(info->gd);
1416 1723
1417 /* fall through */ 1724 return;
1418 case BLKIF_STATE_SUSPENDED: 1725 case BLKIF_STATE_SUSPENDED:
1726 /*
1727 * If we are recovering from suspension, we need to wait
1728 * for the backend to announce it's features before
1729 * reconnecting, at least we need to know if the backend
1730 * supports indirect descriptors, and how many.
1731 */
1732 blkif_recover(info);
1419 return; 1733 return;
1420 1734
1421 default: 1735 default:
@@ -1437,6 +1751,16 @@ static void blkfront_connect(struct blkfront_info *info)
1437 return; 1751 return;
1438 } 1752 }
1439 1753
1754 /*
 1755 * physical-sector-size is a newer field, so old backends may not
1756 * provide this. Assume physical sector size to be the same as
1757 * sector_size in that case.
1758 */
1759 err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
1760 "physical-sector-size", "%u", &physical_sector_size);
1761 if (err != 1)
1762 physical_sector_size = sector_size;
1763
1440 info->feature_flush = 0; 1764 info->feature_flush = 0;
1441 info->flush_op = 0; 1765 info->flush_op = 0;
1442 1766
@@ -1483,7 +1807,15 @@ static void blkfront_connect(struct blkfront_info *info)
1483 else 1807 else
1484 info->feature_persistent = persistent; 1808 info->feature_persistent = persistent;
1485 1809
1486 err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size); 1810 err = blkfront_setup_indirect(info);
1811 if (err) {
1812 xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s",
1813 info->xbdev->otherend);
1814 return;
1815 }
1816
1817 err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size,
1818 physical_sector_size);
1487 if (err) { 1819 if (err) {
1488 xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", 1820 xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
1489 info->xbdev->otherend); 1821 info->xbdev->otherend);