author    Jens Axboe <axboe@fb.com>    2016-01-13 10:20:36 -0500
committer Jens Axboe <axboe@fb.com>    2016-01-13 10:20:36 -0500
commit    038a75afc54d4b4dc9794213bb16e88c1a31a752 (patch)
tree      b092390b1fa88956571577e8ed9edb391383c37d
parent    9e35fdcb9cd54e381135310aae3d9bbb23cecda3 (diff)
parent    c31ecf6c126dbc7f30234eaf6c4a079649a38de7 (diff)
Merge branch 'stable/for-jens-4.5' of git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen into for-4.5/drivers
Konrad writes: The pull is based on converting the backend driver into a multiqueue driver and exposing more than one queue to the frontend. As such we had to modify the frontend and also fix a bunch of bugs around this. The original work is based on Arianna Avanzini's work as an OPW intern; Bob took over the work and has been massaging it for quite some time. Also included are 64KB page support for ARM and various bug fixes.
-rw-r--r--  drivers/block/xen-blkback/blkback.c    391
-rw-r--r--  drivers/block/xen-blkback/common.h      86
-rw-r--r--  drivers/block/xen-blkback/xenbus.c     416
-rw-r--r--  drivers/block/xen-blkfront.c          1061
-rw-r--r--  include/xen/interface/io/blkif.h        48
5 files changed, 1292 insertions, 710 deletions
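
Before the full diff: the central refactor described above is splitting the per-device struct xen_blkif into a per-queue struct xen_blkif_ring, with the device holding an array of rings whose count is capped by the new max_queues module parameter (defaulting to the number of online CPUs). The following user-space C sketch only illustrates that layout under simplified assumptions; the field set and the blkif_alloc_rings() helper are invented for the example and are not the kernel code.

/*
 * Simplified sketch of the multiqueue split: per-device state stays in
 * xen_blkif, per-queue state moves to xen_blkif_ring, and the device
 * allocates nr_rings of them, capped by a max_queues knob that defaults
 * to the number of online CPUs.
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

struct xen_blkif_ring {            /* per-queue (per-ring) state */
    unsigned int irq;              /* event channel bound to this ring */
    unsigned long long st_rd_req;  /* per-ring statistics */
    unsigned long long st_wr_req;
    struct xen_blkif *blkif;       /* back pointer to the device */
};

struct xen_blkif {                 /* per-device state */
    unsigned int domid;
    unsigned int nr_rings;
    struct xen_blkif_ring *rings;  /* all rings for this device */
};

static unsigned int xenblk_max_queues; /* 0 means "use the online CPU count" */

static int blkif_alloc_rings(struct xen_blkif *blkif, unsigned int requested)
{
    unsigned int limit = xenblk_max_queues ? xenblk_max_queues
                         : (unsigned int)sysconf(_SC_NPROCESSORS_ONLN);
    unsigned int i;

    blkif->nr_rings = requested < limit ? requested : limit;
    blkif->rings = calloc(blkif->nr_rings, sizeof(*blkif->rings));
    if (!blkif->rings)
        return -1;

    for (i = 0; i < blkif->nr_rings; i++)
        blkif->rings[i].blkif = blkif; /* each ring keeps a back pointer */

    return 0;
}

int main(void)
{
    struct xen_blkif blkif = { .domid = 1 };

    if (blkif_alloc_rings(&blkif, 8))
        return 1;
    printf("dom%u: using %u rings\n", blkif.domid, blkif.nr_rings);
    free(blkif.rings);
    return 0;
}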
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index f9099940c272..148930c8c121 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -84,6 +84,16 @@ MODULE_PARM_DESC(max_persistent_grants,
84 "Maximum number of grants to map persistently"); 84 "Maximum number of grants to map persistently");
85 85
86/* 86/*
87 * Maximum number of rings/queues blkback supports, allow as many queues as there
88 * are CPUs if user has not specified a value.
89 */
90unsigned int xenblk_max_queues;
91module_param_named(max_queues, xenblk_max_queues, uint, 0644);
92MODULE_PARM_DESC(max_queues,
93 "Maximum number of hardware queues per virtual disk." \
94 "By default it is the number of online CPUs.");
95
96/*
87 * Maximum order of pages to be used for the shared ring between front and 97 * Maximum order of pages to be used for the shared ring between front and
88 * backend, 4KB page granularity is used. 98 * backend, 4KB page granularity is used.
89 */ 99 */
@@ -113,71 +123,71 @@ module_param(log_stats, int, 0644);
113/* Number of free pages to remove on each call to gnttab_free_pages */ 123/* Number of free pages to remove on each call to gnttab_free_pages */
114#define NUM_BATCH_FREE_PAGES 10 124#define NUM_BATCH_FREE_PAGES 10
115 125
116static inline int get_free_page(struct xen_blkif *blkif, struct page **page) 126static inline int get_free_page(struct xen_blkif_ring *ring, struct page **page)
117{ 127{
118 unsigned long flags; 128 unsigned long flags;
119 129
120 spin_lock_irqsave(&blkif->free_pages_lock, flags); 130 spin_lock_irqsave(&ring->free_pages_lock, flags);
121 if (list_empty(&blkif->free_pages)) { 131 if (list_empty(&ring->free_pages)) {
122 BUG_ON(blkif->free_pages_num != 0); 132 BUG_ON(ring->free_pages_num != 0);
123 spin_unlock_irqrestore(&blkif->free_pages_lock, flags); 133 spin_unlock_irqrestore(&ring->free_pages_lock, flags);
124 return gnttab_alloc_pages(1, page); 134 return gnttab_alloc_pages(1, page);
125 } 135 }
126 BUG_ON(blkif->free_pages_num == 0); 136 BUG_ON(ring->free_pages_num == 0);
127 page[0] = list_first_entry(&blkif->free_pages, struct page, lru); 137 page[0] = list_first_entry(&ring->free_pages, struct page, lru);
128 list_del(&page[0]->lru); 138 list_del(&page[0]->lru);
129 blkif->free_pages_num--; 139 ring->free_pages_num--;
130 spin_unlock_irqrestore(&blkif->free_pages_lock, flags); 140 spin_unlock_irqrestore(&ring->free_pages_lock, flags);
131 141
132 return 0; 142 return 0;
133} 143}
134 144
135static inline void put_free_pages(struct xen_blkif *blkif, struct page **page, 145static inline void put_free_pages(struct xen_blkif_ring *ring, struct page **page,
136 int num) 146 int num)
137{ 147{
138 unsigned long flags; 148 unsigned long flags;
139 int i; 149 int i;
140 150
141 spin_lock_irqsave(&blkif->free_pages_lock, flags); 151 spin_lock_irqsave(&ring->free_pages_lock, flags);
142 for (i = 0; i < num; i++) 152 for (i = 0; i < num; i++)
143 list_add(&page[i]->lru, &blkif->free_pages); 153 list_add(&page[i]->lru, &ring->free_pages);
144 blkif->free_pages_num += num; 154 ring->free_pages_num += num;
145 spin_unlock_irqrestore(&blkif->free_pages_lock, flags); 155 spin_unlock_irqrestore(&ring->free_pages_lock, flags);
146} 156}
147 157
148static inline void shrink_free_pagepool(struct xen_blkif *blkif, int num) 158static inline void shrink_free_pagepool(struct xen_blkif_ring *ring, int num)
149{ 159{
150 /* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */ 160 /* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */
151 struct page *page[NUM_BATCH_FREE_PAGES]; 161 struct page *page[NUM_BATCH_FREE_PAGES];
152 unsigned int num_pages = 0; 162 unsigned int num_pages = 0;
153 unsigned long flags; 163 unsigned long flags;
154 164
155 spin_lock_irqsave(&blkif->free_pages_lock, flags); 165 spin_lock_irqsave(&ring->free_pages_lock, flags);
156 while (blkif->free_pages_num > num) { 166 while (ring->free_pages_num > num) {
157 BUG_ON(list_empty(&blkif->free_pages)); 167 BUG_ON(list_empty(&ring->free_pages));
158 page[num_pages] = list_first_entry(&blkif->free_pages, 168 page[num_pages] = list_first_entry(&ring->free_pages,
159 struct page, lru); 169 struct page, lru);
160 list_del(&page[num_pages]->lru); 170 list_del(&page[num_pages]->lru);
161 blkif->free_pages_num--; 171 ring->free_pages_num--;
162 if (++num_pages == NUM_BATCH_FREE_PAGES) { 172 if (++num_pages == NUM_BATCH_FREE_PAGES) {
163 spin_unlock_irqrestore(&blkif->free_pages_lock, flags); 173 spin_unlock_irqrestore(&ring->free_pages_lock, flags);
164 gnttab_free_pages(num_pages, page); 174 gnttab_free_pages(num_pages, page);
165 spin_lock_irqsave(&blkif->free_pages_lock, flags); 175 spin_lock_irqsave(&ring->free_pages_lock, flags);
166 num_pages = 0; 176 num_pages = 0;
167 } 177 }
168 } 178 }
169 spin_unlock_irqrestore(&blkif->free_pages_lock, flags); 179 spin_unlock_irqrestore(&ring->free_pages_lock, flags);
170 if (num_pages != 0) 180 if (num_pages != 0)
171 gnttab_free_pages(num_pages, page); 181 gnttab_free_pages(num_pages, page);
172} 182}
173 183
174#define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page))) 184#define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page)))
175 185
176static int do_block_io_op(struct xen_blkif *blkif); 186static int do_block_io_op(struct xen_blkif_ring *ring);
177static int dispatch_rw_block_io(struct xen_blkif *blkif, 187static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
178 struct blkif_request *req, 188 struct blkif_request *req,
179 struct pending_req *pending_req); 189 struct pending_req *pending_req);
180static void make_response(struct xen_blkif *blkif, u64 id, 190static void make_response(struct xen_blkif_ring *ring, u64 id,
181 unsigned short op, int st); 191 unsigned short op, int st);
182 192
183#define foreach_grant_safe(pos, n, rbtree, node) \ 193#define foreach_grant_safe(pos, n, rbtree, node) \
@@ -190,7 +200,7 @@ static void make_response(struct xen_blkif *blkif, u64 id,
190 200
191/* 201/*
192 * We don't need locking around the persistent grant helpers 202 * We don't need locking around the persistent grant helpers
193 * because blkback uses a single-thread for each backed, so we 203 * because blkback uses a single-thread for each backend, so we
194 * can be sure that this functions will never be called recursively. 204 * can be sure that this functions will never be called recursively.
195 * 205 *
196 * The only exception to that is put_persistent_grant, that can be called 206 * The only exception to that is put_persistent_grant, that can be called
@@ -198,19 +208,20 @@ static void make_response(struct xen_blkif *blkif, u64 id,
198 * bit operations to modify the flags of a persistent grant and to count 208 * bit operations to modify the flags of a persistent grant and to count
199 * the number of used grants. 209 * the number of used grants.
200 */ 210 */
201static int add_persistent_gnt(struct xen_blkif *blkif, 211static int add_persistent_gnt(struct xen_blkif_ring *ring,
202 struct persistent_gnt *persistent_gnt) 212 struct persistent_gnt *persistent_gnt)
203{ 213{
204 struct rb_node **new = NULL, *parent = NULL; 214 struct rb_node **new = NULL, *parent = NULL;
205 struct persistent_gnt *this; 215 struct persistent_gnt *this;
216 struct xen_blkif *blkif = ring->blkif;
206 217
207 if (blkif->persistent_gnt_c >= xen_blkif_max_pgrants) { 218 if (ring->persistent_gnt_c >= xen_blkif_max_pgrants) {
208 if (!blkif->vbd.overflow_max_grants) 219 if (!blkif->vbd.overflow_max_grants)
209 blkif->vbd.overflow_max_grants = 1; 220 blkif->vbd.overflow_max_grants = 1;
210 return -EBUSY; 221 return -EBUSY;
211 } 222 }
212 /* Figure out where to put new node */ 223 /* Figure out where to put new node */
213 new = &blkif->persistent_gnts.rb_node; 224 new = &ring->persistent_gnts.rb_node;
214 while (*new) { 225 while (*new) {
215 this = container_of(*new, struct persistent_gnt, node); 226 this = container_of(*new, struct persistent_gnt, node);
216 227
@@ -229,19 +240,19 @@ static int add_persistent_gnt(struct xen_blkif *blkif,
229 set_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags); 240 set_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
230 /* Add new node and rebalance tree. */ 241 /* Add new node and rebalance tree. */
231 rb_link_node(&(persistent_gnt->node), parent, new); 242 rb_link_node(&(persistent_gnt->node), parent, new);
232 rb_insert_color(&(persistent_gnt->node), &blkif->persistent_gnts); 243 rb_insert_color(&(persistent_gnt->node), &ring->persistent_gnts);
233 blkif->persistent_gnt_c++; 244 ring->persistent_gnt_c++;
234 atomic_inc(&blkif->persistent_gnt_in_use); 245 atomic_inc(&ring->persistent_gnt_in_use);
235 return 0; 246 return 0;
236} 247}
237 248
238static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif, 249static struct persistent_gnt *get_persistent_gnt(struct xen_blkif_ring *ring,
239 grant_ref_t gref) 250 grant_ref_t gref)
240{ 251{
241 struct persistent_gnt *data; 252 struct persistent_gnt *data;
242 struct rb_node *node = NULL; 253 struct rb_node *node = NULL;
243 254
244 node = blkif->persistent_gnts.rb_node; 255 node = ring->persistent_gnts.rb_node;
245 while (node) { 256 while (node) {
246 data = container_of(node, struct persistent_gnt, node); 257 data = container_of(node, struct persistent_gnt, node);
247 258
@@ -255,24 +266,24 @@ static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif,
255 return NULL; 266 return NULL;
256 } 267 }
257 set_bit(PERSISTENT_GNT_ACTIVE, data->flags); 268 set_bit(PERSISTENT_GNT_ACTIVE, data->flags);
258 atomic_inc(&blkif->persistent_gnt_in_use); 269 atomic_inc(&ring->persistent_gnt_in_use);
259 return data; 270 return data;
260 } 271 }
261 } 272 }
262 return NULL; 273 return NULL;
263} 274}
264 275
265static void put_persistent_gnt(struct xen_blkif *blkif, 276static void put_persistent_gnt(struct xen_blkif_ring *ring,
266 struct persistent_gnt *persistent_gnt) 277 struct persistent_gnt *persistent_gnt)
267{ 278{
268 if(!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags)) 279 if(!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags))
269 pr_alert_ratelimited("freeing a grant already unused\n"); 280 pr_alert_ratelimited("freeing a grant already unused\n");
270 set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags); 281 set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags);
271 clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags); 282 clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
272 atomic_dec(&blkif->persistent_gnt_in_use); 283 atomic_dec(&ring->persistent_gnt_in_use);
273} 284}
274 285
275static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root, 286static void free_persistent_gnts(struct xen_blkif_ring *ring, struct rb_root *root,
276 unsigned int num) 287 unsigned int num)
277{ 288{
278 struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 289 struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
@@ -303,7 +314,7 @@ static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root,
303 unmap_data.count = segs_to_unmap; 314 unmap_data.count = segs_to_unmap;
304 BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); 315 BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
305 316
306 put_free_pages(blkif, pages, segs_to_unmap); 317 put_free_pages(ring, pages, segs_to_unmap);
307 segs_to_unmap = 0; 318 segs_to_unmap = 0;
308 } 319 }
309 320
@@ -320,15 +331,15 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work)
320 struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 331 struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
321 struct persistent_gnt *persistent_gnt; 332 struct persistent_gnt *persistent_gnt;
322 int segs_to_unmap = 0; 333 int segs_to_unmap = 0;
323 struct xen_blkif *blkif = container_of(work, typeof(*blkif), persistent_purge_work); 334 struct xen_blkif_ring *ring = container_of(work, typeof(*ring), persistent_purge_work);
324 struct gntab_unmap_queue_data unmap_data; 335 struct gntab_unmap_queue_data unmap_data;
325 336
326 unmap_data.pages = pages; 337 unmap_data.pages = pages;
327 unmap_data.unmap_ops = unmap; 338 unmap_data.unmap_ops = unmap;
328 unmap_data.kunmap_ops = NULL; 339 unmap_data.kunmap_ops = NULL;
329 340
330 while(!list_empty(&blkif->persistent_purge_list)) { 341 while(!list_empty(&ring->persistent_purge_list)) {
331 persistent_gnt = list_first_entry(&blkif->persistent_purge_list, 342 persistent_gnt = list_first_entry(&ring->persistent_purge_list,
332 struct persistent_gnt, 343 struct persistent_gnt,
333 remove_node); 344 remove_node);
334 list_del(&persistent_gnt->remove_node); 345 list_del(&persistent_gnt->remove_node);
@@ -343,7 +354,7 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work)
343 if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) { 354 if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
344 unmap_data.count = segs_to_unmap; 355 unmap_data.count = segs_to_unmap;
345 BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); 356 BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
346 put_free_pages(blkif, pages, segs_to_unmap); 357 put_free_pages(ring, pages, segs_to_unmap);
347 segs_to_unmap = 0; 358 segs_to_unmap = 0;
348 } 359 }
349 kfree(persistent_gnt); 360 kfree(persistent_gnt);
@@ -351,11 +362,11 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work)
351 if (segs_to_unmap > 0) { 362 if (segs_to_unmap > 0) {
352 unmap_data.count = segs_to_unmap; 363 unmap_data.count = segs_to_unmap;
353 BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); 364 BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
354 put_free_pages(blkif, pages, segs_to_unmap); 365 put_free_pages(ring, pages, segs_to_unmap);
355 } 366 }
356} 367}
357 368
358static void purge_persistent_gnt(struct xen_blkif *blkif) 369static void purge_persistent_gnt(struct xen_blkif_ring *ring)
359{ 370{
360 struct persistent_gnt *persistent_gnt; 371 struct persistent_gnt *persistent_gnt;
361 struct rb_node *n; 372 struct rb_node *n;
@@ -363,23 +374,23 @@ static void purge_persistent_gnt(struct xen_blkif *blkif)
363 bool scan_used = false, clean_used = false; 374 bool scan_used = false, clean_used = false;
364 struct rb_root *root; 375 struct rb_root *root;
365 376
366 if (blkif->persistent_gnt_c < xen_blkif_max_pgrants || 377 if (ring->persistent_gnt_c < xen_blkif_max_pgrants ||
367 (blkif->persistent_gnt_c == xen_blkif_max_pgrants && 378 (ring->persistent_gnt_c == xen_blkif_max_pgrants &&
368 !blkif->vbd.overflow_max_grants)) { 379 !ring->blkif->vbd.overflow_max_grants)) {
369 return; 380 goto out;
370 } 381 }
371 382
372 if (work_busy(&blkif->persistent_purge_work)) { 383 if (work_busy(&ring->persistent_purge_work)) {
373 pr_alert_ratelimited("Scheduled work from previous purge is still busy, cannot purge list\n"); 384 pr_alert_ratelimited("Scheduled work from previous purge is still busy, cannot purge list\n");
374 return; 385 goto out;
375 } 386 }
376 387
377 num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN; 388 num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN;
378 num_clean = blkif->persistent_gnt_c - xen_blkif_max_pgrants + num_clean; 389 num_clean = ring->persistent_gnt_c - xen_blkif_max_pgrants + num_clean;
379 num_clean = min(blkif->persistent_gnt_c, num_clean); 390 num_clean = min(ring->persistent_gnt_c, num_clean);
380 if ((num_clean == 0) || 391 if ((num_clean == 0) ||
381 (num_clean > (blkif->persistent_gnt_c - atomic_read(&blkif->persistent_gnt_in_use)))) 392 (num_clean > (ring->persistent_gnt_c - atomic_read(&ring->persistent_gnt_in_use))))
382 return; 393 goto out;
383 394
384 /* 395 /*
385 * At this point, we can assure that there will be no calls 396 * At this point, we can assure that there will be no calls
@@ -394,8 +405,8 @@ static void purge_persistent_gnt(struct xen_blkif *blkif)
394 405
395 pr_debug("Going to purge %u persistent grants\n", num_clean); 406 pr_debug("Going to purge %u persistent grants\n", num_clean);
396 407
397 BUG_ON(!list_empty(&blkif->persistent_purge_list)); 408 BUG_ON(!list_empty(&ring->persistent_purge_list));
398 root = &blkif->persistent_gnts; 409 root = &ring->persistent_gnts;
399purge_list: 410purge_list:
400 foreach_grant_safe(persistent_gnt, n, root, node) { 411 foreach_grant_safe(persistent_gnt, n, root, node) {
401 BUG_ON(persistent_gnt->handle == 412 BUG_ON(persistent_gnt->handle ==
@@ -414,7 +425,7 @@ purge_list:
414 425
415 rb_erase(&persistent_gnt->node, root); 426 rb_erase(&persistent_gnt->node, root);
416 list_add(&persistent_gnt->remove_node, 427 list_add(&persistent_gnt->remove_node,
417 &blkif->persistent_purge_list); 428 &ring->persistent_purge_list);
418 if (--num_clean == 0) 429 if (--num_clean == 0)
419 goto finished; 430 goto finished;
420 } 431 }
@@ -435,30 +446,32 @@ finished:
435 goto purge_list; 446 goto purge_list;
436 } 447 }
437 448
438 blkif->persistent_gnt_c -= (total - num_clean); 449 ring->persistent_gnt_c -= (total - num_clean);
439 blkif->vbd.overflow_max_grants = 0; 450 ring->blkif->vbd.overflow_max_grants = 0;
440 451
441 /* We can defer this work */ 452 /* We can defer this work */
442 schedule_work(&blkif->persistent_purge_work); 453 schedule_work(&ring->persistent_purge_work);
443 pr_debug("Purged %u/%u\n", (total - num_clean), total); 454 pr_debug("Purged %u/%u\n", (total - num_clean), total);
455
456out:
444 return; 457 return;
445} 458}
446 459
447/* 460/*
448 * Retrieve from the 'pending_reqs' a free pending_req structure to be used. 461 * Retrieve from the 'pending_reqs' a free pending_req structure to be used.
449 */ 462 */
450static struct pending_req *alloc_req(struct xen_blkif *blkif) 463static struct pending_req *alloc_req(struct xen_blkif_ring *ring)
451{ 464{
452 struct pending_req *req = NULL; 465 struct pending_req *req = NULL;
453 unsigned long flags; 466 unsigned long flags;
454 467
455 spin_lock_irqsave(&blkif->pending_free_lock, flags); 468 spin_lock_irqsave(&ring->pending_free_lock, flags);
456 if (!list_empty(&blkif->pending_free)) { 469 if (!list_empty(&ring->pending_free)) {
457 req = list_entry(blkif->pending_free.next, struct pending_req, 470 req = list_entry(ring->pending_free.next, struct pending_req,
458 free_list); 471 free_list);
459 list_del(&req->free_list); 472 list_del(&req->free_list);
460 } 473 }
461 spin_unlock_irqrestore(&blkif->pending_free_lock, flags); 474 spin_unlock_irqrestore(&ring->pending_free_lock, flags);
462 return req; 475 return req;
463} 476}
464 477
@@ -466,17 +479,17 @@ static struct pending_req *alloc_req(struct xen_blkif *blkif)
466 * Return the 'pending_req' structure back to the freepool. We also 479 * Return the 'pending_req' structure back to the freepool. We also
467 * wake up the thread if it was waiting for a free page. 480 * wake up the thread if it was waiting for a free page.
468 */ 481 */
469static void free_req(struct xen_blkif *blkif, struct pending_req *req) 482static void free_req(struct xen_blkif_ring *ring, struct pending_req *req)
470{ 483{
471 unsigned long flags; 484 unsigned long flags;
472 int was_empty; 485 int was_empty;
473 486
474 spin_lock_irqsave(&blkif->pending_free_lock, flags); 487 spin_lock_irqsave(&ring->pending_free_lock, flags);
475 was_empty = list_empty(&blkif->pending_free); 488 was_empty = list_empty(&ring->pending_free);
476 list_add(&req->free_list, &blkif->pending_free); 489 list_add(&req->free_list, &ring->pending_free);
477 spin_unlock_irqrestore(&blkif->pending_free_lock, flags); 490 spin_unlock_irqrestore(&ring->pending_free_lock, flags);
478 if (was_empty) 491 if (was_empty)
479 wake_up(&blkif->pending_free_wq); 492 wake_up(&ring->pending_free_wq);
480} 493}
481 494
482/* 495/*
@@ -556,10 +569,10 @@ abort:
556/* 569/*
557 * Notification from the guest OS. 570 * Notification from the guest OS.
558 */ 571 */
559static void blkif_notify_work(struct xen_blkif *blkif) 572static void blkif_notify_work(struct xen_blkif_ring *ring)
560{ 573{
561 blkif->waiting_reqs = 1; 574 ring->waiting_reqs = 1;
562 wake_up(&blkif->wq); 575 wake_up(&ring->wq);
563} 576}
564 577
565irqreturn_t xen_blkif_be_int(int irq, void *dev_id) 578irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
@@ -572,31 +585,33 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
572 * SCHEDULER FUNCTIONS 585 * SCHEDULER FUNCTIONS
573 */ 586 */
574 587
575static void print_stats(struct xen_blkif *blkif) 588static void print_stats(struct xen_blkif_ring *ring)
576{ 589{
577 pr_info("(%s): oo %3llu | rd %4llu | wr %4llu | f %4llu" 590 pr_info("(%s): oo %3llu | rd %4llu | wr %4llu | f %4llu"
578 " | ds %4llu | pg: %4u/%4d\n", 591 " | ds %4llu | pg: %4u/%4d\n",
579 current->comm, blkif->st_oo_req, 592 current->comm, ring->st_oo_req,
580 blkif->st_rd_req, blkif->st_wr_req, 593 ring->st_rd_req, ring->st_wr_req,
581 blkif->st_f_req, blkif->st_ds_req, 594 ring->st_f_req, ring->st_ds_req,
582 blkif->persistent_gnt_c, 595 ring->persistent_gnt_c,
583 xen_blkif_max_pgrants); 596 xen_blkif_max_pgrants);
584 blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); 597 ring->st_print = jiffies + msecs_to_jiffies(10 * 1000);
585 blkif->st_rd_req = 0; 598 ring->st_rd_req = 0;
586 blkif->st_wr_req = 0; 599 ring->st_wr_req = 0;
587 blkif->st_oo_req = 0; 600 ring->st_oo_req = 0;
588 blkif->st_ds_req = 0; 601 ring->st_ds_req = 0;
589} 602}
590 603
591int xen_blkif_schedule(void *arg) 604int xen_blkif_schedule(void *arg)
592{ 605{
593 struct xen_blkif *blkif = arg; 606 struct xen_blkif_ring *ring = arg;
607 struct xen_blkif *blkif = ring->blkif;
594 struct xen_vbd *vbd = &blkif->vbd; 608 struct xen_vbd *vbd = &blkif->vbd;
595 unsigned long timeout; 609 unsigned long timeout;
596 int ret; 610 int ret;
597 611
598 xen_blkif_get(blkif); 612 xen_blkif_get(blkif);
599 613
614 set_freezable();
600 while (!kthread_should_stop()) { 615 while (!kthread_should_stop()) {
601 if (try_to_freeze()) 616 if (try_to_freeze())
602 continue; 617 continue;
@@ -606,50 +621,50 @@ int xen_blkif_schedule(void *arg)
606 timeout = msecs_to_jiffies(LRU_INTERVAL); 621 timeout = msecs_to_jiffies(LRU_INTERVAL);
607 622
608 timeout = wait_event_interruptible_timeout( 623 timeout = wait_event_interruptible_timeout(
609 blkif->wq, 624 ring->wq,
610 blkif->waiting_reqs || kthread_should_stop(), 625 ring->waiting_reqs || kthread_should_stop(),
611 timeout); 626 timeout);
612 if (timeout == 0) 627 if (timeout == 0)
613 goto purge_gnt_list; 628 goto purge_gnt_list;
614 timeout = wait_event_interruptible_timeout( 629 timeout = wait_event_interruptible_timeout(
615 blkif->pending_free_wq, 630 ring->pending_free_wq,
616 !list_empty(&blkif->pending_free) || 631 !list_empty(&ring->pending_free) ||
617 kthread_should_stop(), 632 kthread_should_stop(),
618 timeout); 633 timeout);
619 if (timeout == 0) 634 if (timeout == 0)
620 goto purge_gnt_list; 635 goto purge_gnt_list;
621 636
622 blkif->waiting_reqs = 0; 637 ring->waiting_reqs = 0;
623 smp_mb(); /* clear flag *before* checking for work */ 638 smp_mb(); /* clear flag *before* checking for work */
624 639
625 ret = do_block_io_op(blkif); 640 ret = do_block_io_op(ring);
626 if (ret > 0) 641 if (ret > 0)
627 blkif->waiting_reqs = 1; 642 ring->waiting_reqs = 1;
628 if (ret == -EACCES) 643 if (ret == -EACCES)
629 wait_event_interruptible(blkif->shutdown_wq, 644 wait_event_interruptible(ring->shutdown_wq,
630 kthread_should_stop()); 645 kthread_should_stop());
631 646
632purge_gnt_list: 647purge_gnt_list:
633 if (blkif->vbd.feature_gnt_persistent && 648 if (blkif->vbd.feature_gnt_persistent &&
634 time_after(jiffies, blkif->next_lru)) { 649 time_after(jiffies, ring->next_lru)) {
635 purge_persistent_gnt(blkif); 650 purge_persistent_gnt(ring);
636 blkif->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL); 651 ring->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL);
637 } 652 }
638 653
639 /* Shrink if we have more than xen_blkif_max_buffer_pages */ 654 /* Shrink if we have more than xen_blkif_max_buffer_pages */
640 shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages); 655 shrink_free_pagepool(ring, xen_blkif_max_buffer_pages);
641 656
642 if (log_stats && time_after(jiffies, blkif->st_print)) 657 if (log_stats && time_after(jiffies, ring->st_print))
643 print_stats(blkif); 658 print_stats(ring);
644 } 659 }
645 660
646 /* Drain pending purge work */ 661 /* Drain pending purge work */
647 flush_work(&blkif->persistent_purge_work); 662 flush_work(&ring->persistent_purge_work);
648 663
649 if (log_stats) 664 if (log_stats)
650 print_stats(blkif); 665 print_stats(ring);
651 666
652 blkif->xenblkd = NULL; 667 ring->xenblkd = NULL;
653 xen_blkif_put(blkif); 668 xen_blkif_put(blkif);
654 669
655 return 0; 670 return 0;
@@ -658,22 +673,22 @@ purge_gnt_list:
658/* 673/*
659 * Remove persistent grants and empty the pool of free pages 674 * Remove persistent grants and empty the pool of free pages
660 */ 675 */
661void xen_blkbk_free_caches(struct xen_blkif *blkif) 676void xen_blkbk_free_caches(struct xen_blkif_ring *ring)
662{ 677{
663 /* Free all persistent grant pages */ 678 /* Free all persistent grant pages */
664 if (!RB_EMPTY_ROOT(&blkif->persistent_gnts)) 679 if (!RB_EMPTY_ROOT(&ring->persistent_gnts))
665 free_persistent_gnts(blkif, &blkif->persistent_gnts, 680 free_persistent_gnts(ring, &ring->persistent_gnts,
666 blkif->persistent_gnt_c); 681 ring->persistent_gnt_c);
667 682
668 BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); 683 BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts));
669 blkif->persistent_gnt_c = 0; 684 ring->persistent_gnt_c = 0;
670 685
671 /* Since we are shutting down remove all pages from the buffer */ 686 /* Since we are shutting down remove all pages from the buffer */
672 shrink_free_pagepool(blkif, 0 /* All */); 687 shrink_free_pagepool(ring, 0 /* All */);
673} 688}
674 689
675static unsigned int xen_blkbk_unmap_prepare( 690static unsigned int xen_blkbk_unmap_prepare(
676 struct xen_blkif *blkif, 691 struct xen_blkif_ring *ring,
677 struct grant_page **pages, 692 struct grant_page **pages,
678 unsigned int num, 693 unsigned int num,
679 struct gnttab_unmap_grant_ref *unmap_ops, 694 struct gnttab_unmap_grant_ref *unmap_ops,
@@ -683,7 +698,7 @@ static unsigned int xen_blkbk_unmap_prepare(
683 698
684 for (i = 0; i < num; i++) { 699 for (i = 0; i < num; i++) {
685 if (pages[i]->persistent_gnt != NULL) { 700 if (pages[i]->persistent_gnt != NULL) {
686 put_persistent_gnt(blkif, pages[i]->persistent_gnt); 701 put_persistent_gnt(ring, pages[i]->persistent_gnt);
687 continue; 702 continue;
688 } 703 }
689 if (pages[i]->handle == BLKBACK_INVALID_HANDLE) 704 if (pages[i]->handle == BLKBACK_INVALID_HANDLE)
@@ -700,17 +715,18 @@ static unsigned int xen_blkbk_unmap_prepare(
700 715
701static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_queue_data *data) 716static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_queue_data *data)
702{ 717{
703 struct pending_req* pending_req = (struct pending_req*) (data->data); 718 struct pending_req *pending_req = (struct pending_req *)(data->data);
704 struct xen_blkif *blkif = pending_req->blkif; 719 struct xen_blkif_ring *ring = pending_req->ring;
720 struct xen_blkif *blkif = ring->blkif;
705 721
706 /* BUG_ON used to reproduce existing behaviour, 722 /* BUG_ON used to reproduce existing behaviour,
707 but is this the best way to deal with this? */ 723 but is this the best way to deal with this? */
708 BUG_ON(result); 724 BUG_ON(result);
709 725
710 put_free_pages(blkif, data->pages, data->count); 726 put_free_pages(ring, data->pages, data->count);
711 make_response(blkif, pending_req->id, 727 make_response(ring, pending_req->id,
712 pending_req->operation, pending_req->status); 728 pending_req->operation, pending_req->status);
713 free_req(blkif, pending_req); 729 free_req(ring, pending_req);
714 /* 730 /*
715 * Make sure the request is freed before releasing blkif, 731 * Make sure the request is freed before releasing blkif,
716 * or there could be a race between free_req and the 732 * or there could be a race between free_req and the
@@ -723,7 +739,7 @@ static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_
723 * pending_free_wq if there's a drain going on, but it has 739 * pending_free_wq if there's a drain going on, but it has
724 * to be taken into account if the current model is changed. 740 * to be taken into account if the current model is changed.
725 */ 741 */
726 if (atomic_dec_and_test(&blkif->inflight) && atomic_read(&blkif->drain)) { 742 if (atomic_dec_and_test(&ring->inflight) && atomic_read(&blkif->drain)) {
727 complete(&blkif->drain_complete); 743 complete(&blkif->drain_complete);
728 } 744 }
729 xen_blkif_put(blkif); 745 xen_blkif_put(blkif);
@@ -732,11 +748,11 @@ static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_
732static void xen_blkbk_unmap_and_respond(struct pending_req *req) 748static void xen_blkbk_unmap_and_respond(struct pending_req *req)
733{ 749{
734 struct gntab_unmap_queue_data* work = &req->gnttab_unmap_data; 750 struct gntab_unmap_queue_data* work = &req->gnttab_unmap_data;
735 struct xen_blkif *blkif = req->blkif; 751 struct xen_blkif_ring *ring = req->ring;
736 struct grant_page **pages = req->segments; 752 struct grant_page **pages = req->segments;
737 unsigned int invcount; 753 unsigned int invcount;
738 754
739 invcount = xen_blkbk_unmap_prepare(blkif, pages, req->nr_segs, 755 invcount = xen_blkbk_unmap_prepare(ring, pages, req->nr_segs,
740 req->unmap, req->unmap_pages); 756 req->unmap, req->unmap_pages);
741 757
742 work->data = req; 758 work->data = req;
@@ -757,7 +773,7 @@ static void xen_blkbk_unmap_and_respond(struct pending_req *req)
757 * of hypercalls, but since this is only used in error paths there's 773 * of hypercalls, but since this is only used in error paths there's
758 * no real need. 774 * no real need.
759 */ 775 */
760static void xen_blkbk_unmap(struct xen_blkif *blkif, 776static void xen_blkbk_unmap(struct xen_blkif_ring *ring,
761 struct grant_page *pages[], 777 struct grant_page *pages[],
762 int num) 778 int num)
763{ 779{
@@ -768,20 +784,20 @@ static void xen_blkbk_unmap(struct xen_blkif *blkif,
768 784
769 while (num) { 785 while (num) {
770 unsigned int batch = min(num, BLKIF_MAX_SEGMENTS_PER_REQUEST); 786 unsigned int batch = min(num, BLKIF_MAX_SEGMENTS_PER_REQUEST);
771 787
772 invcount = xen_blkbk_unmap_prepare(blkif, pages, batch, 788 invcount = xen_blkbk_unmap_prepare(ring, pages, batch,
773 unmap, unmap_pages); 789 unmap, unmap_pages);
774 if (invcount) { 790 if (invcount) {
775 ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount); 791 ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount);
776 BUG_ON(ret); 792 BUG_ON(ret);
777 put_free_pages(blkif, unmap_pages, invcount); 793 put_free_pages(ring, unmap_pages, invcount);
778 } 794 }
779 pages += batch; 795 pages += batch;
780 num -= batch; 796 num -= batch;
781 } 797 }
782} 798}
783 799
784static int xen_blkbk_map(struct xen_blkif *blkif, 800static int xen_blkbk_map(struct xen_blkif_ring *ring,
785 struct grant_page *pages[], 801 struct grant_page *pages[],
786 int num, bool ro) 802 int num, bool ro)
787{ 803{
@@ -794,6 +810,7 @@ static int xen_blkbk_map(struct xen_blkif *blkif,
794 int ret = 0; 810 int ret = 0;
795 int last_map = 0, map_until = 0; 811 int last_map = 0, map_until = 0;
796 int use_persistent_gnts; 812 int use_persistent_gnts;
813 struct xen_blkif *blkif = ring->blkif;
797 814
798 use_persistent_gnts = (blkif->vbd.feature_gnt_persistent); 815 use_persistent_gnts = (blkif->vbd.feature_gnt_persistent);
799 816
@@ -806,10 +823,11 @@ again:
806 for (i = map_until; i < num; i++) { 823 for (i = map_until; i < num; i++) {
807 uint32_t flags; 824 uint32_t flags;
808 825
809 if (use_persistent_gnts) 826 if (use_persistent_gnts) {
810 persistent_gnt = get_persistent_gnt( 827 persistent_gnt = get_persistent_gnt(
811 blkif, 828 ring,
812 pages[i]->gref); 829 pages[i]->gref);
830 }
813 831
814 if (persistent_gnt) { 832 if (persistent_gnt) {
815 /* 833 /*
@@ -819,7 +837,7 @@ again:
819 pages[i]->page = persistent_gnt->page; 837 pages[i]->page = persistent_gnt->page;
820 pages[i]->persistent_gnt = persistent_gnt; 838 pages[i]->persistent_gnt = persistent_gnt;
821 } else { 839 } else {
822 if (get_free_page(blkif, &pages[i]->page)) 840 if (get_free_page(ring, &pages[i]->page))
823 goto out_of_memory; 841 goto out_of_memory;
824 addr = vaddr(pages[i]->page); 842 addr = vaddr(pages[i]->page);
825 pages_to_gnt[segs_to_map] = pages[i]->page; 843 pages_to_gnt[segs_to_map] = pages[i]->page;
@@ -852,7 +870,7 @@ again:
852 BUG_ON(new_map_idx >= segs_to_map); 870 BUG_ON(new_map_idx >= segs_to_map);
853 if (unlikely(map[new_map_idx].status != 0)) { 871 if (unlikely(map[new_map_idx].status != 0)) {
854 pr_debug("invalid buffer -- could not remap it\n"); 872 pr_debug("invalid buffer -- could not remap it\n");
855 put_free_pages(blkif, &pages[seg_idx]->page, 1); 873 put_free_pages(ring, &pages[seg_idx]->page, 1);
856 pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE; 874 pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE;
857 ret |= 1; 875 ret |= 1;
858 goto next; 876 goto next;
@@ -862,7 +880,7 @@ again:
862 continue; 880 continue;
863 } 881 }
864 if (use_persistent_gnts && 882 if (use_persistent_gnts &&
865 blkif->persistent_gnt_c < xen_blkif_max_pgrants) { 883 ring->persistent_gnt_c < xen_blkif_max_pgrants) {
866 /* 884 /*
867 * We are using persistent grants, the grant is 885 * We are using persistent grants, the grant is
868 * not mapped but we might have room for it. 886 * not mapped but we might have room for it.
@@ -880,7 +898,7 @@ again:
880 persistent_gnt->gnt = map[new_map_idx].ref; 898 persistent_gnt->gnt = map[new_map_idx].ref;
881 persistent_gnt->handle = map[new_map_idx].handle; 899 persistent_gnt->handle = map[new_map_idx].handle;
882 persistent_gnt->page = pages[seg_idx]->page; 900 persistent_gnt->page = pages[seg_idx]->page;
883 if (add_persistent_gnt(blkif, 901 if (add_persistent_gnt(ring,
884 persistent_gnt)) { 902 persistent_gnt)) {
885 kfree(persistent_gnt); 903 kfree(persistent_gnt);
886 persistent_gnt = NULL; 904 persistent_gnt = NULL;
@@ -888,7 +906,7 @@ again:
888 } 906 }
889 pages[seg_idx]->persistent_gnt = persistent_gnt; 907 pages[seg_idx]->persistent_gnt = persistent_gnt;
890 pr_debug("grant %u added to the tree of persistent grants, using %u/%u\n", 908 pr_debug("grant %u added to the tree of persistent grants, using %u/%u\n",
891 persistent_gnt->gnt, blkif->persistent_gnt_c, 909 persistent_gnt->gnt, ring->persistent_gnt_c,
892 xen_blkif_max_pgrants); 910 xen_blkif_max_pgrants);
893 goto next; 911 goto next;
894 } 912 }
@@ -913,7 +931,7 @@ next:
913 931
914out_of_memory: 932out_of_memory:
915 pr_alert("%s: out of memory\n", __func__); 933 pr_alert("%s: out of memory\n", __func__);
916 put_free_pages(blkif, pages_to_gnt, segs_to_map); 934 put_free_pages(ring, pages_to_gnt, segs_to_map);
917 return -ENOMEM; 935 return -ENOMEM;
918} 936}
919 937
@@ -921,7 +939,7 @@ static int xen_blkbk_map_seg(struct pending_req *pending_req)
921{ 939{
922 int rc; 940 int rc;
923 941
924 rc = xen_blkbk_map(pending_req->blkif, pending_req->segments, 942 rc = xen_blkbk_map(pending_req->ring, pending_req->segments,
925 pending_req->nr_segs, 943 pending_req->nr_segs,
926 (pending_req->operation != BLKIF_OP_READ)); 944 (pending_req->operation != BLKIF_OP_READ));
927 945
@@ -934,7 +952,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req,
934 struct phys_req *preq) 952 struct phys_req *preq)
935{ 953{
936 struct grant_page **pages = pending_req->indirect_pages; 954 struct grant_page **pages = pending_req->indirect_pages;
937 struct xen_blkif *blkif = pending_req->blkif; 955 struct xen_blkif_ring *ring = pending_req->ring;
938 int indirect_grefs, rc, n, nseg, i; 956 int indirect_grefs, rc, n, nseg, i;
939 struct blkif_request_segment *segments = NULL; 957 struct blkif_request_segment *segments = NULL;
940 958
@@ -945,7 +963,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req,
945 for (i = 0; i < indirect_grefs; i++) 963 for (i = 0; i < indirect_grefs; i++)
946 pages[i]->gref = req->u.indirect.indirect_grefs[i]; 964 pages[i]->gref = req->u.indirect.indirect_grefs[i];
947 965
948 rc = xen_blkbk_map(blkif, pages, indirect_grefs, true); 966 rc = xen_blkbk_map(ring, pages, indirect_grefs, true);
949 if (rc) 967 if (rc)
950 goto unmap; 968 goto unmap;
951 969
@@ -972,15 +990,16 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req,
972unmap: 990unmap:
973 if (segments) 991 if (segments)
974 kunmap_atomic(segments); 992 kunmap_atomic(segments);
975 xen_blkbk_unmap(blkif, pages, indirect_grefs); 993 xen_blkbk_unmap(ring, pages, indirect_grefs);
976 return rc; 994 return rc;
977} 995}
978 996
979static int dispatch_discard_io(struct xen_blkif *blkif, 997static int dispatch_discard_io(struct xen_blkif_ring *ring,
980 struct blkif_request *req) 998 struct blkif_request *req)
981{ 999{
982 int err = 0; 1000 int err = 0;
983 int status = BLKIF_RSP_OKAY; 1001 int status = BLKIF_RSP_OKAY;
1002 struct xen_blkif *blkif = ring->blkif;
984 struct block_device *bdev = blkif->vbd.bdev; 1003 struct block_device *bdev = blkif->vbd.bdev;
985 unsigned long secure; 1004 unsigned long secure;
986 struct phys_req preq; 1005 struct phys_req preq;
@@ -997,7 +1016,7 @@ static int dispatch_discard_io(struct xen_blkif *blkif,
997 preq.sector_number + preq.nr_sects, blkif->vbd.pdevice); 1016 preq.sector_number + preq.nr_sects, blkif->vbd.pdevice);
998 goto fail_response; 1017 goto fail_response;
999 } 1018 }
1000 blkif->st_ds_req++; 1019 ring->st_ds_req++;
1001 1020
1002 secure = (blkif->vbd.discard_secure && 1021 secure = (blkif->vbd.discard_secure &&
1003 (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ? 1022 (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ?
@@ -1013,26 +1032,28 @@ fail_response:
1013 } else if (err) 1032 } else if (err)
1014 status = BLKIF_RSP_ERROR; 1033 status = BLKIF_RSP_ERROR;
1015 1034
1016 make_response(blkif, req->u.discard.id, req->operation, status); 1035 make_response(ring, req->u.discard.id, req->operation, status);
1017 xen_blkif_put(blkif); 1036 xen_blkif_put(blkif);
1018 return err; 1037 return err;
1019} 1038}
1020 1039
1021static int dispatch_other_io(struct xen_blkif *blkif, 1040static int dispatch_other_io(struct xen_blkif_ring *ring,
1022 struct blkif_request *req, 1041 struct blkif_request *req,
1023 struct pending_req *pending_req) 1042 struct pending_req *pending_req)
1024{ 1043{
1025 free_req(blkif, pending_req); 1044 free_req(ring, pending_req);
1026 make_response(blkif, req->u.other.id, req->operation, 1045 make_response(ring, req->u.other.id, req->operation,
1027 BLKIF_RSP_EOPNOTSUPP); 1046 BLKIF_RSP_EOPNOTSUPP);
1028 return -EIO; 1047 return -EIO;
1029} 1048}
1030 1049
1031static void xen_blk_drain_io(struct xen_blkif *blkif) 1050static void xen_blk_drain_io(struct xen_blkif_ring *ring)
1032{ 1051{
1052 struct xen_blkif *blkif = ring->blkif;
1053
1033 atomic_set(&blkif->drain, 1); 1054 atomic_set(&blkif->drain, 1);
1034 do { 1055 do {
1035 if (atomic_read(&blkif->inflight) == 0) 1056 if (atomic_read(&ring->inflight) == 0)
1036 break; 1057 break;
1037 wait_for_completion_interruptible_timeout( 1058 wait_for_completion_interruptible_timeout(
1038 &blkif->drain_complete, HZ); 1059 &blkif->drain_complete, HZ);
@@ -1053,12 +1074,12 @@ static void __end_block_io_op(struct pending_req *pending_req, int error)
1053 if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) && 1074 if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) &&
1054 (error == -EOPNOTSUPP)) { 1075 (error == -EOPNOTSUPP)) {
1055 pr_debug("flush diskcache op failed, not supported\n"); 1076 pr_debug("flush diskcache op failed, not supported\n");
1056 xen_blkbk_flush_diskcache(XBT_NIL, pending_req->blkif->be, 0); 1077 xen_blkbk_flush_diskcache(XBT_NIL, pending_req->ring->blkif->be, 0);
1057 pending_req->status = BLKIF_RSP_EOPNOTSUPP; 1078 pending_req->status = BLKIF_RSP_EOPNOTSUPP;
1058 } else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) && 1079 } else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
1059 (error == -EOPNOTSUPP)) { 1080 (error == -EOPNOTSUPP)) {
1060 pr_debug("write barrier op failed, not supported\n"); 1081 pr_debug("write barrier op failed, not supported\n");
1061 xen_blkbk_barrier(XBT_NIL, pending_req->blkif->be, 0); 1082 xen_blkbk_barrier(XBT_NIL, pending_req->ring->blkif->be, 0);
1062 pending_req->status = BLKIF_RSP_EOPNOTSUPP; 1083 pending_req->status = BLKIF_RSP_EOPNOTSUPP;
1063 } else if (error) { 1084 } else if (error) {
1064 pr_debug("Buffer not up-to-date at end of operation," 1085 pr_debug("Buffer not up-to-date at end of operation,"
@@ -1092,9 +1113,9 @@ static void end_block_io_op(struct bio *bio)
1092 * and transmute it to the block API to hand it over to the proper block disk. 1113 * and transmute it to the block API to hand it over to the proper block disk.
1093 */ 1114 */
1094static int 1115static int
1095__do_block_io_op(struct xen_blkif *blkif) 1116__do_block_io_op(struct xen_blkif_ring *ring)
1096{ 1117{
1097 union blkif_back_rings *blk_rings = &blkif->blk_rings; 1118 union blkif_back_rings *blk_rings = &ring->blk_rings;
1098 struct blkif_request req; 1119 struct blkif_request req;
1099 struct pending_req *pending_req; 1120 struct pending_req *pending_req;
1100 RING_IDX rc, rp; 1121 RING_IDX rc, rp;
@@ -1107,7 +1128,7 @@ __do_block_io_op(struct xen_blkif *blkif)
1107 if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) { 1128 if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) {
1108 rc = blk_rings->common.rsp_prod_pvt; 1129 rc = blk_rings->common.rsp_prod_pvt;
1109 pr_warn("Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n", 1130 pr_warn("Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n",
1110 rp, rc, rp - rc, blkif->vbd.pdevice); 1131 rp, rc, rp - rc, ring->blkif->vbd.pdevice);
1111 return -EACCES; 1132 return -EACCES;
1112 } 1133 }
1113 while (rc != rp) { 1134 while (rc != rp) {
@@ -1120,14 +1141,14 @@ __do_block_io_op(struct xen_blkif *blkif)
1120 break; 1141 break;
1121 } 1142 }
1122 1143
1123 pending_req = alloc_req(blkif); 1144 pending_req = alloc_req(ring);
1124 if (NULL == pending_req) { 1145 if (NULL == pending_req) {
1125 blkif->st_oo_req++; 1146 ring->st_oo_req++;
1126 more_to_do = 1; 1147 more_to_do = 1;
1127 break; 1148 break;
1128 } 1149 }
1129 1150
1130 switch (blkif->blk_protocol) { 1151 switch (ring->blkif->blk_protocol) {
1131 case BLKIF_PROTOCOL_NATIVE: 1152 case BLKIF_PROTOCOL_NATIVE:
1132 memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req)); 1153 memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
1133 break; 1154 break;
@@ -1151,16 +1172,16 @@ __do_block_io_op(struct xen_blkif *blkif)
1151 case BLKIF_OP_WRITE_BARRIER: 1172 case BLKIF_OP_WRITE_BARRIER:
1152 case BLKIF_OP_FLUSH_DISKCACHE: 1173 case BLKIF_OP_FLUSH_DISKCACHE:
1153 case BLKIF_OP_INDIRECT: 1174 case BLKIF_OP_INDIRECT:
1154 if (dispatch_rw_block_io(blkif, &req, pending_req)) 1175 if (dispatch_rw_block_io(ring, &req, pending_req))
1155 goto done; 1176 goto done;
1156 break; 1177 break;
1157 case BLKIF_OP_DISCARD: 1178 case BLKIF_OP_DISCARD:
1158 free_req(blkif, pending_req); 1179 free_req(ring, pending_req);
1159 if (dispatch_discard_io(blkif, &req)) 1180 if (dispatch_discard_io(ring, &req))
1160 goto done; 1181 goto done;
1161 break; 1182 break;
1162 default: 1183 default:
1163 if (dispatch_other_io(blkif, &req, pending_req)) 1184 if (dispatch_other_io(ring, &req, pending_req))
1164 goto done; 1185 goto done;
1165 break; 1186 break;
1166 } 1187 }
@@ -1173,13 +1194,13 @@ done:
1173} 1194}
1174 1195
1175static int 1196static int
1176do_block_io_op(struct xen_blkif *blkif) 1197do_block_io_op(struct xen_blkif_ring *ring)
1177{ 1198{
1178 union blkif_back_rings *blk_rings = &blkif->blk_rings; 1199 union blkif_back_rings *blk_rings = &ring->blk_rings;
1179 int more_to_do; 1200 int more_to_do;
1180 1201
1181 do { 1202 do {
1182 more_to_do = __do_block_io_op(blkif); 1203 more_to_do = __do_block_io_op(ring);
1183 if (more_to_do) 1204 if (more_to_do)
1184 break; 1205 break;
1185 1206
@@ -1192,7 +1213,7 @@ do_block_io_op(struct xen_blkif *blkif)
1192 * Transmutation of the 'struct blkif_request' to a proper 'struct bio' 1213 * Transmutation of the 'struct blkif_request' to a proper 'struct bio'
1193 * and call the 'submit_bio' to pass it to the underlying storage. 1214 * and call the 'submit_bio' to pass it to the underlying storage.
1194 */ 1215 */
1195static int dispatch_rw_block_io(struct xen_blkif *blkif, 1216static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
1196 struct blkif_request *req, 1217 struct blkif_request *req,
1197 struct pending_req *pending_req) 1218 struct pending_req *pending_req)
1198{ 1219{
@@ -1220,17 +1241,17 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
1220 1241
1221 switch (req_operation) { 1242 switch (req_operation) {
1222 case BLKIF_OP_READ: 1243 case BLKIF_OP_READ:
1223 blkif->st_rd_req++; 1244 ring->st_rd_req++;
1224 operation = READ; 1245 operation = READ;
1225 break; 1246 break;
1226 case BLKIF_OP_WRITE: 1247 case BLKIF_OP_WRITE:
1227 blkif->st_wr_req++; 1248 ring->st_wr_req++;
1228 operation = WRITE_ODIRECT; 1249 operation = WRITE_ODIRECT;
1229 break; 1250 break;
1230 case BLKIF_OP_WRITE_BARRIER: 1251 case BLKIF_OP_WRITE_BARRIER:
1231 drain = true; 1252 drain = true;
1232 case BLKIF_OP_FLUSH_DISKCACHE: 1253 case BLKIF_OP_FLUSH_DISKCACHE:
1233 blkif->st_f_req++; 1254 ring->st_f_req++;
1234 operation = WRITE_FLUSH; 1255 operation = WRITE_FLUSH;
1235 break; 1256 break;
1236 default: 1257 default:
@@ -1255,7 +1276,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
1255 1276
1256 preq.nr_sects = 0; 1277 preq.nr_sects = 0;
1257 1278
1258 pending_req->blkif = blkif; 1279 pending_req->ring = ring;
1259 pending_req->id = req->u.rw.id; 1280 pending_req->id = req->u.rw.id;
1260 pending_req->operation = req_operation; 1281 pending_req->operation = req_operation;
1261 pending_req->status = BLKIF_RSP_OKAY; 1282 pending_req->status = BLKIF_RSP_OKAY;
@@ -1282,12 +1303,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
1282 goto fail_response; 1303 goto fail_response;
1283 } 1304 }
1284 1305
1285 if (xen_vbd_translate(&preq, blkif, operation) != 0) { 1306 if (xen_vbd_translate(&preq, ring->blkif, operation) != 0) {
1286 pr_debug("access denied: %s of [%llu,%llu] on dev=%04x\n", 1307 pr_debug("access denied: %s of [%llu,%llu] on dev=%04x\n",
1287 operation == READ ? "read" : "write", 1308 operation == READ ? "read" : "write",
1288 preq.sector_number, 1309 preq.sector_number,
1289 preq.sector_number + preq.nr_sects, 1310 preq.sector_number + preq.nr_sects,
1290 blkif->vbd.pdevice); 1311 ring->blkif->vbd.pdevice);
1291 goto fail_response; 1312 goto fail_response;
1292 } 1313 }
1293 1314
@@ -1299,7 +1320,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
1299 if (((int)preq.sector_number|(int)seg[i].nsec) & 1320 if (((int)preq.sector_number|(int)seg[i].nsec) &
1300 ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) { 1321 ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
1301 pr_debug("Misaligned I/O request from domain %d\n", 1322 pr_debug("Misaligned I/O request from domain %d\n",
1302 blkif->domid); 1323 ring->blkif->domid);
1303 goto fail_response; 1324 goto fail_response;
1304 } 1325 }
1305 } 1326 }
@@ -1308,7 +1329,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
1308 * issue the WRITE_FLUSH. 1329 * issue the WRITE_FLUSH.
1309 */ 1330 */
1310 if (drain) 1331 if (drain)
1311 xen_blk_drain_io(pending_req->blkif); 1332 xen_blk_drain_io(pending_req->ring);
1312 1333
1313 /* 1334 /*
1314 * If we have failed at this point, we need to undo the M2P override, 1335 * If we have failed at this point, we need to undo the M2P override,
@@ -1323,8 +1344,8 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
1323 * This corresponding xen_blkif_put is done in __end_block_io_op, or 1344 * This corresponding xen_blkif_put is done in __end_block_io_op, or
1324 * below (in "!bio") if we are handling a BLKIF_OP_DISCARD. 1345 * below (in "!bio") if we are handling a BLKIF_OP_DISCARD.
1325 */ 1346 */
1326 xen_blkif_get(blkif); 1347 xen_blkif_get(ring->blkif);
1327 atomic_inc(&blkif->inflight); 1348 atomic_inc(&ring->inflight);
1328 1349
1329 for (i = 0; i < nseg; i++) { 1350 for (i = 0; i < nseg; i++) {
1330 while ((bio == NULL) || 1351 while ((bio == NULL) ||
@@ -1372,19 +1393,19 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
1372 blk_finish_plug(&plug); 1393 blk_finish_plug(&plug);
1373 1394
1374 if (operation == READ) 1395 if (operation == READ)
1375 blkif->st_rd_sect += preq.nr_sects; 1396 ring->st_rd_sect += preq.nr_sects;
1376 else if (operation & WRITE) 1397 else if (operation & WRITE)
1377 blkif->st_wr_sect += preq.nr_sects; 1398 ring->st_wr_sect += preq.nr_sects;
1378 1399
1379 return 0; 1400 return 0;
1380 1401
1381 fail_flush: 1402 fail_flush:
1382 xen_blkbk_unmap(blkif, pending_req->segments, 1403 xen_blkbk_unmap(ring, pending_req->segments,
1383 pending_req->nr_segs); 1404 pending_req->nr_segs);
1384 fail_response: 1405 fail_response:
1385 /* Haven't submitted any bio's yet. */ 1406 /* Haven't submitted any bio's yet. */
1386 make_response(blkif, req->u.rw.id, req_operation, BLKIF_RSP_ERROR); 1407 make_response(ring, req->u.rw.id, req_operation, BLKIF_RSP_ERROR);
1387 free_req(blkif, pending_req); 1408 free_req(ring, pending_req);
1388 msleep(1); /* back off a bit */ 1409 msleep(1); /* back off a bit */
1389 return -EIO; 1410 return -EIO;
1390 1411
@@ -1402,21 +1423,22 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
1402/* 1423/*
1403 * Put a response on the ring on how the operation fared. 1424 * Put a response on the ring on how the operation fared.
1404 */ 1425 */
1405static void make_response(struct xen_blkif *blkif, u64 id, 1426static void make_response(struct xen_blkif_ring *ring, u64 id,
1406 unsigned short op, int st) 1427 unsigned short op, int st)
1407{ 1428{
1408 struct blkif_response resp; 1429 struct blkif_response resp;
1409 unsigned long flags; 1430 unsigned long flags;
1410 union blkif_back_rings *blk_rings = &blkif->blk_rings; 1431 union blkif_back_rings *blk_rings;
1411 int notify; 1432 int notify;
1412 1433
1413 resp.id = id; 1434 resp.id = id;
1414 resp.operation = op; 1435 resp.operation = op;
1415 resp.status = st; 1436 resp.status = st;
1416 1437
1417 spin_lock_irqsave(&blkif->blk_ring_lock, flags); 1438 spin_lock_irqsave(&ring->blk_ring_lock, flags);
1439 blk_rings = &ring->blk_rings;
1418 /* Place on the response ring for the relevant domain. */ 1440 /* Place on the response ring for the relevant domain. */
1419 switch (blkif->blk_protocol) { 1441 switch (ring->blkif->blk_protocol) {
1420 case BLKIF_PROTOCOL_NATIVE: 1442 case BLKIF_PROTOCOL_NATIVE:
1421 memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt), 1443 memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt),
1422 &resp, sizeof(resp)); 1444 &resp, sizeof(resp));
@@ -1434,9 +1456,9 @@ static void make_response(struct xen_blkif *blkif, u64 id,
1434 } 1456 }
1435 blk_rings->common.rsp_prod_pvt++; 1457 blk_rings->common.rsp_prod_pvt++;
1436 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify); 1458 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
1437 spin_unlock_irqrestore(&blkif->blk_ring_lock, flags); 1459 spin_unlock_irqrestore(&ring->blk_ring_lock, flags);
1438 if (notify) 1460 if (notify)
1439 notify_remote_via_irq(blkif->irq); 1461 notify_remote_via_irq(ring->irq);
1440} 1462}
1441 1463
1442static int __init xen_blkif_init(void) 1464static int __init xen_blkif_init(void)
@@ -1452,6 +1474,9 @@ static int __init xen_blkif_init(void)
1452 xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER; 1474 xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
1453 } 1475 }
1454 1476
1477 if (xenblk_max_queues == 0)
1478 xenblk_max_queues = num_online_cpus();
1479
1455 rc = xen_blkif_interface_init(); 1480 rc = xen_blkif_interface_init();
1456 if (rc) 1481 if (rc)
1457 goto failed_init; 1482 goto failed_init;
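
The purge path above (purge_persistent_gnt) now operates per ring but keeps the same sizing rule: free the overflow beyond the persistent-grant limit plus an extra LRU_PERCENT_CLEAN share of that limit, never more than the ring holds, and skip the pass if the result would exceed the grants not currently in use. The standalone sketch below reproduces only that calculation; the limit and percentage are passed in as example inputs (in the driver they come from the max_persistent_grants module parameter and LRU_PERCENT_CLEAN), and the function name is purely illustrative.

#include <stdio.h>

#define EXAMPLE_MAX_PGRANTS 1056   /* example limit, stands in for max_persistent_grants */
#define EXAMPLE_LRU_PERCENT 5      /* example value, stands in for LRU_PERCENT_CLEAN */

static unsigned int min_u(unsigned int a, unsigned int b) { return a < b ? a : b; }

/* How many persistent grants a purge pass would try to remove. */
static unsigned int grants_to_purge(unsigned int gnt_c, unsigned int gnt_in_use)
{
    unsigned int num_clean;

    if (gnt_c < EXAMPLE_MAX_PGRANTS)
        return 0;                               /* below the limit, nothing to purge */

    num_clean = (EXAMPLE_MAX_PGRANTS / 100) * EXAMPLE_LRU_PERCENT;
    num_clean = gnt_c - EXAMPLE_MAX_PGRANTS + num_clean; /* overflow plus slack */
    num_clean = min_u(gnt_c, num_clean);
    if (num_clean == 0 || num_clean > gnt_c - gnt_in_use)
        return 0;                               /* nothing to do, or too few idle grants */
    return num_clean;
}

int main(void)
{
    /* Ring holding 1100 grants, 20 of them currently mapped into requests:
     * slack = (1056/100)*5 = 50, so 1100 - 1056 + 50 = 94 grants get purged. */
    printf("purge %u grants\n", grants_to_purge(1100, 20));
    return 0;
}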
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index 68e87a037b99..b27c5ba15600 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -46,6 +46,7 @@
46#include <xen/interface/io/protocols.h> 46#include <xen/interface/io/protocols.h>
47 47
48extern unsigned int xen_blkif_max_ring_order; 48extern unsigned int xen_blkif_max_ring_order;
49extern unsigned int xenblk_max_queues;
49/* 50/*
50 * This is the maximum number of segments that would be allowed in indirect 51 * This is the maximum number of segments that would be allowed in indirect
51 * requests. This value will also be passed to the frontend. 52 * requests. This value will also be passed to the frontend.
@@ -269,68 +270,79 @@ struct persistent_gnt {
269 struct list_head remove_node; 270 struct list_head remove_node;
270}; 271};
271 272
272struct xen_blkif { 273/* Per-ring information. */
273 /* Unique identifier for this interface. */ 274struct xen_blkif_ring {
274 domid_t domid;
275 unsigned int handle;
276 /* Physical parameters of the comms window. */ 275 /* Physical parameters of the comms window. */
277 unsigned int irq; 276 unsigned int irq;
278 /* Comms information. */
279 enum blkif_protocol blk_protocol;
280 union blkif_back_rings blk_rings; 277 union blkif_back_rings blk_rings;
281 void *blk_ring; 278 void *blk_ring;
282 /* The VBD attached to this interface. */
283 struct xen_vbd vbd;
284 /* Back pointer to the backend_info. */
285 struct backend_info *be;
286 /* Private fields. */ 279 /* Private fields. */
287 spinlock_t blk_ring_lock; 280 spinlock_t blk_ring_lock;
288 atomic_t refcnt;
289 281
290 wait_queue_head_t wq; 282 wait_queue_head_t wq;
291 /* for barrier (drain) requests */
292 struct completion drain_complete;
293 atomic_t drain;
294 atomic_t inflight; 283 atomic_t inflight;
295 /* One thread per one blkif. */ 284 /* One thread per blkif ring. */
296 struct task_struct *xenblkd; 285 struct task_struct *xenblkd;
297 unsigned int waiting_reqs; 286 unsigned int waiting_reqs;
298 287
299 /* tree to store persistent grants */ 288 /* List of all 'pending_req' available */
289 struct list_head pending_free;
290 /* And its spinlock. */
291 spinlock_t pending_free_lock;
292 wait_queue_head_t pending_free_wq;
293
294 /* Tree to store persistent grants. */
295 spinlock_t pers_gnts_lock;
300 struct rb_root persistent_gnts; 296 struct rb_root persistent_gnts;
301 unsigned int persistent_gnt_c; 297 unsigned int persistent_gnt_c;
302 atomic_t persistent_gnt_in_use; 298 atomic_t persistent_gnt_in_use;
303 unsigned long next_lru; 299 unsigned long next_lru;
304 300
305 /* used by the kworker that offload work from the persistent purge */ 301 /* Statistics. */
302 unsigned long st_print;
303 unsigned long long st_rd_req;
304 unsigned long long st_wr_req;
305 unsigned long long st_oo_req;
306 unsigned long long st_f_req;
307 unsigned long long st_ds_req;
308 unsigned long long st_rd_sect;
309 unsigned long long st_wr_sect;
310
311 /* Used by the kworker that offload work from the persistent purge. */
306 struct list_head persistent_purge_list; 312 struct list_head persistent_purge_list;
307 struct work_struct persistent_purge_work; 313 struct work_struct persistent_purge_work;
308 314
309 /* buffer of free pages to map grant refs */ 315 /* Buffer of free pages to map grant refs. */
310 spinlock_t free_pages_lock; 316 spinlock_t free_pages_lock;
311 int free_pages_num; 317 int free_pages_num;
312 struct list_head free_pages; 318 struct list_head free_pages;
313 319
314 /* List of all 'pending_req' available */
315 struct list_head pending_free;
316 /* And its spinlock. */
317 spinlock_t pending_free_lock;
318 wait_queue_head_t pending_free_wq;
319
320 /* statistics */
321 unsigned long st_print;
322 unsigned long long st_rd_req;
323 unsigned long long st_wr_req;
324 unsigned long long st_oo_req;
325 unsigned long long st_f_req;
326 unsigned long long st_ds_req;
327 unsigned long long st_rd_sect;
328 unsigned long long st_wr_sect;
329
330 struct work_struct free_work; 320 struct work_struct free_work;
331 /* Thread shutdown wait queue. */ 321 /* Thread shutdown wait queue. */
332 wait_queue_head_t shutdown_wq; 322 wait_queue_head_t shutdown_wq;
333 unsigned int nr_ring_pages; 323 struct xen_blkif *blkif;
324};
325
326struct xen_blkif {
327 /* Unique identifier for this interface. */
328 domid_t domid;
329 unsigned int handle;
330 /* Comms information. */
331 enum blkif_protocol blk_protocol;
332 /* The VBD attached to this interface. */
333 struct xen_vbd vbd;
334 /* Back pointer to the backend_info. */
335 struct backend_info *be;
336 atomic_t refcnt;
337 /* for barrier (drain) requests */
338 struct completion drain_complete;
339 atomic_t drain;
340
341 struct work_struct free_work;
342 unsigned int nr_ring_pages;
343 /* All rings for this device. */
344 struct xen_blkif_ring *rings;
345 unsigned int nr_rings;
334}; 346};
335 347
336struct seg_buf { 348struct seg_buf {
@@ -352,7 +364,7 @@ struct grant_page {
352 * response queued for it, with the saved 'id' passed back. 364 * response queued for it, with the saved 'id' passed back.
353 */ 365 */
354struct pending_req { 366struct pending_req {
355 struct xen_blkif *blkif; 367 struct xen_blkif_ring *ring;
356 u64 id; 368 u64 id;
357 int nr_segs; 369 int nr_segs;
358 atomic_t pendcnt; 370 atomic_t pendcnt;
@@ -394,7 +406,7 @@ int xen_blkif_xenbus_init(void);
394irqreturn_t xen_blkif_be_int(int irq, void *dev_id); 406irqreturn_t xen_blkif_be_int(int irq, void *dev_id);
395int xen_blkif_schedule(void *arg); 407int xen_blkif_schedule(void *arg);
396int xen_blkif_purge_persistent(void *arg); 408int xen_blkif_purge_persistent(void *arg);
397void xen_blkbk_free_caches(struct xen_blkif *blkif); 409void xen_blkbk_free_caches(struct xen_blkif_ring *ring);
398 410
399int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, 411int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
400 struct backend_info *be, int state); 412 struct backend_info *be, int state);
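A note on the struct split above: all of the st_* I/O counters now live in struct xen_blkif_ring rather than struct xen_blkif, so anything that used to read blkif->st_rd_req and friends has to sum over blkif->rings[0..nr_rings). The sketch below shows that aggregation with trimmed-down stand-in types (demo_ring/demo_blkif are illustrative names, not part of the patch); the VBD_SHOW_ALLRING sysfs macro in xenbus.c further down does the same thing.

#include <stdio.h>

/* Stand-ins for xen_blkif_ring / xen_blkif, reduced to one counter. */
struct demo_ring {
        unsigned long long st_rd_req;
};

struct demo_blkif {
        struct demo_ring *rings;
        unsigned int nr_rings;
};

/* Sum a per-ring counter over every ring of the device. */
static unsigned long long total_rd_req(const struct demo_blkif *blkif)
{
        unsigned long long result = 0;
        unsigned int i;

        for (i = 0; i < blkif->nr_rings; i++)
                result += blkif->rings[i].st_rd_req;
        return result;
}

int main(void)
{
        struct demo_ring rings[2] = { { .st_rd_req = 10 }, { .st_rd_req = 32 } };
        struct demo_blkif blkif = { .rings = rings, .nr_rings = 2 };

        printf("rd_req total: %llu\n", total_rd_req(&blkif));
        return 0;
}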
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index f53cff42f8da..876763f7f13e 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -86,9 +86,11 @@ static void xen_update_blkif_status(struct xen_blkif *blkif)
86{ 86{
87 int err; 87 int err;
88 char name[BLKBACK_NAME_LEN]; 88 char name[BLKBACK_NAME_LEN];
89 struct xen_blkif_ring *ring;
90 int i;
89 91
90 /* Not ready to connect? */ 92 /* Not ready to connect? */
91 if (!blkif->irq || !blkif->vbd.bdev) 93 if (!blkif->rings || !blkif->rings[0].irq || !blkif->vbd.bdev)
92 return; 94 return;
93 95
94 /* Already connected? */ 96 /* Already connected? */
@@ -113,13 +115,55 @@ static void xen_update_blkif_status(struct xen_blkif *blkif)
113 } 115 }
114 invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping); 116 invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping);
115 117
116 blkif->xenblkd = kthread_run(xen_blkif_schedule, blkif, "%s", name); 118 for (i = 0; i < blkif->nr_rings; i++) {
117 if (IS_ERR(blkif->xenblkd)) { 119 ring = &blkif->rings[i];
118 err = PTR_ERR(blkif->xenblkd); 120 ring->xenblkd = kthread_run(xen_blkif_schedule, ring, "%s-%d", name, i);
119 blkif->xenblkd = NULL; 121 if (IS_ERR(ring->xenblkd)) {
120 xenbus_dev_error(blkif->be->dev, err, "start xenblkd"); 122 err = PTR_ERR(ring->xenblkd);
121 return; 123 ring->xenblkd = NULL;
124 xenbus_dev_fatal(blkif->be->dev, err,
125 "start %s-%d xenblkd", name, i);
126 goto out;
127 }
128 }
129 return;
130
131out:
132 while (--i >= 0) {
133 ring = &blkif->rings[i];
134 kthread_stop(ring->xenblkd);
135 }
136 return;
137}
138
139static int xen_blkif_alloc_rings(struct xen_blkif *blkif)
140{
141 unsigned int r;
142
143 blkif->rings = kzalloc(blkif->nr_rings * sizeof(struct xen_blkif_ring), GFP_KERNEL);
144 if (!blkif->rings)
145 return -ENOMEM;
146
147 for (r = 0; r < blkif->nr_rings; r++) {
148 struct xen_blkif_ring *ring = &blkif->rings[r];
149
150 spin_lock_init(&ring->blk_ring_lock);
151 init_waitqueue_head(&ring->wq);
152 INIT_LIST_HEAD(&ring->pending_free);
153 INIT_LIST_HEAD(&ring->persistent_purge_list);
154 INIT_WORK(&ring->persistent_purge_work, xen_blkbk_unmap_purged_grants);
155 spin_lock_init(&ring->free_pages_lock);
156 INIT_LIST_HEAD(&ring->free_pages);
157
158 spin_lock_init(&ring->pending_free_lock);
159 init_waitqueue_head(&ring->pending_free_wq);
160 init_waitqueue_head(&ring->shutdown_wq);
161 ring->blkif = blkif;
162 ring->st_print = jiffies;
163 xen_blkif_get(blkif);
122 } 164 }
165
166 return 0;
123} 167}
124 168
125static struct xen_blkif *xen_blkif_alloc(domid_t domid) 169static struct xen_blkif *xen_blkif_alloc(domid_t domid)
@@ -133,41 +177,25 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
133 return ERR_PTR(-ENOMEM); 177 return ERR_PTR(-ENOMEM);
134 178
135 blkif->domid = domid; 179 blkif->domid = domid;
136 spin_lock_init(&blkif->blk_ring_lock);
137 atomic_set(&blkif->refcnt, 1); 180 atomic_set(&blkif->refcnt, 1);
138 init_waitqueue_head(&blkif->wq);
139 init_completion(&blkif->drain_complete); 181 init_completion(&blkif->drain_complete);
140 atomic_set(&blkif->drain, 0);
141 blkif->st_print = jiffies;
142 blkif->persistent_gnts.rb_node = NULL;
143 spin_lock_init(&blkif->free_pages_lock);
144 INIT_LIST_HEAD(&blkif->free_pages);
145 INIT_LIST_HEAD(&blkif->persistent_purge_list);
146 blkif->free_pages_num = 0;
147 atomic_set(&blkif->persistent_gnt_in_use, 0);
148 atomic_set(&blkif->inflight, 0);
149 INIT_WORK(&blkif->persistent_purge_work, xen_blkbk_unmap_purged_grants);
150
151 INIT_LIST_HEAD(&blkif->pending_free);
152 INIT_WORK(&blkif->free_work, xen_blkif_deferred_free); 182 INIT_WORK(&blkif->free_work, xen_blkif_deferred_free);
153 spin_lock_init(&blkif->pending_free_lock);
154 init_waitqueue_head(&blkif->pending_free_wq);
155 init_waitqueue_head(&blkif->shutdown_wq);
156 183
157 return blkif; 184 return blkif;
158} 185}
159 186
160static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref, 187static int xen_blkif_map(struct xen_blkif_ring *ring, grant_ref_t *gref,
161 unsigned int nr_grefs, unsigned int evtchn) 188 unsigned int nr_grefs, unsigned int evtchn)
162{ 189{
163 int err; 190 int err;
191 struct xen_blkif *blkif = ring->blkif;
164 192
165 /* Already connected through? */ 193 /* Already connected through? */
166 if (blkif->irq) 194 if (ring->irq)
167 return 0; 195 return 0;
168 196
169 err = xenbus_map_ring_valloc(blkif->be->dev, gref, nr_grefs, 197 err = xenbus_map_ring_valloc(blkif->be->dev, gref, nr_grefs,
170 &blkif->blk_ring); 198 &ring->blk_ring);
171 if (err < 0) 199 if (err < 0)
172 return err; 200 return err;
173 201
@@ -175,24 +203,24 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
175 case BLKIF_PROTOCOL_NATIVE: 203 case BLKIF_PROTOCOL_NATIVE:
176 { 204 {
177 struct blkif_sring *sring; 205 struct blkif_sring *sring;
178 sring = (struct blkif_sring *)blkif->blk_ring; 206 sring = (struct blkif_sring *)ring->blk_ring;
179 BACK_RING_INIT(&blkif->blk_rings.native, sring, 207 BACK_RING_INIT(&ring->blk_rings.native, sring,
180 XEN_PAGE_SIZE * nr_grefs); 208 XEN_PAGE_SIZE * nr_grefs);
181 break; 209 break;
182 } 210 }
183 case BLKIF_PROTOCOL_X86_32: 211 case BLKIF_PROTOCOL_X86_32:
184 { 212 {
185 struct blkif_x86_32_sring *sring_x86_32; 213 struct blkif_x86_32_sring *sring_x86_32;
186 sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring; 214 sring_x86_32 = (struct blkif_x86_32_sring *)ring->blk_ring;
187 BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, 215 BACK_RING_INIT(&ring->blk_rings.x86_32, sring_x86_32,
188 XEN_PAGE_SIZE * nr_grefs); 216 XEN_PAGE_SIZE * nr_grefs);
189 break; 217 break;
190 } 218 }
191 case BLKIF_PROTOCOL_X86_64: 219 case BLKIF_PROTOCOL_X86_64:
192 { 220 {
193 struct blkif_x86_64_sring *sring_x86_64; 221 struct blkif_x86_64_sring *sring_x86_64;
194 sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring; 222 sring_x86_64 = (struct blkif_x86_64_sring *)ring->blk_ring;
195 BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, 223 BACK_RING_INIT(&ring->blk_rings.x86_64, sring_x86_64,
196 XEN_PAGE_SIZE * nr_grefs); 224 XEN_PAGE_SIZE * nr_grefs);
197 break; 225 break;
198 } 226 }
@@ -202,13 +230,13 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
202 230
203 err = bind_interdomain_evtchn_to_irqhandler(blkif->domid, evtchn, 231 err = bind_interdomain_evtchn_to_irqhandler(blkif->domid, evtchn,
204 xen_blkif_be_int, 0, 232 xen_blkif_be_int, 0,
205 "blkif-backend", blkif); 233 "blkif-backend", ring);
206 if (err < 0) { 234 if (err < 0) {
207 xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring); 235 xenbus_unmap_ring_vfree(blkif->be->dev, ring->blk_ring);
208 blkif->blk_rings.common.sring = NULL; 236 ring->blk_rings.common.sring = NULL;
209 return err; 237 return err;
210 } 238 }
211 blkif->irq = err; 239 ring->irq = err;
212 240
213 return 0; 241 return 0;
214} 242}
@@ -216,50 +244,69 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
216static int xen_blkif_disconnect(struct xen_blkif *blkif) 244static int xen_blkif_disconnect(struct xen_blkif *blkif)
217{ 245{
218 struct pending_req *req, *n; 246 struct pending_req *req, *n;
219 int i = 0, j; 247 unsigned int j, r;
220 248
221 if (blkif->xenblkd) { 249 for (r = 0; r < blkif->nr_rings; r++) {
222 kthread_stop(blkif->xenblkd); 250 struct xen_blkif_ring *ring = &blkif->rings[r];
223 wake_up(&blkif->shutdown_wq); 251 unsigned int i = 0;
224 blkif->xenblkd = NULL;
225 }
226 252
227 /* The above kthread_stop() guarantees that at this point we 253 if (ring->xenblkd) {
228 * don't have any discard_io or other_io requests. So, checking 254 kthread_stop(ring->xenblkd);
229 * for inflight IO is enough. 255 wake_up(&ring->shutdown_wq);
230 */ 256 ring->xenblkd = NULL;
231 if (atomic_read(&blkif->inflight) > 0) 257 }
232 return -EBUSY;
233 258
234 if (blkif->irq) { 259 /* The above kthread_stop() guarantees that at this point we
235 unbind_from_irqhandler(blkif->irq, blkif); 260 * don't have any discard_io or other_io requests. So, checking
236 blkif->irq = 0; 261 * for inflight IO is enough.
237 } 262 */
263 if (atomic_read(&ring->inflight) > 0)
264 return -EBUSY;
238 265
239 if (blkif->blk_rings.common.sring) { 266 if (ring->irq) {
240 xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring); 267 unbind_from_irqhandler(ring->irq, ring);
241 blkif->blk_rings.common.sring = NULL; 268 ring->irq = 0;
242 } 269 }
243 270
244 /* Remove all persistent grants and the cache of ballooned pages. */ 271 if (ring->blk_rings.common.sring) {
245 xen_blkbk_free_caches(blkif); 272 xenbus_unmap_ring_vfree(blkif->be->dev, ring->blk_ring);
273 ring->blk_rings.common.sring = NULL;
274 }
246 275
247 /* Check that there is no request in use */ 276 /* Remove all persistent grants and the cache of ballooned pages. */
248 list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) { 277 xen_blkbk_free_caches(ring);
249 list_del(&req->free_list);
250 278
251 for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) 279 /* Check that there is no request in use */
252 kfree(req->segments[j]); 280 list_for_each_entry_safe(req, n, &ring->pending_free, free_list) {
281 list_del(&req->free_list);
253 282
254 for (j = 0; j < MAX_INDIRECT_PAGES; j++) 283 for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++)
255 kfree(req->indirect_pages[j]); 284 kfree(req->segments[j]);
256 285
257 kfree(req); 286 for (j = 0; j < MAX_INDIRECT_PAGES; j++)
258 i++; 287 kfree(req->indirect_pages[j]);
259 } 288
289 kfree(req);
290 i++;
291 }
260 292
261 WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages)); 293 BUG_ON(atomic_read(&ring->persistent_gnt_in_use) != 0);
294 BUG_ON(!list_empty(&ring->persistent_purge_list));
295 BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts));
296 BUG_ON(!list_empty(&ring->free_pages));
297 BUG_ON(ring->free_pages_num != 0);
298 BUG_ON(ring->persistent_gnt_c != 0);
299 WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages));
300 xen_blkif_put(blkif);
301 }
262 blkif->nr_ring_pages = 0; 302 blkif->nr_ring_pages = 0;
303 /*
304 * blkif->rings was allocated in connect_ring, so we should free it in
305 * here.
306 */
307 kfree(blkif->rings);
308 blkif->rings = NULL;
309 blkif->nr_rings = 0;
263 310
264 return 0; 311 return 0;
265} 312}
@@ -271,13 +318,6 @@ static void xen_blkif_free(struct xen_blkif *blkif)
271 xen_vbd_free(&blkif->vbd); 318 xen_vbd_free(&blkif->vbd);
272 319
273 /* Make sure everything is drained before shutting down */ 320 /* Make sure everything is drained before shutting down */
274 BUG_ON(blkif->persistent_gnt_c != 0);
275 BUG_ON(atomic_read(&blkif->persistent_gnt_in_use) != 0);
276 BUG_ON(blkif->free_pages_num != 0);
277 BUG_ON(!list_empty(&blkif->persistent_purge_list));
278 BUG_ON(!list_empty(&blkif->free_pages));
279 BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts));
280
281 kmem_cache_free(xen_blkif_cachep, blkif); 321 kmem_cache_free(xen_blkif_cachep, blkif);
282} 322}
283 323
@@ -296,25 +336,38 @@ int __init xen_blkif_interface_init(void)
296 * sysfs interface for VBD I/O requests 336 * sysfs interface for VBD I/O requests
297 */ 337 */
298 338
299#define VBD_SHOW(name, format, args...) \ 339#define VBD_SHOW_ALLRING(name, format) \
300 static ssize_t show_##name(struct device *_dev, \ 340 static ssize_t show_##name(struct device *_dev, \
301 struct device_attribute *attr, \ 341 struct device_attribute *attr, \
302 char *buf) \ 342 char *buf) \
303 { \ 343 { \
304 struct xenbus_device *dev = to_xenbus_device(_dev); \ 344 struct xenbus_device *dev = to_xenbus_device(_dev); \
305 struct backend_info *be = dev_get_drvdata(&dev->dev); \ 345 struct backend_info *be = dev_get_drvdata(&dev->dev); \
346 struct xen_blkif *blkif = be->blkif; \
347 unsigned int i; \
348 unsigned long long result = 0; \
306 \ 349 \
307 return sprintf(buf, format, ##args); \ 350 if (!blkif->rings) \
351 goto out; \
352 \
353 for (i = 0; i < blkif->nr_rings; i++) { \
354 struct xen_blkif_ring *ring = &blkif->rings[i]; \
355 \
356 result += ring->st_##name; \
357 } \
358 \
359out: \
360 return sprintf(buf, format, result); \
308 } \ 361 } \
309 static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) 362 static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
310 363
311VBD_SHOW(oo_req, "%llu\n", be->blkif->st_oo_req); 364VBD_SHOW_ALLRING(oo_req, "%llu\n");
312VBD_SHOW(rd_req, "%llu\n", be->blkif->st_rd_req); 365VBD_SHOW_ALLRING(rd_req, "%llu\n");
313VBD_SHOW(wr_req, "%llu\n", be->blkif->st_wr_req); 366VBD_SHOW_ALLRING(wr_req, "%llu\n");
314VBD_SHOW(f_req, "%llu\n", be->blkif->st_f_req); 367VBD_SHOW_ALLRING(f_req, "%llu\n");
315VBD_SHOW(ds_req, "%llu\n", be->blkif->st_ds_req); 368VBD_SHOW_ALLRING(ds_req, "%llu\n");
316VBD_SHOW(rd_sect, "%llu\n", be->blkif->st_rd_sect); 369VBD_SHOW_ALLRING(rd_sect, "%llu\n");
317VBD_SHOW(wr_sect, "%llu\n", be->blkif->st_wr_sect); 370VBD_SHOW_ALLRING(wr_sect, "%llu\n");
318 371
319static struct attribute *xen_vbdstat_attrs[] = { 372static struct attribute *xen_vbdstat_attrs[] = {
320 &dev_attr_oo_req.attr, 373 &dev_attr_oo_req.attr,
@@ -332,6 +385,18 @@ static struct attribute_group xen_vbdstat_group = {
332 .attrs = xen_vbdstat_attrs, 385 .attrs = xen_vbdstat_attrs,
333}; 386};
334 387
388#define VBD_SHOW(name, format, args...) \
389 static ssize_t show_##name(struct device *_dev, \
390 struct device_attribute *attr, \
391 char *buf) \
392 { \
393 struct xenbus_device *dev = to_xenbus_device(_dev); \
394 struct backend_info *be = dev_get_drvdata(&dev->dev); \
395 \
396 return sprintf(buf, format, ##args); \
397 } \
398 static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
399
335VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor); 400VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor);
336VBD_SHOW(mode, "%s\n", be->mode); 401VBD_SHOW(mode, "%s\n", be->mode);
337 402
@@ -440,11 +505,11 @@ static int xen_blkbk_remove(struct xenbus_device *dev)
440 505
441 dev_set_drvdata(&dev->dev, NULL); 506 dev_set_drvdata(&dev->dev, NULL);
442 507
443 if (be->blkif) { 508 if (be->blkif)
444 xen_blkif_disconnect(be->blkif); 509 xen_blkif_disconnect(be->blkif);
445 xen_blkif_put(be->blkif);
446 }
447 510
511 /* Put the reference we set in xen_blkif_alloc(). */
512 xen_blkif_put(be->blkif);
448 kfree(be->mode); 513 kfree(be->mode);
449 kfree(be); 514 kfree(be);
450 return 0; 515 return 0;
@@ -553,6 +618,12 @@ static int xen_blkbk_probe(struct xenbus_device *dev,
553 goto fail; 618 goto fail;
554 } 619 }
555 620
 621 /* Multi-queue: advertise how many queues we support. */
622 err = xenbus_printf(XBT_NIL, dev->nodename,
623 "multi-queue-max-queues", "%u", xenblk_max_queues);
624 if (err)
625 pr_warn("Error writing multi-queue-max-queues\n");
626
556 /* setup back pointer */ 627 /* setup back pointer */
557 be->blkif->be = be; 628 be->blkif->be = be;
558 629
@@ -708,8 +779,14 @@ static void frontend_changed(struct xenbus_device *dev,
708 } 779 }
709 780
710 err = connect_ring(be); 781 err = connect_ring(be);
711 if (err) 782 if (err) {
783 /*
784 * Clean up so that memory resources can be used by
 785 * other devices. connect_ring already reported the error.
786 */
787 xen_blkif_disconnect(be->blkif);
712 break; 788 break;
789 }
713 xen_update_blkif_status(be->blkif); 790 xen_update_blkif_status(be->blkif);
714 break; 791 break;
715 792
@@ -825,50 +902,43 @@ again:
825 xenbus_transaction_end(xbt, 1); 902 xenbus_transaction_end(xbt, 1);
826} 903}
827 904
828 905/*
 829static int connect_ring(struct backend_info *be) 906 * Each ring may have multiple pages, depending on "ring-page-order".
907 */
908static int read_per_ring_refs(struct xen_blkif_ring *ring, const char *dir)
830{ 909{
831 struct xenbus_device *dev = be->dev;
832 unsigned int ring_ref[XENBUS_MAX_RING_GRANTS]; 910 unsigned int ring_ref[XENBUS_MAX_RING_GRANTS];
833 unsigned int evtchn, nr_grefs, ring_page_order;
834 unsigned int pers_grants;
835 char protocol[64] = "";
836 struct pending_req *req, *n; 911 struct pending_req *req, *n;
837 int err, i, j; 912 int err, i, j;
913 struct xen_blkif *blkif = ring->blkif;
914 struct xenbus_device *dev = blkif->be->dev;
915 unsigned int ring_page_order, nr_grefs, evtchn;
838 916
839 pr_debug("%s %s\n", __func__, dev->otherend); 917 err = xenbus_scanf(XBT_NIL, dir, "event-channel", "%u",
840
841 err = xenbus_scanf(XBT_NIL, dev->otherend, "event-channel", "%u",
842 &evtchn); 918 &evtchn);
843 if (err != 1) { 919 if (err != 1) {
844 err = -EINVAL; 920 err = -EINVAL;
845 xenbus_dev_fatal(dev, err, "reading %s/event-channel", 921 xenbus_dev_fatal(dev, err, "reading %s/event-channel", dir);
846 dev->otherend);
847 return err; 922 return err;
848 } 923 }
849 pr_info("event-channel %u\n", evtchn);
850 924
851 err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-page-order", "%u", 925 err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-page-order", "%u",
852 &ring_page_order); 926 &ring_page_order);
853 if (err != 1) { 927 if (err != 1) {
854 err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-ref", 928 err = xenbus_scanf(XBT_NIL, dir, "ring-ref", "%u", &ring_ref[0]);
855 "%u", &ring_ref[0]);
856 if (err != 1) { 929 if (err != 1) {
857 err = -EINVAL; 930 err = -EINVAL;
858 xenbus_dev_fatal(dev, err, "reading %s/ring-ref", 931 xenbus_dev_fatal(dev, err, "reading %s/ring-ref", dir);
859 dev->otherend);
860 return err; 932 return err;
861 } 933 }
862 nr_grefs = 1; 934 nr_grefs = 1;
863 pr_info("%s:using single page: ring-ref %d\n", dev->otherend,
864 ring_ref[0]);
865 } else { 935 } else {
866 unsigned int i; 936 unsigned int i;
867 937
868 if (ring_page_order > xen_blkif_max_ring_order) { 938 if (ring_page_order > xen_blkif_max_ring_order) {
869 err = -EINVAL; 939 err = -EINVAL;
 870 xenbus_dev_fatal(dev, err, "%s/request %d ring page order exceeds max:%d", 940 xenbus_dev_fatal(dev, err, "%s/request %d ring page order exceeds max:%d",
871 dev->otherend, ring_page_order, 941 dir, ring_page_order,
872 xen_blkif_max_ring_order); 942 xen_blkif_max_ring_order);
873 return err; 943 return err;
874 } 944 }
@@ -878,52 +948,23 @@ static int connect_ring(struct backend_info *be)
878 char ring_ref_name[RINGREF_NAME_LEN]; 948 char ring_ref_name[RINGREF_NAME_LEN];
879 949
880 snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); 950 snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
881 err = xenbus_scanf(XBT_NIL, dev->otherend, ring_ref_name, 951 err = xenbus_scanf(XBT_NIL, dir, ring_ref_name,
882 "%u", &ring_ref[i]); 952 "%u", &ring_ref[i]);
883 if (err != 1) { 953 if (err != 1) {
884 err = -EINVAL; 954 err = -EINVAL;
885 xenbus_dev_fatal(dev, err, "reading %s/%s", 955 xenbus_dev_fatal(dev, err, "reading %s/%s",
886 dev->otherend, ring_ref_name); 956 dir, ring_ref_name);
887 return err; 957 return err;
888 } 958 }
889 pr_info("ring-ref%u: %u\n", i, ring_ref[i]);
890 } 959 }
891 } 960 }
892 961 blkif->nr_ring_pages = nr_grefs;
893 be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
894 err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
895 "%63s", protocol, NULL);
896 if (err)
897 strcpy(protocol, "unspecified, assuming default");
898 else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
899 be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
900 else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
901 be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
902 else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
903 be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
904 else {
905 xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
906 return -1;
907 }
908 err = xenbus_gather(XBT_NIL, dev->otherend,
909 "feature-persistent", "%u",
910 &pers_grants, NULL);
911 if (err)
912 pers_grants = 0;
913
914 be->blkif->vbd.feature_gnt_persistent = pers_grants;
915 be->blkif->vbd.overflow_max_grants = 0;
916 be->blkif->nr_ring_pages = nr_grefs;
917
918 pr_info("ring-pages:%d, event-channel %d, protocol %d (%s) %s\n",
919 nr_grefs, evtchn, be->blkif->blk_protocol, protocol,
920 pers_grants ? "persistent grants" : "");
921 962
922 for (i = 0; i < nr_grefs * XEN_BLKIF_REQS_PER_PAGE; i++) { 963 for (i = 0; i < nr_grefs * XEN_BLKIF_REQS_PER_PAGE; i++) {
923 req = kzalloc(sizeof(*req), GFP_KERNEL); 964 req = kzalloc(sizeof(*req), GFP_KERNEL);
924 if (!req) 965 if (!req)
925 goto fail; 966 goto fail;
926 list_add_tail(&req->free_list, &be->blkif->pending_free); 967 list_add_tail(&req->free_list, &ring->pending_free);
927 for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { 968 for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
928 req->segments[j] = kzalloc(sizeof(*req->segments[0]), GFP_KERNEL); 969 req->segments[j] = kzalloc(sizeof(*req->segments[0]), GFP_KERNEL);
929 if (!req->segments[j]) 970 if (!req->segments[j])
@@ -938,7 +979,7 @@ static int connect_ring(struct backend_info *be)
938 } 979 }
939 980
940 /* Map the shared frame, irq etc. */ 981 /* Map the shared frame, irq etc. */
941 err = xen_blkif_map(be->blkif, ring_ref, nr_grefs, evtchn); 982 err = xen_blkif_map(ring, ring_ref, nr_grefs, evtchn);
942 if (err) { 983 if (err) {
943 xenbus_dev_fatal(dev, err, "mapping ring-ref port %u", evtchn); 984 xenbus_dev_fatal(dev, err, "mapping ring-ref port %u", evtchn);
944 return err; 985 return err;
@@ -947,7 +988,7 @@ static int connect_ring(struct backend_info *be)
947 return 0; 988 return 0;
948 989
949fail: 990fail:
950 list_for_each_entry_safe(req, n, &be->blkif->pending_free, free_list) { 991 list_for_each_entry_safe(req, n, &ring->pending_free, free_list) {
951 list_del(&req->free_list); 992 list_del(&req->free_list);
952 for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { 993 for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
953 if (!req->segments[j]) 994 if (!req->segments[j])
@@ -962,6 +1003,93 @@ fail:
962 kfree(req); 1003 kfree(req);
963 } 1004 }
964 return -ENOMEM; 1005 return -ENOMEM;
1006
1007}
1008
1009static int connect_ring(struct backend_info *be)
1010{
1011 struct xenbus_device *dev = be->dev;
1012 unsigned int pers_grants;
1013 char protocol[64] = "";
1014 int err, i;
1015 char *xspath;
1016 size_t xspathsize;
1017 const size_t xenstore_path_ext_size = 11; /* sufficient for "/queue-NNN" */
1018 unsigned int requested_num_queues = 0;
1019
1020 pr_debug("%s %s\n", __func__, dev->otherend);
1021
1022 be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
1023 err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
1024 "%63s", protocol, NULL);
1025 if (err)
1026 strcpy(protocol, "unspecified, assuming default");
1027 else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
1028 be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
1029 else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
1030 be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
1031 else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
1032 be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
1033 else {
1034 xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
1035 return -ENOSYS;
1036 }
1037 err = xenbus_gather(XBT_NIL, dev->otherend,
1038 "feature-persistent", "%u",
1039 &pers_grants, NULL);
1040 if (err)
1041 pers_grants = 0;
1042
1043 be->blkif->vbd.feature_gnt_persistent = pers_grants;
1044 be->blkif->vbd.overflow_max_grants = 0;
1045
1046 /*
 1047 * Read the number of hardware queues from the frontend.
1048 */
1049 err = xenbus_scanf(XBT_NIL, dev->otherend, "multi-queue-num-queues",
1050 "%u", &requested_num_queues);
1051 if (err < 0) {
1052 requested_num_queues = 1;
1053 } else {
1054 if (requested_num_queues > xenblk_max_queues
1055 || requested_num_queues == 0) {
1056 /* Buggy or malicious guest. */
1057 xenbus_dev_fatal(dev, err,
1058 "guest requested %u queues, exceeding the maximum of %u.",
1059 requested_num_queues, xenblk_max_queues);
1060 return -ENOSYS;
1061 }
1062 }
1063 be->blkif->nr_rings = requested_num_queues;
1064 if (xen_blkif_alloc_rings(be->blkif))
1065 return -ENOMEM;
1066
1067 pr_info("%s: using %d queues, protocol %d (%s) %s\n", dev->nodename,
1068 be->blkif->nr_rings, be->blkif->blk_protocol, protocol,
1069 pers_grants ? "persistent grants" : "");
1070
1071 if (be->blkif->nr_rings == 1)
1072 return read_per_ring_refs(&be->blkif->rings[0], dev->otherend);
1073 else {
1074 xspathsize = strlen(dev->otherend) + xenstore_path_ext_size;
1075 xspath = kmalloc(xspathsize, GFP_KERNEL);
1076 if (!xspath) {
1077 xenbus_dev_fatal(dev, -ENOMEM, "reading ring references");
1078 return -ENOMEM;
1079 }
1080
1081 for (i = 0; i < be->blkif->nr_rings; i++) {
1082 memset(xspath, 0, xspathsize);
1083 snprintf(xspath, xspathsize, "%s/queue-%u", dev->otherend, i);
1084 err = read_per_ring_refs(&be->blkif->rings[i], xspath);
1085 if (err) {
1086 kfree(xspath);
1087 return err;
1088 }
1089 }
1090 kfree(xspath);
1091 }
1092 return 0;
965} 1093}
966 1094
967static const struct xenbus_device_id xen_blkbk_ids[] = { 1095static const struct xenbus_device_id xen_blkbk_ids[] = {
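Before moving on to the frontend: the queue negotiation in the xenbus code above boils down to a small xenstore layout. The backend advertises multi-queue-max-queues, the frontend replies with multi-queue-num-queues, and when more than one queue is agreed on, each ring's ring-ref%u and event-channel nodes sit under a per-queue "queue-%u" directory built with the same "%s/queue-%u" format connect_ring() uses. The sketch below only illustrates that path construction; the otherend string is a made-up stand-in for dev->otherend.

#include <stdio.h>

int main(void)
{
        /* Hypothetical frontend path; the real value comes from dev->otherend. */
        const char *otherend = "/local/domain/1/device/vbd/51712";
        char xspath[64];
        unsigned int q;

        for (q = 0; q < 2; q++) {
                /* Same "%s/queue-%u" layout read_per_ring_refs() is handed. */
                snprintf(xspath, sizeof(xspath), "%s/queue-%u", otherend, q);
                printf("%s/ring-ref0 and %s/event-channel\n", xspath, xspath);
        }
        return 0;
}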
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 2fee2eef988d..8a8dc91c39f7 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -60,6 +60,20 @@
60 60
61#include <asm/xen/hypervisor.h> 61#include <asm/xen/hypervisor.h>
62 62
63/*
 64 * The minimal segment size supported by the block framework is PAGE_SIZE.
65 * When Linux is using a different page size than Xen, it may not be possible
66 * to put all the data in a single segment.
 67 * This can happen when the backend doesn't support indirect descriptors and
68 * therefore the maximum amount of data that a request can carry is
69 * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE = 44KB
70 *
71 * Note that we only support one extra request. So the Linux page size
 72 * should be <= (2 * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) =
73 * 88KB.
74 */
75#define HAS_EXTRA_REQ (BLKIF_MAX_SEGMENTS_PER_REQUEST < XEN_PFN_PER_PAGE)
76
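To make the 44KB/88KB figures in the comment above concrete: taking BLKIF_MAX_SEGMENTS_PER_REQUEST as 11 (its value in the blkif interface) and XEN_PAGE_SIZE as 4KB, one request carries 11 * 4KB = 44KB, and the single permitted extra request doubles that to 88KB. A throwaway check of the arithmetic, with local stand-in macros so it builds outside the kernel:

#include <stdio.h>

#define DEMO_MAX_SEGS      11u          /* assumed BLKIF_MAX_SEGMENTS_PER_REQUEST */
#define DEMO_XEN_PAGE_SIZE 4096u        /* assumed XEN_PAGE_SIZE */

int main(void)
{
        unsigned int per_req    = DEMO_MAX_SEGS * DEMO_XEN_PAGE_SIZE;   /* 45056 bytes */
        unsigned int with_extra = 2 * per_req;                          /* 90112 bytes */

        printf("single request: %u KB\n", per_req / 1024);      /* 44 */
        printf("with extra req: %u KB\n", with_extra / 1024);   /* 88 */
        return 0;
}

On a 4KB-page kernel XEN_PFN_PER_PAGE is 1, so HAS_EXTRA_REQ evaluates to false and the extra-request path is never used; with 64KB pages it is 16 and the path is compiled in.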
63enum blkif_state { 77enum blkif_state {
64 BLKIF_STATE_DISCONNECTED, 78 BLKIF_STATE_DISCONNECTED,
65 BLKIF_STATE_CONNECTED, 79 BLKIF_STATE_CONNECTED,
@@ -72,6 +86,13 @@ struct grant {
72 struct list_head node; 86 struct list_head node;
73}; 87};
74 88
89enum blk_req_status {
90 REQ_WAITING,
91 REQ_DONE,
92 REQ_ERROR,
93 REQ_EOPNOTSUPP,
94};
95
75struct blk_shadow { 96struct blk_shadow {
76 struct blkif_request req; 97 struct blkif_request req;
77 struct request *request; 98 struct request *request;
@@ -79,6 +100,14 @@ struct blk_shadow {
79 struct grant **indirect_grants; 100 struct grant **indirect_grants;
80 struct scatterlist *sg; 101 struct scatterlist *sg;
81 unsigned int num_sg; 102 unsigned int num_sg;
103 enum blk_req_status status;
104
105 #define NO_ASSOCIATED_ID ~0UL
106 /*
107 * Id of the sibling if we ever need 2 requests when handling a
108 * block I/O request
109 */
110 unsigned long associated_id;
82}; 111};
83 112
84struct split_bio { 113struct split_bio {
@@ -99,6 +128,10 @@ static unsigned int xen_blkif_max_segments = 32;
99module_param_named(max, xen_blkif_max_segments, int, S_IRUGO); 128module_param_named(max, xen_blkif_max_segments, int, S_IRUGO);
100MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)"); 129MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)");
101 130
131static unsigned int xen_blkif_max_queues = 4;
132module_param_named(max_queues, xen_blkif_max_queues, uint, S_IRUGO);
133MODULE_PARM_DESC(max_queues, "Maximum number of hardware queues/rings used per virtual disk");
134
102/* 135/*
103 * Maximum order of pages to be used for the shared ring between front and 136 * Maximum order of pages to be used for the shared ring between front and
104 * backend, 4KB page granularity is used. 137 * backend, 4KB page granularity is used.
@@ -114,10 +147,35 @@ MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the
114 __CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * XENBUS_MAX_RING_GRANTS) 147 __CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * XENBUS_MAX_RING_GRANTS)
115 148
116/* 149/*
117 * ring-ref%i i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19 150 * ring-ref%u i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19
118 * characters are enough. Define to 20 to keep consist with backend. 151 * characters are enough. Define to 20 to keep consistent with backend.
119 */ 152 */
120#define RINGREF_NAME_LEN (20) 153#define RINGREF_NAME_LEN (20)
154/*
155 * queue-%u would take 7 + 10(UINT_MAX) = 17 characters.
156 */
157#define QUEUE_NAME_LEN (17)
158
159/*
160 * Per-ring info.
 161 * Every blkfront device can be associated with one or more blkfront_ring_info,
 162 * depending on how many hardware queues/rings are used.
163 */
164struct blkfront_ring_info {
165 /* Lock to protect data in every ring buffer. */
166 spinlock_t ring_lock;
167 struct blkif_front_ring ring;
168 unsigned int ring_ref[XENBUS_MAX_RING_GRANTS];
169 unsigned int evtchn, irq;
170 struct work_struct work;
171 struct gnttab_free_callback callback;
172 struct blk_shadow shadow[BLK_MAX_RING_SIZE];
173 struct list_head indirect_pages;
174 struct list_head grants;
175 unsigned int persistent_gnts_c;
176 unsigned long shadow_free;
177 struct blkfront_info *dev_info;
178};
121 179
122/* 180/*
123 * We have one of these per vbd, whether ide, scsi or 'other'. They 181 * We have one of these per vbd, whether ide, scsi or 'other'. They
@@ -126,25 +184,15 @@ MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the
126 */ 184 */
127struct blkfront_info 185struct blkfront_info
128{ 186{
129 spinlock_t io_lock;
130 struct mutex mutex; 187 struct mutex mutex;
131 struct xenbus_device *xbdev; 188 struct xenbus_device *xbdev;
132 struct gendisk *gd; 189 struct gendisk *gd;
133 int vdevice; 190 int vdevice;
134 blkif_vdev_t handle; 191 blkif_vdev_t handle;
135 enum blkif_state connected; 192 enum blkif_state connected;
136 int ring_ref[XENBUS_MAX_RING_GRANTS]; 193 /* Number of pages per ring buffer. */
137 unsigned int nr_ring_pages; 194 unsigned int nr_ring_pages;
138 struct blkif_front_ring ring;
139 unsigned int evtchn, irq;
140 struct request_queue *rq; 195 struct request_queue *rq;
141 struct work_struct work;
142 struct gnttab_free_callback callback;
143 struct blk_shadow shadow[BLK_MAX_RING_SIZE];
144 struct list_head grants;
145 struct list_head indirect_pages;
146 unsigned int persistent_gnts_c;
147 unsigned long shadow_free;
148 unsigned int feature_flush; 196 unsigned int feature_flush;
149 unsigned int feature_discard:1; 197 unsigned int feature_discard:1;
150 unsigned int feature_secdiscard:1; 198 unsigned int feature_secdiscard:1;
@@ -155,6 +203,8 @@ struct blkfront_info
155 unsigned int max_indirect_segments; 203 unsigned int max_indirect_segments;
156 int is_ready; 204 int is_ready;
157 struct blk_mq_tag_set tag_set; 205 struct blk_mq_tag_set tag_set;
206 struct blkfront_ring_info *rinfo;
207 unsigned int nr_rings;
158}; 208};
159 209
160static unsigned int nr_minors; 210static unsigned int nr_minors;
@@ -198,38 +248,40 @@ static DEFINE_SPINLOCK(minor_lock);
198 248
199#define GREFS(_psegs) ((_psegs) * GRANTS_PER_PSEG) 249#define GREFS(_psegs) ((_psegs) * GRANTS_PER_PSEG)
200 250
201static int blkfront_setup_indirect(struct blkfront_info *info); 251static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo);
202static int blkfront_gather_backend_features(struct blkfront_info *info); 252static void blkfront_gather_backend_features(struct blkfront_info *info);
203 253
204static int get_id_from_freelist(struct blkfront_info *info) 254static int get_id_from_freelist(struct blkfront_ring_info *rinfo)
205{ 255{
206 unsigned long free = info->shadow_free; 256 unsigned long free = rinfo->shadow_free;
207 BUG_ON(free >= BLK_RING_SIZE(info)); 257
208 info->shadow_free = info->shadow[free].req.u.rw.id; 258 BUG_ON(free >= BLK_RING_SIZE(rinfo->dev_info));
209 info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */ 259 rinfo->shadow_free = rinfo->shadow[free].req.u.rw.id;
260 rinfo->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */
210 return free; 261 return free;
211} 262}
212 263
213static int add_id_to_freelist(struct blkfront_info *info, 264static int add_id_to_freelist(struct blkfront_ring_info *rinfo,
214 unsigned long id) 265 unsigned long id)
215{ 266{
216 if (info->shadow[id].req.u.rw.id != id) 267 if (rinfo->shadow[id].req.u.rw.id != id)
217 return -EINVAL; 268 return -EINVAL;
218 if (info->shadow[id].request == NULL) 269 if (rinfo->shadow[id].request == NULL)
219 return -EINVAL; 270 return -EINVAL;
220 info->shadow[id].req.u.rw.id = info->shadow_free; 271 rinfo->shadow[id].req.u.rw.id = rinfo->shadow_free;
221 info->shadow[id].request = NULL; 272 rinfo->shadow[id].request = NULL;
222 info->shadow_free = id; 273 rinfo->shadow_free = id;
223 return 0; 274 return 0;
224} 275}
225 276
226static int fill_grant_buffer(struct blkfront_info *info, int num) 277static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num)
227{ 278{
279 struct blkfront_info *info = rinfo->dev_info;
228 struct page *granted_page; 280 struct page *granted_page;
229 struct grant *gnt_list_entry, *n; 281 struct grant *gnt_list_entry, *n;
230 int i = 0; 282 int i = 0;
231 283
232 while(i < num) { 284 while (i < num) {
233 gnt_list_entry = kzalloc(sizeof(struct grant), GFP_NOIO); 285 gnt_list_entry = kzalloc(sizeof(struct grant), GFP_NOIO);
234 if (!gnt_list_entry) 286 if (!gnt_list_entry)
235 goto out_of_memory; 287 goto out_of_memory;
@@ -244,7 +296,7 @@ static int fill_grant_buffer(struct blkfront_info *info, int num)
244 } 296 }
245 297
246 gnt_list_entry->gref = GRANT_INVALID_REF; 298 gnt_list_entry->gref = GRANT_INVALID_REF;
247 list_add(&gnt_list_entry->node, &info->grants); 299 list_add(&gnt_list_entry->node, &rinfo->grants);
248 i++; 300 i++;
249 } 301 }
250 302
@@ -252,7 +304,7 @@ static int fill_grant_buffer(struct blkfront_info *info, int num)
252 304
253out_of_memory: 305out_of_memory:
254 list_for_each_entry_safe(gnt_list_entry, n, 306 list_for_each_entry_safe(gnt_list_entry, n,
255 &info->grants, node) { 307 &rinfo->grants, node) {
256 list_del(&gnt_list_entry->node); 308 list_del(&gnt_list_entry->node);
257 if (info->feature_persistent) 309 if (info->feature_persistent)
258 __free_page(gnt_list_entry->page); 310 __free_page(gnt_list_entry->page);
@@ -263,17 +315,17 @@ out_of_memory:
263 return -ENOMEM; 315 return -ENOMEM;
264} 316}
265 317
266static struct grant *get_free_grant(struct blkfront_info *info) 318static struct grant *get_free_grant(struct blkfront_ring_info *rinfo)
267{ 319{
268 struct grant *gnt_list_entry; 320 struct grant *gnt_list_entry;
269 321
270 BUG_ON(list_empty(&info->grants)); 322 BUG_ON(list_empty(&rinfo->grants));
271 gnt_list_entry = list_first_entry(&info->grants, struct grant, 323 gnt_list_entry = list_first_entry(&rinfo->grants, struct grant,
272 node); 324 node);
273 list_del(&gnt_list_entry->node); 325 list_del(&gnt_list_entry->node);
274 326
275 if (gnt_list_entry->gref != GRANT_INVALID_REF) 327 if (gnt_list_entry->gref != GRANT_INVALID_REF)
276 info->persistent_gnts_c--; 328 rinfo->persistent_gnts_c--;
277 329
278 return gnt_list_entry; 330 return gnt_list_entry;
279} 331}
@@ -289,9 +341,10 @@ static inline void grant_foreign_access(const struct grant *gnt_list_entry,
289 341
290static struct grant *get_grant(grant_ref_t *gref_head, 342static struct grant *get_grant(grant_ref_t *gref_head,
291 unsigned long gfn, 343 unsigned long gfn,
292 struct blkfront_info *info) 344 struct blkfront_ring_info *rinfo)
293{ 345{
294 struct grant *gnt_list_entry = get_free_grant(info); 346 struct grant *gnt_list_entry = get_free_grant(rinfo);
347 struct blkfront_info *info = rinfo->dev_info;
295 348
296 if (gnt_list_entry->gref != GRANT_INVALID_REF) 349 if (gnt_list_entry->gref != GRANT_INVALID_REF)
297 return gnt_list_entry; 350 return gnt_list_entry;
@@ -312,9 +365,10 @@ static struct grant *get_grant(grant_ref_t *gref_head,
312} 365}
313 366
314static struct grant *get_indirect_grant(grant_ref_t *gref_head, 367static struct grant *get_indirect_grant(grant_ref_t *gref_head,
315 struct blkfront_info *info) 368 struct blkfront_ring_info *rinfo)
316{ 369{
317 struct grant *gnt_list_entry = get_free_grant(info); 370 struct grant *gnt_list_entry = get_free_grant(rinfo);
371 struct blkfront_info *info = rinfo->dev_info;
318 372
319 if (gnt_list_entry->gref != GRANT_INVALID_REF) 373 if (gnt_list_entry->gref != GRANT_INVALID_REF)
320 return gnt_list_entry; 374 return gnt_list_entry;
@@ -326,8 +380,8 @@ static struct grant *get_indirect_grant(grant_ref_t *gref_head,
326 struct page *indirect_page; 380 struct page *indirect_page;
327 381
328 /* Fetch a pre-allocated page to use for indirect grefs */ 382 /* Fetch a pre-allocated page to use for indirect grefs */
329 BUG_ON(list_empty(&info->indirect_pages)); 383 BUG_ON(list_empty(&rinfo->indirect_pages));
330 indirect_page = list_first_entry(&info->indirect_pages, 384 indirect_page = list_first_entry(&rinfo->indirect_pages,
331 struct page, lru); 385 struct page, lru);
332 list_del(&indirect_page->lru); 386 list_del(&indirect_page->lru);
333 gnt_list_entry->page = indirect_page; 387 gnt_list_entry->page = indirect_page;
@@ -403,8 +457,8 @@ static void xlbd_release_minors(unsigned int minor, unsigned int nr)
403 457
404static void blkif_restart_queue_callback(void *arg) 458static void blkif_restart_queue_callback(void *arg)
405{ 459{
406 struct blkfront_info *info = (struct blkfront_info *)arg; 460 struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)arg;
407 schedule_work(&info->work); 461 schedule_work(&rinfo->work);
408} 462}
409 463
410static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg) 464static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg)
@@ -456,16 +510,33 @@ static int blkif_ioctl(struct block_device *bdev, fmode_t mode,
456 return 0; 510 return 0;
457} 511}
458 512
459static int blkif_queue_discard_req(struct request *req) 513static unsigned long blkif_ring_get_request(struct blkfront_ring_info *rinfo,
514 struct request *req,
515 struct blkif_request **ring_req)
460{ 516{
461 struct blkfront_info *info = req->rq_disk->private_data; 517 unsigned long id;
518
519 *ring_req = RING_GET_REQUEST(&rinfo->ring, rinfo->ring.req_prod_pvt);
520 rinfo->ring.req_prod_pvt++;
521
522 id = get_id_from_freelist(rinfo);
523 rinfo->shadow[id].request = req;
524 rinfo->shadow[id].status = REQ_WAITING;
525 rinfo->shadow[id].associated_id = NO_ASSOCIATED_ID;
526
527 (*ring_req)->u.rw.id = id;
528
529 return id;
530}
531
532static int blkif_queue_discard_req(struct request *req, struct blkfront_ring_info *rinfo)
533{
534 struct blkfront_info *info = rinfo->dev_info;
462 struct blkif_request *ring_req; 535 struct blkif_request *ring_req;
463 unsigned long id; 536 unsigned long id;
464 537
465 /* Fill out a communications ring structure. */ 538 /* Fill out a communications ring structure. */
466 ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); 539 id = blkif_ring_get_request(rinfo, req, &ring_req);
467 id = get_id_from_freelist(info);
468 info->shadow[id].request = req;
469 540
470 ring_req->operation = BLKIF_OP_DISCARD; 541 ring_req->operation = BLKIF_OP_DISCARD;
471 ring_req->u.discard.nr_sectors = blk_rq_sectors(req); 542 ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
@@ -476,10 +547,8 @@ static int blkif_queue_discard_req(struct request *req)
476 else 547 else
477 ring_req->u.discard.flag = 0; 548 ring_req->u.discard.flag = 0;
478 549
479 info->ring.req_prod_pvt++;
480
481 /* Keep a private copy so we can reissue requests when recovering. */ 550 /* Keep a private copy so we can reissue requests when recovering. */
482 info->shadow[id].req = *ring_req; 551 rinfo->shadow[id].req = *ring_req;
483 552
484 return 0; 553 return 0;
485} 554}
@@ -487,7 +556,7 @@ static int blkif_queue_discard_req(struct request *req)
487struct setup_rw_req { 556struct setup_rw_req {
488 unsigned int grant_idx; 557 unsigned int grant_idx;
489 struct blkif_request_segment *segments; 558 struct blkif_request_segment *segments;
490 struct blkfront_info *info; 559 struct blkfront_ring_info *rinfo;
491 struct blkif_request *ring_req; 560 struct blkif_request *ring_req;
492 grant_ref_t gref_head; 561 grant_ref_t gref_head;
493 unsigned int id; 562 unsigned int id;
@@ -495,6 +564,9 @@ struct setup_rw_req {
495 bool need_copy; 564 bool need_copy;
496 unsigned int bvec_off; 565 unsigned int bvec_off;
497 char *bvec_data; 566 char *bvec_data;
567
568 bool require_extra_req;
569 struct blkif_request *extra_ring_req;
498}; 570};
499 571
500static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, 572static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
@@ -507,8 +579,24 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
507 /* Convenient aliases */ 579 /* Convenient aliases */
508 unsigned int grant_idx = setup->grant_idx; 580 unsigned int grant_idx = setup->grant_idx;
509 struct blkif_request *ring_req = setup->ring_req; 581 struct blkif_request *ring_req = setup->ring_req;
510 struct blkfront_info *info = setup->info; 582 struct blkfront_ring_info *rinfo = setup->rinfo;
511 struct blk_shadow *shadow = &info->shadow[setup->id]; 583 /*
584 * We always use the shadow of the first request to store the list
 585 * of grants associated with the block I/O request. This makes the
 586 * completion easier to handle even if the block I/O request is
587 * split.
588 */
589 struct blk_shadow *shadow = &rinfo->shadow[setup->id];
590
591 if (unlikely(setup->require_extra_req &&
592 grant_idx >= BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
593 /*
 594 * We are using the second request, so set up grant_idx
 595 * to index into its segment array.
596 */
597 grant_idx -= BLKIF_MAX_SEGMENTS_PER_REQUEST;
598 ring_req = setup->extra_ring_req;
599 }
512 600
513 if ((ring_req->operation == BLKIF_OP_INDIRECT) && 601 if ((ring_req->operation == BLKIF_OP_INDIRECT) &&
514 (grant_idx % GRANTS_PER_INDIRECT_FRAME == 0)) { 602 (grant_idx % GRANTS_PER_INDIRECT_FRAME == 0)) {
@@ -516,15 +604,19 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
516 kunmap_atomic(setup->segments); 604 kunmap_atomic(setup->segments);
517 605
518 n = grant_idx / GRANTS_PER_INDIRECT_FRAME; 606 n = grant_idx / GRANTS_PER_INDIRECT_FRAME;
519 gnt_list_entry = get_indirect_grant(&setup->gref_head, info); 607 gnt_list_entry = get_indirect_grant(&setup->gref_head, rinfo);
520 shadow->indirect_grants[n] = gnt_list_entry; 608 shadow->indirect_grants[n] = gnt_list_entry;
521 setup->segments = kmap_atomic(gnt_list_entry->page); 609 setup->segments = kmap_atomic(gnt_list_entry->page);
522 ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref; 610 ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref;
523 } 611 }
524 612
525 gnt_list_entry = get_grant(&setup->gref_head, gfn, info); 613 gnt_list_entry = get_grant(&setup->gref_head, gfn, rinfo);
526 ref = gnt_list_entry->gref; 614 ref = gnt_list_entry->gref;
527 shadow->grants_used[grant_idx] = gnt_list_entry; 615 /*
616 * All the grants are stored in the shadow of the first
617 * request. Therefore we have to use the global index.
618 */
619 shadow->grants_used[setup->grant_idx] = gnt_list_entry;
528 620
529 if (setup->need_copy) { 621 if (setup->need_copy) {
530 void *shared_data; 622 void *shared_data;
@@ -566,16 +658,36 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
566 (setup->grant_idx)++; 658 (setup->grant_idx)++;
567} 659}
568 660
569static int blkif_queue_rw_req(struct request *req) 661static void blkif_setup_extra_req(struct blkif_request *first,
662 struct blkif_request *second)
570{ 663{
571 struct blkfront_info *info = req->rq_disk->private_data; 664 uint16_t nr_segments = first->u.rw.nr_segments;
572 struct blkif_request *ring_req; 665
573 unsigned long id; 666 /*
667 * The second request is only present when the first request uses
 668 * all its segments. It is always a continuation of the first one.
669 */
670 first->u.rw.nr_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST;
671
672 second->u.rw.nr_segments = nr_segments - BLKIF_MAX_SEGMENTS_PER_REQUEST;
673 second->u.rw.sector_number = first->u.rw.sector_number +
674 (BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) / 512;
675
676 second->u.rw.handle = first->u.rw.handle;
677 second->operation = first->operation;
678}
679
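A quick worked example of blkif_setup_extra_req() above, under the same assumed constants (BLKIF_MAX_SEGMENTS_PER_REQUEST = 11, XEN_PAGE_SIZE = 4KB, 512-byte sectors): a request with 16 grants is split into 11 + 5 segments, and the second request starts 11 * 4096 / 512 = 88 sectors after the first. Stand-in names again, purely to check the numbers:

#include <stdio.h>

#define DEMO_MAX_SEGS      11u
#define DEMO_XEN_PAGE_SIZE 4096u

int main(void)
{
        unsigned int nr_segments = 16;  /* e.g. one 64KB Linux page worth of 4KB grants */
        unsigned int first_segs  = DEMO_MAX_SEGS;
        unsigned int second_segs = nr_segments - DEMO_MAX_SEGS;
        unsigned int sector_off  = DEMO_MAX_SEGS * DEMO_XEN_PAGE_SIZE / 512;

        printf("first: %u segs, second: %u segs, second starts +%u sectors\n",
               first_segs, second_segs, sector_off);
        return 0;
}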
680static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *rinfo)
681{
682 struct blkfront_info *info = rinfo->dev_info;
683 struct blkif_request *ring_req, *extra_ring_req = NULL;
684 unsigned long id, extra_id = NO_ASSOCIATED_ID;
685 bool require_extra_req = false;
574 int i; 686 int i;
575 struct setup_rw_req setup = { 687 struct setup_rw_req setup = {
576 .grant_idx = 0, 688 .grant_idx = 0,
577 .segments = NULL, 689 .segments = NULL,
578 .info = info, 690 .rinfo = rinfo,
579 .need_copy = rq_data_dir(req) && info->feature_persistent, 691 .need_copy = rq_data_dir(req) && info->feature_persistent,
580 }; 692 };
581 693
@@ -584,7 +696,6 @@ static int blkif_queue_rw_req(struct request *req)
584 * existing persistent grants, or if we have to get new grants, 696 * existing persistent grants, or if we have to get new grants,
585 * as there are not sufficiently many free. 697 * as there are not sufficiently many free.
586 */ 698 */
587 bool new_persistent_gnts;
588 struct scatterlist *sg; 699 struct scatterlist *sg;
589 int num_sg, max_grefs, num_grant; 700 int num_sg, max_grefs, num_grant;
590 701
@@ -596,41 +707,36 @@ static int blkif_queue_rw_req(struct request *req)
596 */ 707 */
597 max_grefs += INDIRECT_GREFS(max_grefs); 708 max_grefs += INDIRECT_GREFS(max_grefs);
598 709
599 /* Check if we have enough grants to allocate a requests */ 710 /*
600 if (info->persistent_gnts_c < max_grefs) { 711 * We have to reserve 'max_grefs' grants because persistent
601 new_persistent_gnts = 1; 712 * grants are shared by all rings.
602 if (gnttab_alloc_grant_references( 713 */
603 max_grefs - info->persistent_gnts_c, 714 if (max_grefs > 0)
604 &setup.gref_head) < 0) { 715 if (gnttab_alloc_grant_references(max_grefs, &setup.gref_head) < 0) {
605 gnttab_request_free_callback( 716 gnttab_request_free_callback(
606 &info->callback, 717 &rinfo->callback,
607 blkif_restart_queue_callback, 718 blkif_restart_queue_callback,
608 info, 719 rinfo,
609 max_grefs); 720 max_grefs);
610 return 1; 721 return 1;
611 } 722 }
612 } else
613 new_persistent_gnts = 0;
614 723
615 /* Fill out a communications ring structure. */ 724 /* Fill out a communications ring structure. */
616 ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); 725 id = blkif_ring_get_request(rinfo, req, &ring_req);
617 id = get_id_from_freelist(info);
618 info->shadow[id].request = req;
619
620 BUG_ON(info->max_indirect_segments == 0 &&
621 GREFS(req->nr_phys_segments) > BLKIF_MAX_SEGMENTS_PER_REQUEST);
622 BUG_ON(info->max_indirect_segments &&
623 GREFS(req->nr_phys_segments) > info->max_indirect_segments);
624 726
625 num_sg = blk_rq_map_sg(req->q, req, info->shadow[id].sg); 727 num_sg = blk_rq_map_sg(req->q, req, rinfo->shadow[id].sg);
626 num_grant = 0; 728 num_grant = 0;
627 /* Calculate the number of grant used */ 729 /* Calculate the number of grant used */
628 for_each_sg(info->shadow[id].sg, sg, num_sg, i) 730 for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i)
629 num_grant += gnttab_count_grant(sg->offset, sg->length); 731 num_grant += gnttab_count_grant(sg->offset, sg->length);
630 732
631 ring_req->u.rw.id = id; 733 require_extra_req = info->max_indirect_segments == 0 &&
632 info->shadow[id].num_sg = num_sg; 734 num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST;
633 if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST) { 735 BUG_ON(!HAS_EXTRA_REQ && require_extra_req);
736
737 rinfo->shadow[id].num_sg = num_sg;
738 if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST &&
739 likely(!require_extra_req)) {
634 /* 740 /*
635 * The indirect operation can only be a BLKIF_OP_READ or 741 * The indirect operation can only be a BLKIF_OP_READ or
636 * BLKIF_OP_WRITE 742 * BLKIF_OP_WRITE
@@ -670,11 +776,31 @@ static int blkif_queue_rw_req(struct request *req)
670 } 776 }
671 } 777 }
672 ring_req->u.rw.nr_segments = num_grant; 778 ring_req->u.rw.nr_segments = num_grant;
779 if (unlikely(require_extra_req)) {
780 extra_id = blkif_ring_get_request(rinfo, req,
781 &extra_ring_req);
782 /*
783 * Only the first request contains the scatter-gather
784 * list.
785 */
786 rinfo->shadow[extra_id].num_sg = 0;
787
788 blkif_setup_extra_req(ring_req, extra_ring_req);
789
790 /* Link the 2 requests together */
791 rinfo->shadow[extra_id].associated_id = id;
792 rinfo->shadow[id].associated_id = extra_id;
793 }
673 } 794 }
674 795
675 setup.ring_req = ring_req; 796 setup.ring_req = ring_req;
676 setup.id = id; 797 setup.id = id;
677 for_each_sg(info->shadow[id].sg, sg, num_sg, i) { 798
799 setup.require_extra_req = require_extra_req;
800 if (unlikely(require_extra_req))
801 setup.extra_ring_req = extra_ring_req;
802
803 for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i) {
678 BUG_ON(sg->offset + sg->length > PAGE_SIZE); 804 BUG_ON(sg->offset + sg->length > PAGE_SIZE);
679 805
680 if (setup.need_copy) { 806 if (setup.need_copy) {
@@ -694,12 +820,12 @@ static int blkif_queue_rw_req(struct request *req)
694 if (setup.segments) 820 if (setup.segments)
695 kunmap_atomic(setup.segments); 821 kunmap_atomic(setup.segments);
696 822
697 info->ring.req_prod_pvt++;
698
699 /* Keep a private copy so we can reissue requests when recovering. */ 823 /* Keep a private copy so we can reissue requests when recovering. */
700 info->shadow[id].req = *ring_req; 824 rinfo->shadow[id].req = *ring_req;
825 if (unlikely(require_extra_req))
826 rinfo->shadow[extra_id].req = *extra_ring_req;
701 827
702 if (new_persistent_gnts) 828 if (max_grefs > 0)
703 gnttab_free_grant_references(setup.gref_head); 829 gnttab_free_grant_references(setup.gref_head);
704 830
705 return 0; 831 return 0;
@@ -711,27 +837,25 @@ static int blkif_queue_rw_req(struct request *req)
711 * 837 *
712 * @req: a request struct 838 * @req: a request struct
713 */ 839 */
714static int blkif_queue_request(struct request *req) 840static int blkif_queue_request(struct request *req, struct blkfront_ring_info *rinfo)
715{ 841{
716 struct blkfront_info *info = req->rq_disk->private_data; 842 if (unlikely(rinfo->dev_info->connected != BLKIF_STATE_CONNECTED))
717
718 if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
719 return 1; 843 return 1;
720 844
721 if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) 845 if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE)))
722 return blkif_queue_discard_req(req); 846 return blkif_queue_discard_req(req, rinfo);
723 else 847 else
724 return blkif_queue_rw_req(req); 848 return blkif_queue_rw_req(req, rinfo);
725} 849}
726 850
727static inline void flush_requests(struct blkfront_info *info) 851static inline void flush_requests(struct blkfront_ring_info *rinfo)
728{ 852{
729 int notify; 853 int notify;
730 854
731 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify); 855 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&rinfo->ring, notify);
732 856
733 if (notify) 857 if (notify)
734 notify_remote_via_irq(info->irq); 858 notify_remote_via_irq(rinfo->irq);
735} 859}
736 860
737static inline bool blkif_request_flush_invalid(struct request *req, 861static inline bool blkif_request_flush_invalid(struct request *req,
@@ -745,38 +869,50 @@ static inline bool blkif_request_flush_invalid(struct request *req,
745} 869}
746 870
747static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx, 871static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx,
748 const struct blk_mq_queue_data *qd) 872 const struct blk_mq_queue_data *qd)
749{ 873{
750 struct blkfront_info *info = qd->rq->rq_disk->private_data; 874 unsigned long flags;
875 struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)hctx->driver_data;
751 876
752 blk_mq_start_request(qd->rq); 877 blk_mq_start_request(qd->rq);
753 spin_lock_irq(&info->io_lock); 878 spin_lock_irqsave(&rinfo->ring_lock, flags);
754 if (RING_FULL(&info->ring)) 879 if (RING_FULL(&rinfo->ring))
755 goto out_busy; 880 goto out_busy;
756 881
757 if (blkif_request_flush_invalid(qd->rq, info)) 882 if (blkif_request_flush_invalid(qd->rq, rinfo->dev_info))
758 goto out_err; 883 goto out_err;
759 884
760 if (blkif_queue_request(qd->rq)) 885 if (blkif_queue_request(qd->rq, rinfo))
761 goto out_busy; 886 goto out_busy;
762 887
763 flush_requests(info); 888 flush_requests(rinfo);
764 spin_unlock_irq(&info->io_lock); 889 spin_unlock_irqrestore(&rinfo->ring_lock, flags);
765 return BLK_MQ_RQ_QUEUE_OK; 890 return BLK_MQ_RQ_QUEUE_OK;
766 891
767out_err: 892out_err:
768 spin_unlock_irq(&info->io_lock); 893 spin_unlock_irqrestore(&rinfo->ring_lock, flags);
769 return BLK_MQ_RQ_QUEUE_ERROR; 894 return BLK_MQ_RQ_QUEUE_ERROR;
770 895
771out_busy: 896out_busy:
772 spin_unlock_irq(&info->io_lock); 897 spin_unlock_irqrestore(&rinfo->ring_lock, flags);
773 blk_mq_stop_hw_queue(hctx); 898 blk_mq_stop_hw_queue(hctx);
774 return BLK_MQ_RQ_QUEUE_BUSY; 899 return BLK_MQ_RQ_QUEUE_BUSY;
775} 900}
776 901
902static int blk_mq_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
903 unsigned int index)
904{
905 struct blkfront_info *info = (struct blkfront_info *)data;
906
907 BUG_ON(info->nr_rings <= index);
908 hctx->driver_data = &info->rinfo[index];
909 return 0;
910}
911
777static struct blk_mq_ops blkfront_mq_ops = { 912static struct blk_mq_ops blkfront_mq_ops = {
778 .queue_rq = blkif_queue_rq, 913 .queue_rq = blkif_queue_rq,
779 .map_queue = blk_mq_map_queue, 914 .map_queue = blk_mq_map_queue,
915 .init_hctx = blk_mq_init_hctx,
780}; 916};
781 917
782static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, 918static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
@@ -788,19 +924,28 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
788 924
789 memset(&info->tag_set, 0, sizeof(info->tag_set)); 925 memset(&info->tag_set, 0, sizeof(info->tag_set));
790 info->tag_set.ops = &blkfront_mq_ops; 926 info->tag_set.ops = &blkfront_mq_ops;
791 info->tag_set.nr_hw_queues = 1; 927 info->tag_set.nr_hw_queues = info->nr_rings;
792 info->tag_set.queue_depth = BLK_RING_SIZE(info); 928 if (HAS_EXTRA_REQ && info->max_indirect_segments == 0) {
929 /*
 930 * When indirect descriptors are not supported, the I/O request
 931 * will be split between multiple requests in the ring.
 932 * To avoid problems when sending the request, divide the
 933 * queue depth by 2.
934 */
935 info->tag_set.queue_depth = BLK_RING_SIZE(info) / 2;
936 } else
937 info->tag_set.queue_depth = BLK_RING_SIZE(info);
793 info->tag_set.numa_node = NUMA_NO_NODE; 938 info->tag_set.numa_node = NUMA_NO_NODE;
794 info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; 939 info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
795 info->tag_set.cmd_size = 0; 940 info->tag_set.cmd_size = 0;
796 info->tag_set.driver_data = info; 941 info->tag_set.driver_data = info;
797 942
798 if (blk_mq_alloc_tag_set(&info->tag_set)) 943 if (blk_mq_alloc_tag_set(&info->tag_set))
799 return -1; 944 return -EINVAL;
800 rq = blk_mq_init_queue(&info->tag_set); 945 rq = blk_mq_init_queue(&info->tag_set);
801 if (IS_ERR(rq)) { 946 if (IS_ERR(rq)) {
802 blk_mq_free_tag_set(&info->tag_set); 947 blk_mq_free_tag_set(&info->tag_set);
803 return -1; 948 return PTR_ERR(rq);
804 } 949 }
805 950
806 queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq); 951 queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);
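The depth halving above keeps the ring from overflowing when a request may have to be split in two. The stand-alone calculation below shows the resulting depth, assuming 32 ring slots per page (a typical figure for a 4KB blkif ring; the real driver uses BLK_RING_SIZE()):

/* Sketch only: queue depth chosen from ring size and indirect support. */
#include <stdbool.h>
#include <stdio.h>

static unsigned int demo_queue_depth(unsigned int ring_slots,
				     bool has_extra_req,
				     unsigned int max_indirect_segments)
{
	/*
	 * Without indirect descriptors a large request may need an extra
	 * slot, so only half the ring is advertised to the block layer.
	 */
	if (has_extra_req && max_indirect_segments == 0)
		return ring_slots / 2;
	return ring_slots;
}

int main(void)
{
	printf("depth without indirect descriptors: %u\n",
	       demo_queue_depth(32, true, 0));		/* -> 16 */
	printf("depth with indirect descriptors:    %u\n",
	       demo_queue_depth(32, true, 256));	/* -> 32 */
	return 0;
}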
@@ -1028,7 +1173,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
1028 1173
1029static void xlvbd_release_gendisk(struct blkfront_info *info) 1174static void xlvbd_release_gendisk(struct blkfront_info *info)
1030{ 1175{
1031 unsigned int minor, nr_minors; 1176 unsigned int minor, nr_minors, i;
1032 1177
1033 if (info->rq == NULL) 1178 if (info->rq == NULL)
1034 return; 1179 return;
@@ -1036,11 +1181,15 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
1036 /* No more blkif_request(). */ 1181 /* No more blkif_request(). */
1037 blk_mq_stop_hw_queues(info->rq); 1182 blk_mq_stop_hw_queues(info->rq);
1038 1183
1039 /* No more gnttab callback work. */ 1184 for (i = 0; i < info->nr_rings; i++) {
1040 gnttab_cancel_free_callback(&info->callback); 1185 struct blkfront_ring_info *rinfo = &info->rinfo[i];
1041 1186
1042 /* Flush gnttab callback work. Must be done with no locks held. */ 1187 /* No more gnttab callback work. */
1043 flush_work(&info->work); 1188 gnttab_cancel_free_callback(&rinfo->callback);
1189
1190 /* Flush gnttab callback work. Must be done with no locks held. */
1191 flush_work(&rinfo->work);
1192 }
1044 1193
1045 del_gendisk(info->gd); 1194 del_gendisk(info->gd);
1046 1195
@@ -1056,88 +1205,87 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
1056 info->gd = NULL; 1205 info->gd = NULL;
1057} 1206}
1058 1207
1059/* Must be called with io_lock holded */ 1208/* Must be called with rinfo->ring_lock held. */
1060static void kick_pending_request_queues(struct blkfront_info *info) 1209static inline void kick_pending_request_queues_locked(struct blkfront_ring_info *rinfo)
1061{ 1210{
1062 if (!RING_FULL(&info->ring)) 1211 if (!RING_FULL(&rinfo->ring))
1063 blk_mq_start_stopped_hw_queues(info->rq, true); 1212 blk_mq_start_stopped_hw_queues(rinfo->dev_info->rq, true);
1064} 1213}
1065 1214
1066static void blkif_restart_queue(struct work_struct *work) 1215static void kick_pending_request_queues(struct blkfront_ring_info *rinfo)
1067{ 1216{
1068 struct blkfront_info *info = container_of(work, struct blkfront_info, work); 1217 unsigned long flags;
1069 1218
1070 spin_lock_irq(&info->io_lock); 1219 spin_lock_irqsave(&rinfo->ring_lock, flags);
1071 if (info->connected == BLKIF_STATE_CONNECTED) 1220 kick_pending_request_queues_locked(rinfo);
1072 kick_pending_request_queues(info); 1221 spin_unlock_irqrestore(&rinfo->ring_lock, flags);
1073 spin_unlock_irq(&info->io_lock);
1074} 1222}
1075 1223
1076static void blkif_free(struct blkfront_info *info, int suspend) 1224static void blkif_restart_queue(struct work_struct *work)
1077{ 1225{
1078 struct grant *persistent_gnt; 1226 struct blkfront_ring_info *rinfo = container_of(work, struct blkfront_ring_info, work);
1079 struct grant *n;
1080 int i, j, segs;
1081 1227
1082 /* Prevent new requests being issued until we fix things up. */ 1228 if (rinfo->dev_info->connected == BLKIF_STATE_CONNECTED)
1083 spin_lock_irq(&info->io_lock); 1229 kick_pending_request_queues(rinfo);
1084 info->connected = suspend ? 1230}
1085 BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
1086 /* No more blkif_request(). */
1087 if (info->rq)
1088 blk_mq_stop_hw_queues(info->rq);
1089 1231
1090 /* Remove all persistent grants */ 1232static void blkif_free_ring(struct blkfront_ring_info *rinfo)
1091 if (!list_empty(&info->grants)) { 1233{
1092 list_for_each_entry_safe(persistent_gnt, n, 1234 struct grant *persistent_gnt, *n;
1093 &info->grants, node) { 1235 struct blkfront_info *info = rinfo->dev_info;
1094 list_del(&persistent_gnt->node); 1236 int i, j, segs;
1095 if (persistent_gnt->gref != GRANT_INVALID_REF) {
1096 gnttab_end_foreign_access(persistent_gnt->gref,
1097 0, 0UL);
1098 info->persistent_gnts_c--;
1099 }
1100 if (info->feature_persistent)
1101 __free_page(persistent_gnt->page);
1102 kfree(persistent_gnt);
1103 }
1104 }
1105 BUG_ON(info->persistent_gnts_c != 0);
1106 1237
1107 /* 1238 /*
1108 * Remove indirect pages, this only happens when using indirect 1239 * Remove indirect pages, this only happens when using indirect
1109 * descriptors but not persistent grants 1240 * descriptors but not persistent grants
1110 */ 1241 */
1111 if (!list_empty(&info->indirect_pages)) { 1242 if (!list_empty(&rinfo->indirect_pages)) {
1112 struct page *indirect_page, *n; 1243 struct page *indirect_page, *n;
1113 1244
1114 BUG_ON(info->feature_persistent); 1245 BUG_ON(info->feature_persistent);
1115 list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) { 1246 list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) {
1116 list_del(&indirect_page->lru); 1247 list_del(&indirect_page->lru);
1117 __free_page(indirect_page); 1248 __free_page(indirect_page);
1118 } 1249 }
1119 } 1250 }
1120 1251
1252 /* Remove all persistent grants. */
1253 if (!list_empty(&rinfo->grants)) {
1254 list_for_each_entry_safe(persistent_gnt, n,
1255 &rinfo->grants, node) {
1256 list_del(&persistent_gnt->node);
1257 if (persistent_gnt->gref != GRANT_INVALID_REF) {
1258 gnttab_end_foreign_access(persistent_gnt->gref,
1259 0, 0UL);
1260 rinfo->persistent_gnts_c--;
1261 }
1262 if (info->feature_persistent)
1263 __free_page(persistent_gnt->page);
1264 kfree(persistent_gnt);
1265 }
1266 }
1267 BUG_ON(rinfo->persistent_gnts_c != 0);
1268
1121 for (i = 0; i < BLK_RING_SIZE(info); i++) { 1269 for (i = 0; i < BLK_RING_SIZE(info); i++) {
1122 /* 1270 /*
1123 * Clear persistent grants present in requests already 1271 * Clear persistent grants present in requests already
1124 * on the shared ring 1272 * on the shared ring
1125 */ 1273 */
1126 if (!info->shadow[i].request) 1274 if (!rinfo->shadow[i].request)
1127 goto free_shadow; 1275 goto free_shadow;
1128 1276
1129 segs = info->shadow[i].req.operation == BLKIF_OP_INDIRECT ? 1277 segs = rinfo->shadow[i].req.operation == BLKIF_OP_INDIRECT ?
1130 info->shadow[i].req.u.indirect.nr_segments : 1278 rinfo->shadow[i].req.u.indirect.nr_segments :
1131 info->shadow[i].req.u.rw.nr_segments; 1279 rinfo->shadow[i].req.u.rw.nr_segments;
1132 for (j = 0; j < segs; j++) { 1280 for (j = 0; j < segs; j++) {
1133 persistent_gnt = info->shadow[i].grants_used[j]; 1281 persistent_gnt = rinfo->shadow[i].grants_used[j];
1134 gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); 1282 gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
1135 if (info->feature_persistent) 1283 if (info->feature_persistent)
1136 __free_page(persistent_gnt->page); 1284 __free_page(persistent_gnt->page);
1137 kfree(persistent_gnt); 1285 kfree(persistent_gnt);
1138 } 1286 }
1139 1287
1140 if (info->shadow[i].req.operation != BLKIF_OP_INDIRECT) 1288 if (rinfo->shadow[i].req.operation != BLKIF_OP_INDIRECT)
1141 /* 1289 /*
1142 * If this is not an indirect operation don't try to 1290 * If this is not an indirect operation don't try to
1143 * free indirect segments 1291 * free indirect segments
@@ -1145,42 +1293,59 @@ static void blkif_free(struct blkfront_info *info, int suspend)
1145 goto free_shadow; 1293 goto free_shadow;
1146 1294
1147 for (j = 0; j < INDIRECT_GREFS(segs); j++) { 1295 for (j = 0; j < INDIRECT_GREFS(segs); j++) {
1148 persistent_gnt = info->shadow[i].indirect_grants[j]; 1296 persistent_gnt = rinfo->shadow[i].indirect_grants[j];
1149 gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); 1297 gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
1150 __free_page(persistent_gnt->page); 1298 __free_page(persistent_gnt->page);
1151 kfree(persistent_gnt); 1299 kfree(persistent_gnt);
1152 } 1300 }
1153 1301
1154free_shadow: 1302free_shadow:
1155 kfree(info->shadow[i].grants_used); 1303 kfree(rinfo->shadow[i].grants_used);
1156 info->shadow[i].grants_used = NULL; 1304 rinfo->shadow[i].grants_used = NULL;
1157 kfree(info->shadow[i].indirect_grants); 1305 kfree(rinfo->shadow[i].indirect_grants);
1158 info->shadow[i].indirect_grants = NULL; 1306 rinfo->shadow[i].indirect_grants = NULL;
1159 kfree(info->shadow[i].sg); 1307 kfree(rinfo->shadow[i].sg);
1160 info->shadow[i].sg = NULL; 1308 rinfo->shadow[i].sg = NULL;
1161 } 1309 }
1162 1310
1163 /* No more gnttab callback work. */ 1311 /* No more gnttab callback work. */
1164 gnttab_cancel_free_callback(&info->callback); 1312 gnttab_cancel_free_callback(&rinfo->callback);
1165 spin_unlock_irq(&info->io_lock);
1166 1313
1167 /* Flush gnttab callback work. Must be done with no locks held. */ 1314 /* Flush gnttab callback work. Must be done with no locks held. */
1168 flush_work(&info->work); 1315 flush_work(&rinfo->work);
1169 1316
1170 /* Free resources associated with old device channel. */ 1317 /* Free resources associated with old device channel. */
1171 for (i = 0; i < info->nr_ring_pages; i++) { 1318 for (i = 0; i < info->nr_ring_pages; i++) {
1172 if (info->ring_ref[i] != GRANT_INVALID_REF) { 1319 if (rinfo->ring_ref[i] != GRANT_INVALID_REF) {
1173 gnttab_end_foreign_access(info->ring_ref[i], 0, 0); 1320 gnttab_end_foreign_access(rinfo->ring_ref[i], 0, 0);
1174 info->ring_ref[i] = GRANT_INVALID_REF; 1321 rinfo->ring_ref[i] = GRANT_INVALID_REF;
1175 } 1322 }
1176 } 1323 }
1177 free_pages((unsigned long)info->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE)); 1324 free_pages((unsigned long)rinfo->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE));
1178 info->ring.sring = NULL; 1325 rinfo->ring.sring = NULL;
1179 1326
1180 if (info->irq) 1327 if (rinfo->irq)
1181 unbind_from_irqhandler(info->irq, info); 1328 unbind_from_irqhandler(rinfo->irq, rinfo);
1182 info->evtchn = info->irq = 0; 1329 rinfo->evtchn = rinfo->irq = 0;
1330}
1183 1331
1332static void blkif_free(struct blkfront_info *info, int suspend)
1333{
1334 unsigned int i;
1335
1336 /* Prevent new requests being issued until we fix things up. */
1337 info->connected = suspend ?
1338 BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
1339 /* No more blkif_request(). */
1340 if (info->rq)
1341 blk_mq_stop_hw_queues(info->rq);
1342
1343 for (i = 0; i < info->nr_rings; i++)
1344 blkif_free_ring(&info->rinfo[i]);
1345
1346 kfree(info->rinfo);
1347 info->rinfo = NULL;
1348 info->nr_rings = 0;
1184} 1349}
1185 1350
1186struct copy_from_grant { 1351struct copy_from_grant {
@@ -1209,19 +1374,93 @@ static void blkif_copy_from_grant(unsigned long gfn, unsigned int offset,
1209 kunmap_atomic(shared_data); 1374 kunmap_atomic(shared_data);
1210} 1375}
1211 1376
1212static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, 1377static enum blk_req_status blkif_rsp_to_req_status(int rsp)
1378{
1379 switch (rsp)
1380 {
1381 case BLKIF_RSP_OKAY:
1382 return REQ_DONE;
1383 case BLKIF_RSP_EOPNOTSUPP:
1384 return REQ_EOPNOTSUPP;
1385 case BLKIF_RSP_ERROR:
1386 /* Fallthrough. */
1387 default:
1388 return REQ_ERROR;
1389 }
1390}
1391
1392/*
 1393 * Get the final status of the block request based on two ring responses.
1394 */
1395static int blkif_get_final_status(enum blk_req_status s1,
1396 enum blk_req_status s2)
1397{
1398 BUG_ON(s1 == REQ_WAITING);
1399 BUG_ON(s2 == REQ_WAITING);
1400
1401 if (s1 == REQ_ERROR || s2 == REQ_ERROR)
1402 return BLKIF_RSP_ERROR;
1403 else if (s1 == REQ_EOPNOTSUPP || s2 == REQ_EOPNOTSUPP)
1404 return BLKIF_RSP_EOPNOTSUPP;
1405 return BLKIF_RSP_OKAY;
1406}
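These two helpers reduce the pair of responses produced by a split request to a single blkif status: any error wins, then EOPNOTSUPP, otherwise success. The self-contained sketch below performs the same reduction, with local demo_ enums standing in for the blkif constants:

/* Sketch only: combining two per-half statuses into one final status. */
#include <stdio.h>

enum demo_rsp { DEMO_RSP_OKAY, DEMO_RSP_ERROR, DEMO_RSP_EOPNOTSUPP };
enum demo_req_status { DEMO_REQ_WAITING, DEMO_REQ_DONE,
		       DEMO_REQ_ERROR, DEMO_REQ_EOPNOTSUPP };

static enum demo_req_status demo_rsp_to_status(enum demo_rsp rsp)
{
	switch (rsp) {
	case DEMO_RSP_OKAY:
		return DEMO_REQ_DONE;
	case DEMO_RSP_EOPNOTSUPP:
		return DEMO_REQ_EOPNOTSUPP;
	default:
		return DEMO_REQ_ERROR;
	}
}

/* Any error wins, then EOPNOTSUPP, otherwise both halves succeeded. */
static enum demo_rsp demo_final_status(enum demo_req_status s1,
				       enum demo_req_status s2)
{
	if (s1 == DEMO_REQ_ERROR || s2 == DEMO_REQ_ERROR)
		return DEMO_RSP_ERROR;
	if (s1 == DEMO_REQ_EOPNOTSUPP || s2 == DEMO_REQ_EOPNOTSUPP)
		return DEMO_RSP_EOPNOTSUPP;
	return DEMO_RSP_OKAY;
}

int main(void)
{
	enum demo_req_status a = demo_rsp_to_status(DEMO_RSP_OKAY);
	enum demo_req_status b = demo_rsp_to_status(DEMO_RSP_ERROR);

	printf("okay + error -> %d (expect %d)\n",
	       demo_final_status(a, b), DEMO_RSP_ERROR);
	return 0;
}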
1407
1408static bool blkif_completion(unsigned long *id,
1409 struct blkfront_ring_info *rinfo,
1213 struct blkif_response *bret) 1410 struct blkif_response *bret)
1214{ 1411{
1215 int i = 0; 1412 int i = 0;
1216 struct scatterlist *sg; 1413 struct scatterlist *sg;
1217 int num_sg, num_grant; 1414 int num_sg, num_grant;
1415 struct blkfront_info *info = rinfo->dev_info;
1416 struct blk_shadow *s = &rinfo->shadow[*id];
1218 struct copy_from_grant data = { 1417 struct copy_from_grant data = {
1219 .s = s,
1220 .grant_idx = 0, 1418 .grant_idx = 0,
1221 }; 1419 };
1222 1420
1223 num_grant = s->req.operation == BLKIF_OP_INDIRECT ? 1421 num_grant = s->req.operation == BLKIF_OP_INDIRECT ?
1224 s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments; 1422 s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments;
1423
1424 /* The I/O request may be split in two. */
1425 if (unlikely(s->associated_id != NO_ASSOCIATED_ID)) {
1426 struct blk_shadow *s2 = &rinfo->shadow[s->associated_id];
1427
1428 /* Keep the status of the current response in shadow. */
1429 s->status = blkif_rsp_to_req_status(bret->status);
1430
 1431 /* Wait for the second response if it is not here yet. */
1432 if (s2->status == REQ_WAITING)
1433 return 0;
1434
1435 bret->status = blkif_get_final_status(s->status,
1436 s2->status);
1437
1438 /*
 1439 * All the grants are stored in the first shadow in order
 1440 * to keep the completion code simpler.
1441 */
1442 num_grant += s2->req.u.rw.nr_segments;
1443
1444 /*
1445 * The two responses may not come in order. Only the
1446 * first request will store the scatter-gather list.
1447 */
1448 if (s2->num_sg != 0) {
1449 /* Update "id" with the ID of the first response. */
1450 *id = s->associated_id;
1451 s = s2;
1452 }
1453
1454 /*
 1455 * We don't need the second request anymore, so recycle
 1456 * it now.
1457 */
1458 if (add_id_to_freelist(rinfo, s->associated_id))
1459 WARN(1, "%s: can't recycle the second part (id = %ld) of the request\n",
1460 info->gd->disk_name, s->associated_id);
1461 }
1462
1463 data.s = s;
1225 num_sg = s->num_sg; 1464 num_sg = s->num_sg;
1226 1465
1227 if (bret->operation == BLKIF_OP_READ && info->feature_persistent) { 1466 if (bret->operation == BLKIF_OP_READ && info->feature_persistent) {
@@ -1252,8 +1491,8 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
1252 if (!info->feature_persistent) 1491 if (!info->feature_persistent)
1253 pr_alert_ratelimited("backed has not unmapped grant: %u\n", 1492 pr_alert_ratelimited("backed has not unmapped grant: %u\n",
1254 s->grants_used[i]->gref); 1493 s->grants_used[i]->gref);
1255 list_add(&s->grants_used[i]->node, &info->grants); 1494 list_add(&s->grants_used[i]->node, &rinfo->grants);
1256 info->persistent_gnts_c++; 1495 rinfo->persistent_gnts_c++;
1257 } else { 1496 } else {
1258 /* 1497 /*
1259 * If the grant is not mapped by the backend we end the 1498 * If the grant is not mapped by the backend we end the
@@ -1263,7 +1502,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
1263 */ 1502 */
1264 gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL); 1503 gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL);
1265 s->grants_used[i]->gref = GRANT_INVALID_REF; 1504 s->grants_used[i]->gref = GRANT_INVALID_REF;
1266 list_add_tail(&s->grants_used[i]->node, &info->grants); 1505 list_add_tail(&s->grants_used[i]->node, &rinfo->grants);
1267 } 1506 }
1268 } 1507 }
1269 if (s->req.operation == BLKIF_OP_INDIRECT) { 1508 if (s->req.operation == BLKIF_OP_INDIRECT) {
@@ -1272,8 +1511,8 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
1272 if (!info->feature_persistent) 1511 if (!info->feature_persistent)
1273 pr_alert_ratelimited("backed has not unmapped grant: %u\n", 1512 pr_alert_ratelimited("backed has not unmapped grant: %u\n",
1274 s->indirect_grants[i]->gref); 1513 s->indirect_grants[i]->gref);
1275 list_add(&s->indirect_grants[i]->node, &info->grants); 1514 list_add(&s->indirect_grants[i]->node, &rinfo->grants);
1276 info->persistent_gnts_c++; 1515 rinfo->persistent_gnts_c++;
1277 } else { 1516 } else {
1278 struct page *indirect_page; 1517 struct page *indirect_page;
1279 1518
@@ -1284,13 +1523,15 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
1284 */ 1523 */
1285 if (!info->feature_persistent) { 1524 if (!info->feature_persistent) {
1286 indirect_page = s->indirect_grants[i]->page; 1525 indirect_page = s->indirect_grants[i]->page;
1287 list_add(&indirect_page->lru, &info->indirect_pages); 1526 list_add(&indirect_page->lru, &rinfo->indirect_pages);
1288 } 1527 }
1289 s->indirect_grants[i]->gref = GRANT_INVALID_REF; 1528 s->indirect_grants[i]->gref = GRANT_INVALID_REF;
1290 list_add_tail(&s->indirect_grants[i]->node, &info->grants); 1529 list_add_tail(&s->indirect_grants[i]->node, &rinfo->grants);
1291 } 1530 }
1292 } 1531 }
1293 } 1532 }
1533
1534 return 1;
1294} 1535}
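The associated_id handling above is the heart of split-request completion: each half records its status in the shadow entry, and the request only completes once both halves have responded. The following much-reduced userspace model of that state machine uses hypothetical demo_* names and omits all grant and scatter-gather bookkeeping:

/* Sketch only: defer completion until both halves of a split request arrive. */
#include <stdbool.h>
#include <stdio.h>

#define DEMO_NO_ASSOCIATED_ID (~0UL)

enum demo_status { DEMO_WAITING, DEMO_DONE, DEMO_FAILED };

struct demo_shadow {
	unsigned long associated_id;	/* id of the other half, if split */
	enum demo_status status;
};

/*
 * Returns true when the request owning 'id' can be completed, i.e. either
 * it was never split, or the other half has already reported its status.
 */
static bool demo_completion(struct demo_shadow *shadow, unsigned long id,
			    enum demo_status rsp)
{
	struct demo_shadow *s = &shadow[id];

	if (s->associated_id == DEMO_NO_ASSOCIATED_ID)
		return true;

	s->status = rsp;			/* remember this half */
	if (shadow[s->associated_id].status == DEMO_WAITING)
		return false;			/* wait for the other half */
	return true;
}

int main(void)
{
	struct demo_shadow shadow[2] = {
		{ .associated_id = 1, .status = DEMO_WAITING },
		{ .associated_id = 0, .status = DEMO_WAITING },
	};

	printf("first half alone completes?  %d\n",
	       demo_completion(shadow, 0, DEMO_DONE));	/* 0: still waiting */
	printf("second half arrives, done?   %d\n",
	       demo_completion(shadow, 1, DEMO_DONE));	/* 1: both halves in */
	return 0;
}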
1295 1536
1296static irqreturn_t blkif_interrupt(int irq, void *dev_id) 1537static irqreturn_t blkif_interrupt(int irq, void *dev_id)
@@ -1299,24 +1540,22 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
1299 struct blkif_response *bret; 1540 struct blkif_response *bret;
1300 RING_IDX i, rp; 1541 RING_IDX i, rp;
1301 unsigned long flags; 1542 unsigned long flags;
1302 struct blkfront_info *info = (struct blkfront_info *)dev_id; 1543 struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)dev_id;
1544 struct blkfront_info *info = rinfo->dev_info;
1303 int error; 1545 int error;
1304 1546
1305 spin_lock_irqsave(&info->io_lock, flags); 1547 if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
1306
1307 if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
1308 spin_unlock_irqrestore(&info->io_lock, flags);
1309 return IRQ_HANDLED; 1548 return IRQ_HANDLED;
1310 }
1311 1549
1550 spin_lock_irqsave(&rinfo->ring_lock, flags);
1312 again: 1551 again:
1313 rp = info->ring.sring->rsp_prod; 1552 rp = rinfo->ring.sring->rsp_prod;
1314 rmb(); /* Ensure we see queued responses up to 'rp'. */ 1553 rmb(); /* Ensure we see queued responses up to 'rp'. */
1315 1554
1316 for (i = info->ring.rsp_cons; i != rp; i++) { 1555 for (i = rinfo->ring.rsp_cons; i != rp; i++) {
1317 unsigned long id; 1556 unsigned long id;
1318 1557
1319 bret = RING_GET_RESPONSE(&info->ring, i); 1558 bret = RING_GET_RESPONSE(&rinfo->ring, i);
1320 id = bret->id; 1559 id = bret->id;
1321 /* 1560 /*
1322 * The backend has messed up and given us an id that we would 1561 * The backend has messed up and given us an id that we would
@@ -1330,12 +1569,18 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
1330 * the id is busted. */ 1569 * the id is busted. */
1331 continue; 1570 continue;
1332 } 1571 }
1333 req = info->shadow[id].request; 1572 req = rinfo->shadow[id].request;
1334 1573
1335 if (bret->operation != BLKIF_OP_DISCARD) 1574 if (bret->operation != BLKIF_OP_DISCARD) {
1336 blkif_completion(&info->shadow[id], info, bret); 1575 /*
1576 * We may need to wait for an extra response if the
 1577 * I/O request is split in two.
1578 */
1579 if (!blkif_completion(&id, rinfo, bret))
1580 continue;
1581 }
1337 1582
1338 if (add_id_to_freelist(info, id)) { 1583 if (add_id_to_freelist(rinfo, id)) {
1339 WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n", 1584 WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n",
1340 info->gd->disk_name, op_name(bret->operation), id); 1585 info->gd->disk_name, op_name(bret->operation), id);
1341 continue; 1586 continue;
@@ -1364,7 +1609,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
1364 error = -EOPNOTSUPP; 1609 error = -EOPNOTSUPP;
1365 } 1610 }
1366 if (unlikely(bret->status == BLKIF_RSP_ERROR && 1611 if (unlikely(bret->status == BLKIF_RSP_ERROR &&
1367 info->shadow[id].req.u.rw.nr_segments == 0)) { 1612 rinfo->shadow[id].req.u.rw.nr_segments == 0)) {
1368 printk(KERN_WARNING "blkfront: %s: empty %s op failed\n", 1613 printk(KERN_WARNING "blkfront: %s: empty %s op failed\n",
1369 info->gd->disk_name, op_name(bret->operation)); 1614 info->gd->disk_name, op_name(bret->operation));
1370 error = -EOPNOTSUPP; 1615 error = -EOPNOTSUPP;
@@ -1389,34 +1634,35 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
1389 } 1634 }
1390 } 1635 }
1391 1636
1392 info->ring.rsp_cons = i; 1637 rinfo->ring.rsp_cons = i;
1393 1638
1394 if (i != info->ring.req_prod_pvt) { 1639 if (i != rinfo->ring.req_prod_pvt) {
1395 int more_to_do; 1640 int more_to_do;
1396 RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do); 1641 RING_FINAL_CHECK_FOR_RESPONSES(&rinfo->ring, more_to_do);
1397 if (more_to_do) 1642 if (more_to_do)
1398 goto again; 1643 goto again;
1399 } else 1644 } else
1400 info->ring.sring->rsp_event = i + 1; 1645 rinfo->ring.sring->rsp_event = i + 1;
1401 1646
1402 kick_pending_request_queues(info); 1647 kick_pending_request_queues_locked(rinfo);
1403 1648
1404 spin_unlock_irqrestore(&info->io_lock, flags); 1649 spin_unlock_irqrestore(&rinfo->ring_lock, flags);
1405 1650
1406 return IRQ_HANDLED; 1651 return IRQ_HANDLED;
1407} 1652}
1408 1653
1409 1654
1410static int setup_blkring(struct xenbus_device *dev, 1655static int setup_blkring(struct xenbus_device *dev,
1411 struct blkfront_info *info) 1656 struct blkfront_ring_info *rinfo)
1412{ 1657{
1413 struct blkif_sring *sring; 1658 struct blkif_sring *sring;
1414 int err, i; 1659 int err, i;
1660 struct blkfront_info *info = rinfo->dev_info;
1415 unsigned long ring_size = info->nr_ring_pages * XEN_PAGE_SIZE; 1661 unsigned long ring_size = info->nr_ring_pages * XEN_PAGE_SIZE;
1416 grant_ref_t gref[XENBUS_MAX_RING_GRANTS]; 1662 grant_ref_t gref[XENBUS_MAX_RING_GRANTS];
1417 1663
1418 for (i = 0; i < info->nr_ring_pages; i++) 1664 for (i = 0; i < info->nr_ring_pages; i++)
1419 info->ring_ref[i] = GRANT_INVALID_REF; 1665 rinfo->ring_ref[i] = GRANT_INVALID_REF;
1420 1666
1421 sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH, 1667 sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH,
1422 get_order(ring_size)); 1668 get_order(ring_size));
@@ -1425,29 +1671,29 @@ static int setup_blkring(struct xenbus_device *dev,
1425 return -ENOMEM; 1671 return -ENOMEM;
1426 } 1672 }
1427 SHARED_RING_INIT(sring); 1673 SHARED_RING_INIT(sring);
1428 FRONT_RING_INIT(&info->ring, sring, ring_size); 1674 FRONT_RING_INIT(&rinfo->ring, sring, ring_size);
1429 1675
1430 err = xenbus_grant_ring(dev, info->ring.sring, info->nr_ring_pages, gref); 1676 err = xenbus_grant_ring(dev, rinfo->ring.sring, info->nr_ring_pages, gref);
1431 if (err < 0) { 1677 if (err < 0) {
1432 free_pages((unsigned long)sring, get_order(ring_size)); 1678 free_pages((unsigned long)sring, get_order(ring_size));
1433 info->ring.sring = NULL; 1679 rinfo->ring.sring = NULL;
1434 goto fail; 1680 goto fail;
1435 } 1681 }
1436 for (i = 0; i < info->nr_ring_pages; i++) 1682 for (i = 0; i < info->nr_ring_pages; i++)
1437 info->ring_ref[i] = gref[i]; 1683 rinfo->ring_ref[i] = gref[i];
1438 1684
1439 err = xenbus_alloc_evtchn(dev, &info->evtchn); 1685 err = xenbus_alloc_evtchn(dev, &rinfo->evtchn);
1440 if (err) 1686 if (err)
1441 goto fail; 1687 goto fail;
1442 1688
1443 err = bind_evtchn_to_irqhandler(info->evtchn, blkif_interrupt, 0, 1689 err = bind_evtchn_to_irqhandler(rinfo->evtchn, blkif_interrupt, 0,
1444 "blkif", info); 1690 "blkif", rinfo);
1445 if (err <= 0) { 1691 if (err <= 0) {
1446 xenbus_dev_fatal(dev, err, 1692 xenbus_dev_fatal(dev, err,
1447 "bind_evtchn_to_irqhandler failed"); 1693 "bind_evtchn_to_irqhandler failed");
1448 goto fail; 1694 goto fail;
1449 } 1695 }
1450 info->irq = err; 1696 rinfo->irq = err;
1451 1697
1452 return 0; 1698 return 0;
1453fail: 1699fail:
@@ -1455,6 +1701,53 @@ fail:
1455 return err; 1701 return err;
1456} 1702}
1457 1703
1704/*
1705 * Write out the per-ring/queue nodes, including ring-ref and event-channel;
1706 * each ring buffer may span multiple pages depending on ->nr_ring_pages.
1707 */
1708static int write_per_ring_nodes(struct xenbus_transaction xbt,
1709 struct blkfront_ring_info *rinfo, const char *dir)
1710{
1711 int err;
1712 unsigned int i;
1713 const char *message = NULL;
1714 struct blkfront_info *info = rinfo->dev_info;
1715
1716 if (info->nr_ring_pages == 1) {
1717 err = xenbus_printf(xbt, dir, "ring-ref", "%u", rinfo->ring_ref[0]);
1718 if (err) {
1719 message = "writing ring-ref";
1720 goto abort_transaction;
1721 }
1722 } else {
1723 for (i = 0; i < info->nr_ring_pages; i++) {
1724 char ring_ref_name[RINGREF_NAME_LEN];
1725
1726 snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
1727 err = xenbus_printf(xbt, dir, ring_ref_name,
1728 "%u", rinfo->ring_ref[i]);
1729 if (err) {
1730 message = "writing ring-ref";
1731 goto abort_transaction;
1732 }
1733 }
1734 }
1735
1736 err = xenbus_printf(xbt, dir, "event-channel", "%u", rinfo->evtchn);
1737 if (err) {
1738 message = "writing event-channel";
1739 goto abort_transaction;
1740 }
1741
1742 return 0;
1743
1744abort_transaction:
1745 xenbus_transaction_end(xbt, 1);
1746 if (message)
1747 xenbus_dev_fatal(info->xbdev, err, "%s", message);
1748
1749 return err;
1750}
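To make the resulting xenstore layout concrete, the snippet below only prints the node names that write_per_ring_nodes() and its caller end up writing for a given queue and ring-page count. The printed <ref>/<evtchn> values are placeholders; the key names follow the layout documented in blkif.h at the end of this patch.

/* Sketch only: the xenstore keys written for nr_rings queues. */
#include <stdio.h>

static void demo_print_ring_nodes(const char *nodename,
				  unsigned int nr_rings,
				  unsigned int nr_ring_pages)
{
	unsigned int q, p, order = 0;
	char dir[256];

	while ((1u << order) < nr_ring_pages)
		order++;

	if (nr_ring_pages > 1)
		printf("%s/ring-page-order = %u\n", nodename, order);
	if (nr_rings > 1)
		printf("%s/multi-queue-num-queues = %u\n", nodename, nr_rings);

	for (q = 0; q < nr_rings; q++) {
		/* One queue keeps the old flat layout; more use queue-N dirs. */
		if (nr_rings == 1)
			snprintf(dir, sizeof(dir), "%s", nodename);
		else
			snprintf(dir, sizeof(dir), "%s/queue-%u", nodename, q);

		if (nr_ring_pages == 1)
			printf("%s/ring-ref = <ref>\n", dir);
		else
			for (p = 0; p < nr_ring_pages; p++)
				printf("%s/ring-ref%u = <ref>\n", dir, p);

		printf("%s/event-channel = <evtchn>\n", dir);
	}
}

int main(void)
{
	/* Two queues, two pages per ring, for an illustrative vbd node. */
	demo_print_ring_nodes("/local/domain/1/device/vbd/0", 2, 2);
	return 0;
}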
1458 1751
1459/* Common code used when first setting up, and when resuming. */ 1752/* Common code used when first setting up, and when resuming. */
1460static int talk_to_blkback(struct xenbus_device *dev, 1753static int talk_to_blkback(struct xenbus_device *dev,
@@ -1462,8 +1755,8 @@ static int talk_to_blkback(struct xenbus_device *dev,
1462{ 1755{
1463 const char *message = NULL; 1756 const char *message = NULL;
1464 struct xenbus_transaction xbt; 1757 struct xenbus_transaction xbt;
1465 int err, i; 1758 int err;
1466 unsigned int max_page_order = 0; 1759 unsigned int i, max_page_order = 0;
1467 unsigned int ring_page_order = 0; 1760 unsigned int ring_page_order = 0;
1468 1761
1469 err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, 1762 err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
@@ -1475,10 +1768,14 @@ static int talk_to_blkback(struct xenbus_device *dev,
1475 info->nr_ring_pages = 1 << ring_page_order; 1768 info->nr_ring_pages = 1 << ring_page_order;
1476 } 1769 }
1477 1770
1478 /* Create shared ring, alloc event channel. */ 1771 for (i = 0; i < info->nr_rings; i++) {
1479 err = setup_blkring(dev, info); 1772 struct blkfront_ring_info *rinfo = &info->rinfo[i];
1480 if (err) 1773
1481 goto out; 1774 /* Create shared ring, alloc event channel. */
1775 err = setup_blkring(dev, rinfo);
1776 if (err)
1777 goto destroy_blkring;
1778 }
1482 1779
1483again: 1780again:
1484 err = xenbus_transaction_start(&xbt); 1781 err = xenbus_transaction_start(&xbt);
@@ -1487,38 +1784,49 @@ again:
1487 goto destroy_blkring; 1784 goto destroy_blkring;
1488 } 1785 }
1489 1786
1490 if (info->nr_ring_pages == 1) { 1787 if (info->nr_ring_pages > 1) {
1491 err = xenbus_printf(xbt, dev->nodename, 1788 err = xenbus_printf(xbt, dev->nodename, "ring-page-order", "%u",
1492 "ring-ref", "%u", info->ring_ref[0]); 1789 ring_page_order);
1493 if (err) { 1790 if (err) {
1494 message = "writing ring-ref"; 1791 message = "writing ring-page-order";
1495 goto abort_transaction; 1792 goto abort_transaction;
1496 } 1793 }
1794 }
1795
1796 /* We already got the number of queues/rings in _probe */
1797 if (info->nr_rings == 1) {
1798 err = write_per_ring_nodes(xbt, &info->rinfo[0], dev->nodename);
1799 if (err)
1800 goto destroy_blkring;
1497 } else { 1801 } else {
1498 err = xenbus_printf(xbt, dev->nodename, 1802 char *path;
1499 "ring-page-order", "%u", ring_page_order); 1803 size_t pathsize;
1804
1805 err = xenbus_printf(xbt, dev->nodename, "multi-queue-num-queues", "%u",
1806 info->nr_rings);
1500 if (err) { 1807 if (err) {
1501 message = "writing ring-page-order"; 1808 message = "writing multi-queue-num-queues";
1502 goto abort_transaction; 1809 goto abort_transaction;
1503 } 1810 }
1504 1811
1505 for (i = 0; i < info->nr_ring_pages; i++) { 1812 pathsize = strlen(dev->nodename) + QUEUE_NAME_LEN;
1506 char ring_ref_name[RINGREF_NAME_LEN]; 1813 path = kmalloc(pathsize, GFP_KERNEL);
1814 if (!path) {
1815 err = -ENOMEM;
1816 message = "ENOMEM while writing ring references";
1817 goto abort_transaction;
1818 }
1507 1819
1508 snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); 1820 for (i = 0; i < info->nr_rings; i++) {
1509 err = xenbus_printf(xbt, dev->nodename, ring_ref_name, 1821 memset(path, 0, pathsize);
1510 "%u", info->ring_ref[i]); 1822 snprintf(path, pathsize, "%s/queue-%u", dev->nodename, i);
1823 err = write_per_ring_nodes(xbt, &info->rinfo[i], path);
1511 if (err) { 1824 if (err) {
1512 message = "writing ring-ref"; 1825 kfree(path);
1513 goto abort_transaction; 1826 goto destroy_blkring;
1514 } 1827 }
1515 } 1828 }
1516 } 1829 kfree(path);
1517 err = xenbus_printf(xbt, dev->nodename,
1518 "event-channel", "%u", info->evtchn);
1519 if (err) {
1520 message = "writing event-channel";
1521 goto abort_transaction;
1522 } 1830 }
1523 err = xenbus_printf(xbt, dev->nodename, "protocol", "%s", 1831 err = xenbus_printf(xbt, dev->nodename, "protocol", "%s",
1524 XEN_IO_PROTO_ABI_NATIVE); 1832 XEN_IO_PROTO_ABI_NATIVE);
@@ -1540,9 +1848,14 @@ again:
1540 goto destroy_blkring; 1848 goto destroy_blkring;
1541 } 1849 }
1542 1850
1543 for (i = 0; i < BLK_RING_SIZE(info); i++) 1851 for (i = 0; i < info->nr_rings; i++) {
1544 info->shadow[i].req.u.rw.id = i+1; 1852 unsigned int j;
1545 info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; 1853 struct blkfront_ring_info *rinfo = &info->rinfo[i];
1854
1855 for (j = 0; j < BLK_RING_SIZE(info); j++)
1856 rinfo->shadow[j].req.u.rw.id = j + 1;
1857 rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
1858 }
1546 xenbus_switch_state(dev, XenbusStateInitialised); 1859 xenbus_switch_state(dev, XenbusStateInitialised);
1547 1860
1548 return 0; 1861 return 0;
@@ -1553,7 +1866,10 @@ again:
1553 xenbus_dev_fatal(dev, err, "%s", message); 1866 xenbus_dev_fatal(dev, err, "%s", message);
1554 destroy_blkring: 1867 destroy_blkring:
1555 blkif_free(info, 0); 1868 blkif_free(info, 0);
1556 out: 1869
1870 kfree(info);
1871 dev_set_drvdata(&dev->dev, NULL);
1872
1557 return err; 1873 return err;
1558} 1874}
1559 1875
@@ -1567,7 +1883,9 @@ static int blkfront_probe(struct xenbus_device *dev,
1567 const struct xenbus_device_id *id) 1883 const struct xenbus_device_id *id)
1568{ 1884{
1569 int err, vdevice; 1885 int err, vdevice;
1886 unsigned int r_index;
1570 struct blkfront_info *info; 1887 struct blkfront_info *info;
1888 unsigned int backend_max_queues = 0;
1571 1889
1572 /* FIXME: Use dynamic device id if this is not set. */ 1890 /* FIXME: Use dynamic device id if this is not set. */
1573 err = xenbus_scanf(XBT_NIL, dev->nodename, 1891 err = xenbus_scanf(XBT_NIL, dev->nodename,
@@ -1617,15 +1935,39 @@ static int blkfront_probe(struct xenbus_device *dev,
1617 return -ENOMEM; 1935 return -ENOMEM;
1618 } 1936 }
1619 1937
1620 mutex_init(&info->mutex);
1621 spin_lock_init(&info->io_lock);
1622 info->xbdev = dev; 1938 info->xbdev = dev;
1939 /* Check if backend supports multiple queues. */
1940 err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
1941 "multi-queue-max-queues", "%u", &backend_max_queues);
1942 if (err < 0)
1943 backend_max_queues = 1;
1944
1945 info->nr_rings = min(backend_max_queues, xen_blkif_max_queues);
1946 /* We need at least one ring. */
1947 if (!info->nr_rings)
1948 info->nr_rings = 1;
1949
1950 info->rinfo = kzalloc(sizeof(struct blkfront_ring_info) * info->nr_rings, GFP_KERNEL);
1951 if (!info->rinfo) {
1952 xenbus_dev_fatal(dev, -ENOMEM, "allocating ring_info structure");
1953 kfree(info);
1954 return -ENOMEM;
1955 }
1956
1957 for (r_index = 0; r_index < info->nr_rings; r_index++) {
1958 struct blkfront_ring_info *rinfo;
1959
1960 rinfo = &info->rinfo[r_index];
1961 INIT_LIST_HEAD(&rinfo->indirect_pages);
1962 INIT_LIST_HEAD(&rinfo->grants);
1963 rinfo->dev_info = info;
1964 INIT_WORK(&rinfo->work, blkif_restart_queue);
1965 spin_lock_init(&rinfo->ring_lock);
1966 }
1967
1968 mutex_init(&info->mutex);
1623 info->vdevice = vdevice; 1969 info->vdevice = vdevice;
1624 INIT_LIST_HEAD(&info->grants);
1625 INIT_LIST_HEAD(&info->indirect_pages);
1626 info->persistent_gnts_c = 0;
1627 info->connected = BLKIF_STATE_DISCONNECTED; 1970 info->connected = BLKIF_STATE_DISCONNECTED;
1628 INIT_WORK(&info->work, blkif_restart_queue);
1629 1971
1630 /* Front end dir is a number, which is used as the id. */ 1972 /* Front end dir is a number, which is used as the id. */
1631 info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0); 1973 info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
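The probe path above negotiates the ring count: the minimum of the backend's multi-queue-max-queues and the frontend's max_queues module parameter, with a floor of one ring (and xlblk_init(), further down, caps the module parameter at the number of online CPUs). A tiny arithmetic sketch of that negotiation with made-up inputs:

/* Sketch only: how the number of rings per device is chosen. */
#include <stdio.h>

static unsigned int demo_nr_rings(unsigned int backend_max_queues,
				  unsigned int frontend_max_queues,
				  unsigned int nr_online_cpus)
{
	unsigned int nr;

	/* Module-parameter clamp done once at init time. */
	if (frontend_max_queues > nr_online_cpus)
		frontend_max_queues = nr_online_cpus;

	/* Per-device negotiation done at probe time. */
	nr = backend_max_queues < frontend_max_queues ?
	     backend_max_queues : frontend_max_queues;

	/* We always need at least one ring. */
	return nr ? nr : 1;
}

int main(void)
{
	/* Backend offers 8 queues, user asked for 16, only 4 CPUs online. */
	printf("negotiated rings: %u\n", demo_nr_rings(8, 16, 4));	/* -> 4 */
	/* Degenerate input: a zero from either side still yields one ring. */
	printf("negotiated rings: %u\n", demo_nr_rings(8, 0, 4));	/* -> 1 */
	return 0;
}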
@@ -1649,7 +1991,7 @@ static void split_bio_end(struct bio *bio)
1649 1991
1650static int blkif_recover(struct blkfront_info *info) 1992static int blkif_recover(struct blkfront_info *info)
1651{ 1993{
1652 int i; 1994 unsigned int i, r_index;
1653 struct request *req, *n; 1995 struct request *req, *n;
1654 struct blk_shadow *copy; 1996 struct blk_shadow *copy;
1655 int rc; 1997 int rc;
@@ -1660,64 +2002,73 @@ static int blkif_recover(struct blkfront_info *info)
1660 struct split_bio *split_bio; 2002 struct split_bio *split_bio;
1661 struct list_head requests; 2003 struct list_head requests;
1662 2004
1663 /* Stage 1: Make a safe copy of the shadow state. */ 2005 blkfront_gather_backend_features(info);
1664 copy = kmemdup(info->shadow, sizeof(info->shadow),
1665 GFP_NOIO | __GFP_REPEAT | __GFP_HIGH);
1666 if (!copy)
1667 return -ENOMEM;
1668
1669 /* Stage 2: Set up free list. */
1670 memset(&info->shadow, 0, sizeof(info->shadow));
1671 for (i = 0; i < BLK_RING_SIZE(info); i++)
1672 info->shadow[i].req.u.rw.id = i+1;
1673 info->shadow_free = info->ring.req_prod_pvt;
1674 info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
1675
1676 rc = blkfront_gather_backend_features(info);
1677 if (rc) {
1678 kfree(copy);
1679 return rc;
1680 }
1681
1682 segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST; 2006 segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST;
1683 blk_queue_max_segments(info->rq, segs); 2007 blk_queue_max_segments(info->rq, segs);
1684 bio_list_init(&bio_list); 2008 bio_list_init(&bio_list);
1685 INIT_LIST_HEAD(&requests); 2009 INIT_LIST_HEAD(&requests);
1686 for (i = 0; i < BLK_RING_SIZE(info); i++) {
1687 /* Not in use? */
1688 if (!copy[i].request)
1689 continue;
1690 2010
1691 /* 2011 for (r_index = 0; r_index < info->nr_rings; r_index++) {
1692 * Get the bios in the request so we can re-queue them. 2012 struct blkfront_ring_info *rinfo;
1693 */ 2013
1694 if (copy[i].request->cmd_flags & 2014 rinfo = &info->rinfo[r_index];
1695 (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) { 2015 /* Stage 1: Make a safe copy of the shadow state. */
2016 copy = kmemdup(rinfo->shadow, sizeof(rinfo->shadow),
2017 GFP_NOIO | __GFP_REPEAT | __GFP_HIGH);
2018 if (!copy)
2019 return -ENOMEM;
2020
2021 /* Stage 2: Set up free list. */
2022 memset(&rinfo->shadow, 0, sizeof(rinfo->shadow));
2023 for (i = 0; i < BLK_RING_SIZE(info); i++)
2024 rinfo->shadow[i].req.u.rw.id = i+1;
2025 rinfo->shadow_free = rinfo->ring.req_prod_pvt;
2026 rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
2027
2028 rc = blkfront_setup_indirect(rinfo);
2029 if (rc) {
2030 kfree(copy);
2031 return rc;
2032 }
2033
2034 for (i = 0; i < BLK_RING_SIZE(info); i++) {
2035 /* Not in use? */
2036 if (!copy[i].request)
2037 continue;
2038
1696 /* 2039 /*
1697 * Flush operations don't contain bios, so 2040 * Get the bios in the request so we can re-queue them.
1698 * we need to requeue the whole request
1699 */ 2041 */
1700 list_add(&copy[i].request->queuelist, &requests); 2042 if (copy[i].request->cmd_flags &
1701 continue; 2043 (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
2044 /*
2045 * Flush operations don't contain bios, so
2046 * we need to requeue the whole request
2047 */
2048 list_add(&copy[i].request->queuelist, &requests);
2049 continue;
2050 }
2051 merge_bio.head = copy[i].request->bio;
2052 merge_bio.tail = copy[i].request->biotail;
2053 bio_list_merge(&bio_list, &merge_bio);
2054 copy[i].request->bio = NULL;
2055 blk_end_request_all(copy[i].request, 0);
1702 } 2056 }
1703 merge_bio.head = copy[i].request->bio;
1704 merge_bio.tail = copy[i].request->biotail;
1705 bio_list_merge(&bio_list, &merge_bio);
1706 copy[i].request->bio = NULL;
1707 blk_end_request_all(copy[i].request, 0);
1708 }
1709
1710 kfree(copy);
1711 2057
2058 kfree(copy);
2059 }
1712 xenbus_switch_state(info->xbdev, XenbusStateConnected); 2060 xenbus_switch_state(info->xbdev, XenbusStateConnected);
1713 2061
1714 spin_lock_irq(&info->io_lock);
1715
1716 /* Now safe for us to use the shared ring */ 2062 /* Now safe for us to use the shared ring */
1717 info->connected = BLKIF_STATE_CONNECTED; 2063 info->connected = BLKIF_STATE_CONNECTED;
1718 2064
1719 /* Kick any other new requests queued since we resumed */ 2065 for (r_index = 0; r_index < info->nr_rings; r_index++) {
1720 kick_pending_request_queues(info); 2066 struct blkfront_ring_info *rinfo;
2067
2068 rinfo = &info->rinfo[r_index];
2069 /* Kick any other new requests queued since we resumed */
2070 kick_pending_request_queues(rinfo);
2071 }
1721 2072
1722 list_for_each_entry_safe(req, n, &requests, queuelist) { 2073 list_for_each_entry_safe(req, n, &requests, queuelist) {
1723 /* Requeue pending requests (flush or discard) */ 2074 /* Requeue pending requests (flush or discard) */
@@ -1725,7 +2076,6 @@ static int blkif_recover(struct blkfront_info *info)
1725 BUG_ON(req->nr_phys_segments > segs); 2076 BUG_ON(req->nr_phys_segments > segs);
1726 blk_mq_requeue_request(req); 2077 blk_mq_requeue_request(req);
1727 } 2078 }
1728 spin_unlock_irq(&info->io_lock);
1729 blk_mq_kick_requeue_list(info->rq); 2079 blk_mq_kick_requeue_list(info->rq);
1730 2080
1731 while ((bio = bio_list_pop(&bio_list)) != NULL) { 2081 while ((bio = bio_list_pop(&bio_list)) != NULL) {
@@ -1790,8 +2140,7 @@ static int blkfront_resume(struct xenbus_device *dev)
1790 return err; 2140 return err;
1791} 2141}
1792 2142
1793static void 2143static void blkfront_closing(struct blkfront_info *info)
1794blkfront_closing(struct blkfront_info *info)
1795{ 2144{
1796 struct xenbus_device *xbdev = info->xbdev; 2145 struct xenbus_device *xbdev = info->xbdev;
1797 struct block_device *bdev = NULL; 2146 struct block_device *bdev = NULL;
@@ -1851,18 +2200,29 @@ static void blkfront_setup_discard(struct blkfront_info *info)
1851 info->feature_secdiscard = !!discard_secure; 2200 info->feature_secdiscard = !!discard_secure;
1852} 2201}
1853 2202
1854static int blkfront_setup_indirect(struct blkfront_info *info) 2203static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo)
1855{ 2204{
1856 unsigned int psegs, grants; 2205 unsigned int psegs, grants;
1857 int err, i; 2206 int err, i;
2207 struct blkfront_info *info = rinfo->dev_info;
1858 2208
1859 if (info->max_indirect_segments == 0) 2209 if (info->max_indirect_segments == 0) {
1860 grants = BLKIF_MAX_SEGMENTS_PER_REQUEST; 2210 if (!HAS_EXTRA_REQ)
2211 grants = BLKIF_MAX_SEGMENTS_PER_REQUEST;
2212 else {
2213 /*
 2214 * When an extra request is required, the maximum
 2215 * number of grants supported is bounded by the size
 2216 * of a Linux block segment.
2217 */
2218 grants = GRANTS_PER_PSEG;
2219 }
2220 }
1861 else 2221 else
1862 grants = info->max_indirect_segments; 2222 grants = info->max_indirect_segments;
1863 psegs = grants / GRANTS_PER_PSEG; 2223 psegs = grants / GRANTS_PER_PSEG;
1864 2224
1865 err = fill_grant_buffer(info, 2225 err = fill_grant_buffer(rinfo,
1866 (grants + INDIRECT_GREFS(grants)) * BLK_RING_SIZE(info)); 2226 (grants + INDIRECT_GREFS(grants)) * BLK_RING_SIZE(info));
1867 if (err) 2227 if (err)
1868 goto out_of_memory; 2228 goto out_of_memory;
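The grants calculation above fixes how many grant references one request may carry, and psegs is the number of Linux block segments that fit into them. A worked example follows; the value 11 for BLKIF_MAX_SEGMENTS_PER_REQUEST and the 16-grants-per-segment figure for a 64KB-page guest are assumptions chosen for illustration, not values taken from this hunk.

/* Sketch only: grants and block segments per request. */
#include <stdio.h>

static void demo_grants(unsigned int max_indirect_segments, int has_extra_req,
			unsigned int grants_per_pseg)
{
	const unsigned int max_segs_per_req = 11;	/* assumed value */
	unsigned int grants, psegs;

	if (max_indirect_segments == 0)
		grants = has_extra_req ? grants_per_pseg : max_segs_per_req;
	else
		grants = max_indirect_segments;

	/* Block-layer segments that fit in those grants. */
	psegs = grants / grants_per_pseg;

	printf("indirect=%u extra_req=%d -> grants=%u psegs=%u\n",
	       max_indirect_segments, has_extra_req, grants, psegs);
}

int main(void)
{
	demo_grants(0, 0, 1);	/* 4KB-page guest, no indirect descriptors */
	demo_grants(0, 1, 16);	/* 64KB-page guest: one segment needs 16 grants */
	demo_grants(256, 0, 1);	/* backend supports 256 indirect segments */
	return 0;
}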
@@ -1875,31 +2235,31 @@ static int blkfront_setup_indirect(struct blkfront_info *info)
1875 */ 2235 */
1876 int num = INDIRECT_GREFS(grants) * BLK_RING_SIZE(info); 2236 int num = INDIRECT_GREFS(grants) * BLK_RING_SIZE(info);
1877 2237
1878 BUG_ON(!list_empty(&info->indirect_pages)); 2238 BUG_ON(!list_empty(&rinfo->indirect_pages));
1879 for (i = 0; i < num; i++) { 2239 for (i = 0; i < num; i++) {
1880 struct page *indirect_page = alloc_page(GFP_NOIO); 2240 struct page *indirect_page = alloc_page(GFP_NOIO);
1881 if (!indirect_page) 2241 if (!indirect_page)
1882 goto out_of_memory; 2242 goto out_of_memory;
1883 list_add(&indirect_page->lru, &info->indirect_pages); 2243 list_add(&indirect_page->lru, &rinfo->indirect_pages);
1884 } 2244 }
1885 } 2245 }
1886 2246
1887 for (i = 0; i < BLK_RING_SIZE(info); i++) { 2247 for (i = 0; i < BLK_RING_SIZE(info); i++) {
1888 info->shadow[i].grants_used = kzalloc( 2248 rinfo->shadow[i].grants_used = kzalloc(
1889 sizeof(info->shadow[i].grants_used[0]) * grants, 2249 sizeof(rinfo->shadow[i].grants_used[0]) * grants,
1890 GFP_NOIO); 2250 GFP_NOIO);
1891 info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * psegs, GFP_NOIO); 2251 rinfo->shadow[i].sg = kzalloc(sizeof(rinfo->shadow[i].sg[0]) * psegs, GFP_NOIO);
1892 if (info->max_indirect_segments) 2252 if (info->max_indirect_segments)
1893 info->shadow[i].indirect_grants = kzalloc( 2253 rinfo->shadow[i].indirect_grants = kzalloc(
1894 sizeof(info->shadow[i].indirect_grants[0]) * 2254 sizeof(rinfo->shadow[i].indirect_grants[0]) *
1895 INDIRECT_GREFS(grants), 2255 INDIRECT_GREFS(grants),
1896 GFP_NOIO); 2256 GFP_NOIO);
1897 if ((info->shadow[i].grants_used == NULL) || 2257 if ((rinfo->shadow[i].grants_used == NULL) ||
1898 (info->shadow[i].sg == NULL) || 2258 (rinfo->shadow[i].sg == NULL) ||
1899 (info->max_indirect_segments && 2259 (info->max_indirect_segments &&
1900 (info->shadow[i].indirect_grants == NULL))) 2260 (rinfo->shadow[i].indirect_grants == NULL)))
1901 goto out_of_memory; 2261 goto out_of_memory;
1902 sg_init_table(info->shadow[i].sg, psegs); 2262 sg_init_table(rinfo->shadow[i].sg, psegs);
1903 } 2263 }
1904 2264
1905 2265
@@ -1907,16 +2267,16 @@ static int blkfront_setup_indirect(struct blkfront_info *info)
1907 2267
1908out_of_memory: 2268out_of_memory:
1909 for (i = 0; i < BLK_RING_SIZE(info); i++) { 2269 for (i = 0; i < BLK_RING_SIZE(info); i++) {
1910 kfree(info->shadow[i].grants_used); 2270 kfree(rinfo->shadow[i].grants_used);
1911 info->shadow[i].grants_used = NULL; 2271 rinfo->shadow[i].grants_used = NULL;
1912 kfree(info->shadow[i].sg); 2272 kfree(rinfo->shadow[i].sg);
1913 info->shadow[i].sg = NULL; 2273 rinfo->shadow[i].sg = NULL;
1914 kfree(info->shadow[i].indirect_grants); 2274 kfree(rinfo->shadow[i].indirect_grants);
1915 info->shadow[i].indirect_grants = NULL; 2275 rinfo->shadow[i].indirect_grants = NULL;
1916 } 2276 }
1917 if (!list_empty(&info->indirect_pages)) { 2277 if (!list_empty(&rinfo->indirect_pages)) {
1918 struct page *indirect_page, *n; 2278 struct page *indirect_page, *n;
1919 list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) { 2279 list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) {
1920 list_del(&indirect_page->lru); 2280 list_del(&indirect_page->lru);
1921 __free_page(indirect_page); 2281 __free_page(indirect_page);
1922 } 2282 }
@@ -1927,7 +2287,7 @@ out_of_memory:
1927/* 2287/*
1928 * Gather all backend feature-* 2288 * Gather all backend feature-*
1929 */ 2289 */
1930static int blkfront_gather_backend_features(struct blkfront_info *info) 2290static void blkfront_gather_backend_features(struct blkfront_info *info)
1931{ 2291{
1932 int err; 2292 int err;
1933 int barrier, flush, discard, persistent; 2293 int barrier, flush, discard, persistent;
@@ -1982,8 +2342,6 @@ static int blkfront_gather_backend_features(struct blkfront_info *info)
1982 else 2342 else
1983 info->max_indirect_segments = min(indirect_segments, 2343 info->max_indirect_segments = min(indirect_segments,
1984 xen_blkif_max_segments); 2344 xen_blkif_max_segments);
1985
1986 return blkfront_setup_indirect(info);
1987} 2345}
1988 2346
1989/* 2347/*
@@ -1996,7 +2354,7 @@ static void blkfront_connect(struct blkfront_info *info)
1996 unsigned long sector_size; 2354 unsigned long sector_size;
1997 unsigned int physical_sector_size; 2355 unsigned int physical_sector_size;
1998 unsigned int binfo; 2356 unsigned int binfo;
1999 int err; 2357 int err, i;
2000 2358
2001 switch (info->connected) { 2359 switch (info->connected) {
2002 case BLKIF_STATE_CONNECTED: 2360 case BLKIF_STATE_CONNECTED:
@@ -2053,11 +2411,15 @@ static void blkfront_connect(struct blkfront_info *info)
2053 if (err != 1) 2411 if (err != 1)
2054 physical_sector_size = sector_size; 2412 physical_sector_size = sector_size;
2055 2413
2056 err = blkfront_gather_backend_features(info); 2414 blkfront_gather_backend_features(info);
2057 if (err) { 2415 for (i = 0; i < info->nr_rings; i++) {
2058 xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s", 2416 err = blkfront_setup_indirect(&info->rinfo[i]);
2059 info->xbdev->otherend); 2417 if (err) {
2060 return; 2418 xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s",
2419 info->xbdev->otherend);
2420 blkif_free(info, 0);
2421 break;
2422 }
2061 } 2423 }
2062 2424
2063 err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size, 2425 err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size,
@@ -2071,10 +2433,9 @@ static void blkfront_connect(struct blkfront_info *info)
2071 xenbus_switch_state(info->xbdev, XenbusStateConnected); 2433 xenbus_switch_state(info->xbdev, XenbusStateConnected);
2072 2434
2073 /* Kick pending requests. */ 2435 /* Kick pending requests. */
2074 spin_lock_irq(&info->io_lock);
2075 info->connected = BLKIF_STATE_CONNECTED; 2436 info->connected = BLKIF_STATE_CONNECTED;
2076 kick_pending_request_queues(info); 2437 for (i = 0; i < info->nr_rings; i++)
2077 spin_unlock_irq(&info->io_lock); 2438 kick_pending_request_queues(&info->rinfo[i]);
2078 2439
2079 add_disk(info->gd); 2440 add_disk(info->gd);
2080 2441
@@ -2095,11 +2456,8 @@ static void blkback_changed(struct xenbus_device *dev,
2095 case XenbusStateInitWait: 2456 case XenbusStateInitWait:
2096 if (dev->state != XenbusStateInitialising) 2457 if (dev->state != XenbusStateInitialising)
2097 break; 2458 break;
2098 if (talk_to_blkback(dev, info)) { 2459 if (talk_to_blkback(dev, info))
2099 kfree(info);
2100 dev_set_drvdata(&dev->dev, NULL);
2101 break; 2460 break;
2102 }
2103 case XenbusStateInitialising: 2461 case XenbusStateInitialising:
2104 case XenbusStateInitialised: 2462 case XenbusStateInitialised:
2105 case XenbusStateReconfiguring: 2463 case XenbusStateReconfiguring:
@@ -2108,6 +2466,10 @@ static void blkback_changed(struct xenbus_device *dev,
2108 break; 2466 break;
2109 2467
2110 case XenbusStateConnected: 2468 case XenbusStateConnected:
2469 if (dev->state != XenbusStateInitialised) {
2470 if (talk_to_blkback(dev, info))
2471 break;
2472 }
2111 blkfront_connect(info); 2473 blkfront_connect(info);
2112 break; 2474 break;
2113 2475
@@ -2281,6 +2643,7 @@ static struct xenbus_driver blkfront_driver = {
2281static int __init xlblk_init(void) 2643static int __init xlblk_init(void)
2282{ 2644{
2283 int ret; 2645 int ret;
2646 int nr_cpus = num_online_cpus();
2284 2647
2285 if (!xen_domain()) 2648 if (!xen_domain())
2286 return -ENODEV; 2649 return -ENODEV;
@@ -2288,7 +2651,13 @@ static int __init xlblk_init(void)
2288 if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) { 2651 if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) {
2289 pr_info("Invalid max_ring_order (%d), will use default max: %d.\n", 2652 pr_info("Invalid max_ring_order (%d), will use default max: %d.\n",
2290 xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER); 2653 xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER);
2291 xen_blkif_max_ring_order = 0; 2654 xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
2655 }
2656
2657 if (xen_blkif_max_queues > nr_cpus) {
2658 pr_info("Invalid max_queues (%d), will use default max: %d.\n",
2659 xen_blkif_max_queues, nr_cpus);
2660 xen_blkif_max_queues = nr_cpus;
2292 } 2661 }
2293 2662
2294 if (!xen_has_pv_disk_devices()) 2663 if (!xen_has_pv_disk_devices())
diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h
index c33e1c489eb2..8b8cfadf7833 100644
--- a/include/xen/interface/io/blkif.h
+++ b/include/xen/interface/io/blkif.h
@@ -28,6 +28,54 @@ typedef uint16_t blkif_vdev_t;
28typedef uint64_t blkif_sector_t; 28typedef uint64_t blkif_sector_t;
29 29
30/* 30/*
31 * Multiple hardware queues/rings:
32 * If supported, the backend will write the key "multi-queue-max-queues" to
33 * the directory for that vbd, and set its value to the maximum supported
34 * number of queues.
35 * Frontends that are aware of this feature and wish to use it can write the
36 * key "multi-queue-num-queues" with the number they wish to use, which must be
37 * greater than zero, and no more than the value reported by the backend in
38 * "multi-queue-max-queues".
39 *
40 * For frontends requesting just one queue, the usual event-channel and
41 * ring-ref keys are written as before, simplifying the backend processing
42 * to avoid distinguishing between a frontend that doesn't understand the
43 * multi-queue feature, and one that does, but requested only one queue.
44 *
45 * Frontends requesting two or more queues must not write the toplevel
46 * event-channel and ring-ref keys, instead writing those keys under sub-keys
 47 * having the name "queue-N", where N is the integer ID of the queue/ring to
 48 * which those keys belong. Queues are indexed from zero.
49 * For example, a frontend with two queues must write the following set of
50 * queue-related keys:
51 *
52 * /local/domain/1/device/vbd/0/multi-queue-num-queues = "2"
53 * /local/domain/1/device/vbd/0/queue-0 = ""
54 * /local/domain/1/device/vbd/0/queue-0/ring-ref = "<ring-ref#0>"
55 * /local/domain/1/device/vbd/0/queue-0/event-channel = "<evtchn#0>"
56 * /local/domain/1/device/vbd/0/queue-1 = ""
57 * /local/domain/1/device/vbd/0/queue-1/ring-ref = "<ring-ref#1>"
58 * /local/domain/1/device/vbd/0/queue-1/event-channel = "<evtchn#1>"
59 *
 60 * It is also possible to use multiple queues/rings together with
 61 * the multi-page ring buffer feature.
 62 * For example, a frontend that requests two queues/rings, each with a
 63 * two-page ring buffer, must write the following set of related keys:
64 *
65 * /local/domain/1/device/vbd/0/multi-queue-num-queues = "2"
66 * /local/domain/1/device/vbd/0/ring-page-order = "1"
67 * /local/domain/1/device/vbd/0/queue-0 = ""
68 * /local/domain/1/device/vbd/0/queue-0/ring-ref0 = "<ring-ref#0>"
69 * /local/domain/1/device/vbd/0/queue-0/ring-ref1 = "<ring-ref#1>"
70 * /local/domain/1/device/vbd/0/queue-0/event-channel = "<evtchn#0>"
71 * /local/domain/1/device/vbd/0/queue-1 = ""
72 * /local/domain/1/device/vbd/0/queue-1/ring-ref0 = "<ring-ref#2>"
73 * /local/domain/1/device/vbd/0/queue-1/ring-ref1 = "<ring-ref#3>"
74 * /local/domain/1/device/vbd/0/queue-1/event-channel = "<evtchn#1>"
75 *
76 */
77
78/*
31 * REQUEST CODES. 79 * REQUEST CODES.
32 */ 80 */
33#define BLKIF_OP_READ 0 81#define BLKIF_OP_READ 0