path: root/drivers/block/xen-blkfront.c
author		Jens Axboe <axboe@kernel.dk>	2013-06-28 10:01:14 -0400
committer	Jens Axboe <axboe@kernel.dk>	2013-06-28 10:01:14 -0400
commit		f35546e072a7a86ccb950d4d1508879b0d49e374 (patch)
tree		7e581edae814482ac4cfb00b9aea2e76ebe05f6d	/drivers/block/xen-blkfront.c
parent		36f988e978f81ffa415df4d77bbcd8887917f25c (diff)
parent		1e0f7a21b2fffc70f27cc4a454c60321501045b1 (diff)
Merge branch 'stable/for-jens-3.10' of git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen into for-3.11/drivers
Konrad writes:

It has the 'feature-max-indirect-segments' implemented in both backend and frontend. The current problem with the backend and frontend is that the segment size is limited to 11 pages. It means we can at most squeeze in 44kB per request. The ring can hold 32 (next power of two below 36) requests, meaning we can have about 1.4MB of I/O outstanding. Nowadays that is not enough.

The problem in the past was addressed in two ways - but neither one went upstream.

The first solution, proposed by Justin from Spectralogic, was to negotiate the segment size. This means that the 'struct blkif_sring_entry' becomes variable in size. It can expand from 112 bytes (covering 11 pages of data - 44kB) to 1580 bytes (256 pages of data - so 1MB). It is a simple extension: the array in the request grows from 11 entries to whatever size was negotiated. But it had limits: this extension still caps the number of segments per request at 255, as the total number must be specified in the request, which only has an 8-bit field for that purpose.

The other solution (from Intel - Ronghui) was to create one extra ring that only holds 'struct blkif_request_segment' entries. The 'struct blkif_request' would be changed to carry an index into said 'segment ring'. There is only one segment ring, so the size of the initial ring stays the same. A request points into the segment ring and enumerates how many of the indexes it wants to use. The limit is of course the size of the segment ring. If one assumes a one-page segment ring, a single request can cover ~4MB. Those patches were posted as RFC and the author never followed up on ideas for making it a bit more flexible.

There is yet another mechanism that could be employed (which these patches implement) - and it borrows from the VirtIO protocol. That is the 'indirect descriptors'. This is very similar to what Intel suggested, but with a twist. The twist is to negotiate how many of these 'segment' pages (aka indirect descriptor pages) we want to support (in reality we negotiate how many entries in the segment page we want to cover, and we cap the number if it is bigger than the segment page size).

This means that with the existing 36 slots in the ring (single page) we can cover: 32 slots * (512 segments per blkif_request_indirect * 4096 bytes) ~= 64MB. Since we have ample space in the blkif_request_indirect to span more than one indirect page, that number (64MB) can also be multiplied by eight = 512MB.

Roger Pau Monne took the idea and implemented it in these patches. They work great and the corner cases (migration between backends with and without this extension) work nicely. The backend has a limit right now on how many indirect entries it can handle: one indirect page, and at maximum 256 entries (out of 512 - so 50% of the page is used). That comes out to 32 slots * 256 entries in an indirect page * 1 indirect page per request * 4096 = 32MB. This is a conservative number that can change in the future. Right now it strikes a good balance between giving excellent performance, memory usage in the backend, and the needs of many guests.

In the patchset there is also the split of the blkback structure to be per-VBD. This means that the spinlock contention we had with many guests trying to do I/O and all the blkback threads hitting the same lock has been eliminated.

Also there are bug-fixes to deal with oddly sized sectors, insane amounts on the ring, and also a security fix (posted earlier).
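For illustration only (not part of the patch set), the in-flight I/O limits quoted above can be reproduced with a short calculation; the constants mirror the figures in the message:

	/* Sketch: reproduce the capacity arithmetic from the commit message. */
	#include <stdio.h>

	int main(void)
	{
		const unsigned long page          = 4096; /* bytes per segment (one page) */
		const unsigned long ring_slots    = 32;   /* requests a single-page ring can hold */
		const unsigned long direct_segs   = 11;   /* BLKIF_MAX_SEGMENTS_PER_REQUEST */
		const unsigned long indirect_segs = 512;  /* aligned segments in one indirect page */
		const unsigned long backend_segs  = 256;  /* conservative per-request backend limit */

		printf("direct:   %lu KB in flight\n", ring_slots * direct_segs * page / 1024);             /* ~1.4MB */
		printf("indirect: %lu MB in flight\n", ring_slots * indirect_segs * page / (1024 * 1024));  /* 64MB */
		printf("backend:  %lu MB in flight\n", ring_slots * backend_segs * page / (1024 * 1024));   /* 32MB */
		return 0;
	}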
Diffstat (limited to 'drivers/block/xen-blkfront.c')
-rw-r--r--	drivers/block/xen-blkfront.c	532
1 file changed, 432 insertions(+), 100 deletions(-)
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index d89ef86220f4..a4660bbee8a6 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -74,12 +74,30 @@ struct grant {
 struct blk_shadow {
 	struct blkif_request req;
 	struct request *request;
-	struct grant *grants_used[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct grant **grants_used;
+	struct grant **indirect_grants;
+	struct scatterlist *sg;
+};
+
+struct split_bio {
+	struct bio *bio;
+	atomic_t pending;
+	int err;
 };
 
 static DEFINE_MUTEX(blkfront_mutex);
 static const struct block_device_operations xlvbd_block_fops;
 
+/*
+ * Maximum number of segments in indirect requests, the actual value used by
+ * the frontend driver is the minimum of this value and the value provided
+ * by the backend driver.
+ */
+
+static unsigned int xen_blkif_max_segments = 32;
+module_param_named(max, xen_blkif_max_segments, int, S_IRUGO);
+MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)");
+
 #define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
 
 /*
@@ -98,7 +116,6 @@ struct blkfront_info
 	enum blkif_state connected;
 	int ring_ref;
 	struct blkif_front_ring ring;
-	struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 	unsigned int evtchn, irq;
 	struct request_queue *rq;
 	struct work_struct work;
@@ -114,6 +131,7 @@ struct blkfront_info
 	unsigned int discard_granularity;
 	unsigned int discard_alignment;
 	unsigned int feature_persistent:1;
+	unsigned int max_indirect_segments;
 	int is_ready;
 };
 
@@ -142,6 +160,13 @@ static DEFINE_SPINLOCK(minor_lock);
 
 #define DEV_NAME	"xvd"	/* name in /dev */
 
+#define SEGS_PER_INDIRECT_FRAME \
+	(PAGE_SIZE/sizeof(struct blkif_request_segment_aligned))
+#define INDIRECT_GREFS(_segs) \
+	((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
+
+static int blkfront_setup_indirect(struct blkfront_info *info);
+
 static int get_id_from_freelist(struct blkfront_info *info)
 {
 	unsigned long free = info->shadow_free;
@@ -358,7 +383,8 @@ static int blkif_queue_request(struct request *req)
 	struct blkif_request *ring_req;
 	unsigned long id;
 	unsigned int fsect, lsect;
-	int i, ref;
+	int i, ref, n;
+	struct blkif_request_segment_aligned *segments = NULL;
 
 	/*
 	 * Used to store if we are able to queue the request by just using
@@ -369,21 +395,27 @@ static int blkif_queue_request(struct request *req)
 	grant_ref_t gref_head;
 	struct grant *gnt_list_entry = NULL;
 	struct scatterlist *sg;
+	int nseg, max_grefs;
 
 	if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
 		return 1;
 
-	/* Check if we have enought grants to allocate a requests */
-	if (info->persistent_gnts_c < BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+	max_grefs = info->max_indirect_segments ?
+		    info->max_indirect_segments +
+		    INDIRECT_GREFS(info->max_indirect_segments) :
+		    BLKIF_MAX_SEGMENTS_PER_REQUEST;
+
+	/* Check if we have enough grants to allocate a requests */
+	if (info->persistent_gnts_c < max_grefs) {
 		new_persistent_gnts = 1;
 		if (gnttab_alloc_grant_references(
-			BLKIF_MAX_SEGMENTS_PER_REQUEST - info->persistent_gnts_c,
+			max_grefs - info->persistent_gnts_c,
 			&gref_head) < 0) {
 			gnttab_request_free_callback(
 				&info->callback,
 				blkif_restart_queue_callback,
 				info,
-				BLKIF_MAX_SEGMENTS_PER_REQUEST);
+				max_grefs);
 			return 1;
 		}
 	} else
@@ -394,42 +426,67 @@ static int blkif_queue_request(struct request *req)
 	id = get_id_from_freelist(info);
 	info->shadow[id].request = req;
 
-	ring_req->u.rw.id = id;
-	ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
-	ring_req->u.rw.handle = info->handle;
-
-	ring_req->operation = rq_data_dir(req) ?
-		BLKIF_OP_WRITE : BLKIF_OP_READ;
-
-	if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
-		/*
-		 * Ideally we can do an unordered flush-to-disk. In case the
-		 * backend onlysupports barriers, use that. A barrier request
-		 * a superset of FUA, so we can implement it the same
-		 * way.  (It's also a FLUSH+FUA, since it is
-		 * guaranteed ordered WRT previous writes.)
-		 */
-		ring_req->operation = info->flush_op;
-	}
-
 	if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) {
-		/* id, sector_number and handle are set above. */
 		ring_req->operation = BLKIF_OP_DISCARD;
 		ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
+		ring_req->u.discard.id = id;
+		ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req);
 		if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard)
 			ring_req->u.discard.flag = BLKIF_DISCARD_SECURE;
 		else
 			ring_req->u.discard.flag = 0;
 	} else {
-		ring_req->u.rw.nr_segments = blk_rq_map_sg(req->q, req,
-							   info->sg);
-		BUG_ON(ring_req->u.rw.nr_segments >
-		       BLKIF_MAX_SEGMENTS_PER_REQUEST);
-
-		for_each_sg(info->sg, sg, ring_req->u.rw.nr_segments, i) {
+		BUG_ON(info->max_indirect_segments == 0 &&
+		       req->nr_phys_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
+		BUG_ON(info->max_indirect_segments &&
+		       req->nr_phys_segments > info->max_indirect_segments);
+		nseg = blk_rq_map_sg(req->q, req, info->shadow[id].sg);
+		ring_req->u.rw.id = id;
+		if (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+			/*
+			 * The indirect operation can only be a BLKIF_OP_READ or
+			 * BLKIF_OP_WRITE
+			 */
+			BUG_ON(req->cmd_flags & (REQ_FLUSH | REQ_FUA));
+			ring_req->operation = BLKIF_OP_INDIRECT;
+			ring_req->u.indirect.indirect_op = rq_data_dir(req) ?
+				BLKIF_OP_WRITE : BLKIF_OP_READ;
+			ring_req->u.indirect.sector_number = (blkif_sector_t)blk_rq_pos(req);
+			ring_req->u.indirect.handle = info->handle;
+			ring_req->u.indirect.nr_segments = nseg;
+		} else {
+			ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
+			ring_req->u.rw.handle = info->handle;
+			ring_req->operation = rq_data_dir(req) ?
+				BLKIF_OP_WRITE : BLKIF_OP_READ;
+			if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
+				/*
+				 * Ideally we can do an unordered flush-to-disk. In case the
+				 * backend onlysupports barriers, use that. A barrier request
+				 * a superset of FUA, so we can implement it the same
+				 * way.  (It's also a FLUSH+FUA, since it is
+				 * guaranteed ordered WRT previous writes.)
+				 */
+				ring_req->operation = info->flush_op;
+			}
+			ring_req->u.rw.nr_segments = nseg;
+		}
+		for_each_sg(info->shadow[id].sg, sg, nseg, i) {
 			fsect = sg->offset >> 9;
 			lsect = fsect + (sg->length >> 9) - 1;
 
+			if ((ring_req->operation == BLKIF_OP_INDIRECT) &&
+			    (i % SEGS_PER_INDIRECT_FRAME == 0)) {
+				if (segments)
+					kunmap_atomic(segments);
+
+				n = i / SEGS_PER_INDIRECT_FRAME;
+				gnt_list_entry = get_grant(&gref_head, info);
+				info->shadow[id].indirect_grants[n] = gnt_list_entry;
+				segments = kmap_atomic(pfn_to_page(gnt_list_entry->pfn));
+				ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref;
+			}
+
 			gnt_list_entry = get_grant(&gref_head, info);
 			ref = gnt_list_entry->gref;
 
@@ -441,8 +498,7 @@ static int blkif_queue_request(struct request *req)
 
 			BUG_ON(sg->offset + sg->length > PAGE_SIZE);
 
-			shared_data = kmap_atomic(
-				pfn_to_page(gnt_list_entry->pfn));
+			shared_data = kmap_atomic(pfn_to_page(gnt_list_entry->pfn));
 			bvec_data = kmap_atomic(sg_page(sg));
 
 			/*
@@ -461,13 +517,23 @@ static int blkif_queue_request(struct request *req)
 				kunmap_atomic(bvec_data);
 				kunmap_atomic(shared_data);
 			}
-
-			ring_req->u.rw.seg[i] =
-					(struct blkif_request_segment) {
-						.gref       = ref,
-						.first_sect = fsect,
-						.last_sect  = lsect };
+			if (ring_req->operation != BLKIF_OP_INDIRECT) {
+				ring_req->u.rw.seg[i] =
+						(struct blkif_request_segment) {
+							.gref       = ref,
+							.first_sect = fsect,
+							.last_sect  = lsect };
+			} else {
+				n = i % SEGS_PER_INDIRECT_FRAME;
+				segments[n] =
+					(struct blkif_request_segment_aligned) {
+							.gref       = ref,
+							.first_sect = fsect,
+							.last_sect  = lsect };
+			}
 		}
+		if (segments)
+			kunmap_atomic(segments);
 	}
 
 	info->ring.req_prod_pvt++;
@@ -542,7 +608,9 @@ wait:
 		flush_requests(info);
 }
 
-static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
+static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
+				unsigned int physical_sector_size,
+				unsigned int segments)
 {
 	struct request_queue *rq;
 	struct blkfront_info *info = gd->private_data;
@@ -564,14 +632,15 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
 
 	/* Hard sector size and max sectors impersonate the equiv. hardware. */
 	blk_queue_logical_block_size(rq, sector_size);
-	blk_queue_max_hw_sectors(rq, 512);
+	blk_queue_physical_block_size(rq, physical_sector_size);
+	blk_queue_max_hw_sectors(rq, (segments * PAGE_SIZE) / 512);
 
 	/* Each segment in a request is up to an aligned page in size. */
 	blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
 	blk_queue_max_segment_size(rq, PAGE_SIZE);
 
 	/* Ensure a merged request will fit in a single I/O ring slot. */
-	blk_queue_max_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+	blk_queue_max_segments(rq, segments);
 
 	/* Make sure buffer addresses are sector-aligned. */
 	blk_queue_dma_alignment(rq, 511);
@@ -588,13 +657,16 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
 static void xlvbd_flush(struct blkfront_info *info)
 {
 	blk_queue_flush(info->rq, info->feature_flush);
-	printk(KERN_INFO "blkfront: %s: %s: %s %s\n",
+	printk(KERN_INFO "blkfront: %s: %s: %s %s %s %s %s\n",
 	       info->gd->disk_name,
 	       info->flush_op == BLKIF_OP_WRITE_BARRIER ?
 	       "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ?
 	       "flush diskcache" : "barrier or flush"),
-	       info->feature_flush ? "enabled" : "disabled",
-	       info->feature_persistent ? "using persistent grants" : "");
+	       info->feature_flush ? "enabled;" : "disabled;",
+	       "persistent grants:",
+	       info->feature_persistent ? "enabled;" : "disabled;",
+	       "indirect descriptors:",
+	       info->max_indirect_segments ? "enabled;" : "disabled;");
 }
 
 static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
@@ -667,7 +739,8 @@ static char *encode_disk_name(char *ptr, unsigned int n)
 
 static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 			       struct blkfront_info *info,
-			       u16 vdisk_info, u16 sector_size)
+			       u16 vdisk_info, u16 sector_size,
+			       unsigned int physical_sector_size)
 {
 	struct gendisk *gd;
 	int nr_minors = 1;
@@ -734,7 +807,9 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 	gd->driverfs_dev = &(info->xbdev->dev);
 	set_capacity(gd, capacity);
 
-	if (xlvbd_init_blk_queue(gd, sector_size)) {
+	if (xlvbd_init_blk_queue(gd, sector_size, physical_sector_size,
+				 info->max_indirect_segments ? :
+				 BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
 		del_gendisk(gd);
 		goto release;
 	}
@@ -818,6 +893,7 @@ static void blkif_free(struct blkfront_info *info, int suspend)
 {
 	struct grant *persistent_gnt;
 	struct grant *n;
+	int i, j, segs;
 
 	/* Prevent new requests being issued until we fix things up. */
 	spin_lock_irq(&info->io_lock);
@@ -843,6 +919,47 @@ static void blkif_free(struct blkfront_info *info, int suspend)
 	}
 	BUG_ON(info->persistent_gnts_c != 0);
 
+	for (i = 0; i < BLK_RING_SIZE; i++) {
+		/*
+		 * Clear persistent grants present in requests already
+		 * on the shared ring
+		 */
+		if (!info->shadow[i].request)
+			goto free_shadow;
+
+		segs = info->shadow[i].req.operation == BLKIF_OP_INDIRECT ?
+		       info->shadow[i].req.u.indirect.nr_segments :
+		       info->shadow[i].req.u.rw.nr_segments;
+		for (j = 0; j < segs; j++) {
+			persistent_gnt = info->shadow[i].grants_used[j];
+			gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
+			__free_page(pfn_to_page(persistent_gnt->pfn));
+			kfree(persistent_gnt);
+		}
+
+		if (info->shadow[i].req.operation != BLKIF_OP_INDIRECT)
+			/*
+			 * If this is not an indirect operation don't try to
+			 * free indirect segments
+			 */
+			goto free_shadow;
+
+		for (j = 0; j < INDIRECT_GREFS(segs); j++) {
+			persistent_gnt = info->shadow[i].indirect_grants[j];
+			gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
+			__free_page(pfn_to_page(persistent_gnt->pfn));
+			kfree(persistent_gnt);
+		}
+
+free_shadow:
+		kfree(info->shadow[i].grants_used);
+		info->shadow[i].grants_used = NULL;
+		kfree(info->shadow[i].indirect_grants);
+		info->shadow[i].indirect_grants = NULL;
+		kfree(info->shadow[i].sg);
+		info->shadow[i].sg = NULL;
+	}
+
 	/* No more gnttab callback work. */
 	gnttab_cancel_free_callback(&info->callback);
 	spin_unlock_irq(&info->io_lock);
@@ -867,12 +984,13 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
 			     struct blkif_response *bret)
 {
 	int i = 0;
-	struct bio_vec *bvec;
-	struct req_iterator iter;
-	unsigned long flags;
+	struct scatterlist *sg;
 	char *bvec_data;
 	void *shared_data;
-	unsigned int offset = 0;
+	int nseg;
+
+	nseg = s->req.operation == BLKIF_OP_INDIRECT ?
+	       s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments;
 
 	if (bret->operation == BLKIF_OP_READ) {
 		/*
@@ -881,26 +999,29 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
 		 * than PAGE_SIZE, we have to keep track of the current offset,
 		 * to be sure we are copying the data from the right shared page.
 		 */
-		rq_for_each_segment(bvec, s->request, iter) {
-			BUG_ON((bvec->bv_offset + bvec->bv_len) > PAGE_SIZE);
-			if (bvec->bv_offset < offset)
-				i++;
-			BUG_ON(i >= s->req.u.rw.nr_segments);
+		for_each_sg(s->sg, sg, nseg, i) {
+			BUG_ON(sg->offset + sg->length > PAGE_SIZE);
 			shared_data = kmap_atomic(
 				pfn_to_page(s->grants_used[i]->pfn));
-			bvec_data = bvec_kmap_irq(bvec, &flags);
-			memcpy(bvec_data, shared_data + bvec->bv_offset,
-			       bvec->bv_len);
-			bvec_kunmap_irq(bvec_data, &flags);
+			bvec_data = kmap_atomic(sg_page(sg));
+			memcpy(bvec_data + sg->offset,
+			       shared_data + sg->offset,
+			       sg->length);
+			kunmap_atomic(bvec_data);
 			kunmap_atomic(shared_data);
-			offset = bvec->bv_offset + bvec->bv_len;
 		}
 	}
 	/* Add the persistent grant into the list of free grants */
-	for (i = 0; i < s->req.u.rw.nr_segments; i++) {
+	for (i = 0; i < nseg; i++) {
 		list_add(&s->grants_used[i]->node, &info->persistent_gnts);
 		info->persistent_gnts_c++;
 	}
+	if (s->req.operation == BLKIF_OP_INDIRECT) {
+		for (i = 0; i < INDIRECT_GREFS(nseg); i++) {
+			list_add(&s->indirect_grants[i]->node, &info->persistent_gnts);
+			info->persistent_gnts_c++;
+		}
+	}
 }
 
 static irqreturn_t blkif_interrupt(int irq, void *dev_id)
@@ -1034,14 +1155,6 @@ static int setup_blkring(struct xenbus_device *dev,
 	SHARED_RING_INIT(sring);
 	FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
 
-	sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
-
-	/* Allocate memory for grants */
-	err = fill_grant_buffer(info, BLK_RING_SIZE *
-				BLKIF_MAX_SEGMENTS_PER_REQUEST);
-	if (err)
-		goto fail;
-
 	err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
 	if (err < 0) {
 		free_page((unsigned long)sring);
@@ -1223,13 +1336,84 @@ static int blkfront_probe(struct xenbus_device *dev,
 	return 0;
 }
 
+/*
+ * This is a clone of md_trim_bio, used to split a bio into smaller ones
+ */
+static void trim_bio(struct bio *bio, int offset, int size)
+{
+	/* 'bio' is a cloned bio which we need to trim to match
+	 * the given offset and size.
+	 * This requires adjusting bi_sector, bi_size, and bi_io_vec
+	 */
+	int i;
+	struct bio_vec *bvec;
+	int sofar = 0;
+
+	size <<= 9;
+	if (offset == 0 && size == bio->bi_size)
+		return;
+
+	bio->bi_sector += offset;
+	bio->bi_size = size;
+	offset <<= 9;
+	clear_bit(BIO_SEG_VALID, &bio->bi_flags);
+
+	while (bio->bi_idx < bio->bi_vcnt &&
+	       bio->bi_io_vec[bio->bi_idx].bv_len <= offset) {
+		/* remove this whole bio_vec */
+		offset -= bio->bi_io_vec[bio->bi_idx].bv_len;
+		bio->bi_idx++;
+	}
+	if (bio->bi_idx < bio->bi_vcnt) {
+		bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
+		bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
+	}
+	/* avoid any complications with bi_idx being non-zero*/
+	if (bio->bi_idx) {
+		memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
+			(bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec));
+		bio->bi_vcnt -= bio->bi_idx;
+		bio->bi_idx = 0;
+	}
+	/* Make sure vcnt and last bv are not too big */
+	bio_for_each_segment(bvec, bio, i) {
+		if (sofar + bvec->bv_len > size)
+			bvec->bv_len = size - sofar;
+		if (bvec->bv_len == 0) {
+			bio->bi_vcnt = i;
+			break;
+		}
+		sofar += bvec->bv_len;
+	}
+}
+
+static void split_bio_end(struct bio *bio, int error)
+{
+	struct split_bio *split_bio = bio->bi_private;
+
+	if (error)
+		split_bio->err = error;
+
+	if (atomic_dec_and_test(&split_bio->pending)) {
+		split_bio->bio->bi_phys_segments = 0;
+		bio_endio(split_bio->bio, split_bio->err);
+		kfree(split_bio);
+	}
+	bio_put(bio);
+}
 
 static int blkif_recover(struct blkfront_info *info)
 {
 	int i;
-	struct blkif_request *req;
+	struct request *req, *n;
 	struct blk_shadow *copy;
-	int j;
+	int rc;
+	struct bio *bio, *cloned_bio;
+	struct bio_list bio_list, merge_bio;
+	unsigned int segs, offset;
+	int pending, size;
+	struct split_bio *split_bio;
+	struct list_head requests;
 
 	/* Stage 1: Make a safe copy of the shadow state. */
 	copy = kmemdup(info->shadow, sizeof(info->shadow),
@@ -1244,36 +1428,64 @@ static int blkif_recover(struct blkfront_info *info)
 	info->shadow_free = info->ring.req_prod_pvt;
 	info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
 
-	/* Stage 3: Find pending requests and requeue them. */
+	rc = blkfront_setup_indirect(info);
+	if (rc) {
+		kfree(copy);
+		return rc;
+	}
+
+	segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	blk_queue_max_segments(info->rq, segs);
+	bio_list_init(&bio_list);
+	INIT_LIST_HEAD(&requests);
 	for (i = 0; i < BLK_RING_SIZE; i++) {
 		/* Not in use? */
 		if (!copy[i].request)
 			continue;
 
-		/* Grab a request slot and copy shadow state into it. */
-		req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
-		*req = copy[i].req;
-
-		/* We get a new request id, and must reset the shadow state. */
-		req->u.rw.id = get_id_from_freelist(info);
-		memcpy(&info->shadow[req->u.rw.id], &copy[i], sizeof(copy[i]));
-
-		if (req->operation != BLKIF_OP_DISCARD) {
-		/* Rewrite any grant references invalidated by susp/resume. */
-			for (j = 0; j < req->u.rw.nr_segments; j++)
-				gnttab_grant_foreign_access_ref(
-					req->u.rw.seg[j].gref,
-					info->xbdev->otherend_id,
-					pfn_to_mfn(copy[i].grants_used[j]->pfn),
-					0);
+		/*
+		 * Get the bios in the request so we can re-queue them.
+		 */
+		if (copy[i].request->cmd_flags &
+		    (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
+			/*
+			 * Flush operations don't contain bios, so
+			 * we need to requeue the whole request
+			 */
+			list_add(&copy[i].request->queuelist, &requests);
+			continue;
 		}
-		info->shadow[req->u.rw.id].req = *req;
-
-		info->ring.req_prod_pvt++;
+		merge_bio.head = copy[i].request->bio;
+		merge_bio.tail = copy[i].request->biotail;
+		bio_list_merge(&bio_list, &merge_bio);
+		copy[i].request->bio = NULL;
+		blk_put_request(copy[i].request);
 	}
 
 	kfree(copy);
 
+	/*
+	 * Empty the queue, this is important because we might have
+	 * requests in the queue with more segments than what we
+	 * can handle now.
+	 */
+	spin_lock_irq(&info->io_lock);
+	while ((req = blk_fetch_request(info->rq)) != NULL) {
+		if (req->cmd_flags &
+		    (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
+			list_add(&req->queuelist, &requests);
+			continue;
+		}
+		merge_bio.head = req->bio;
+		merge_bio.tail = req->biotail;
+		bio_list_merge(&bio_list, &merge_bio);
+		req->bio = NULL;
+		if (req->cmd_flags & (REQ_FLUSH | REQ_FUA))
+			pr_alert("diskcache flush request found!\n");
+		__blk_put_request(info->rq, req);
+	}
+	spin_unlock_irq(&info->io_lock);
+
 	xenbus_switch_state(info->xbdev, XenbusStateConnected);
 
 	spin_lock_irq(&info->io_lock);
@@ -1281,14 +1493,50 @@ static int blkif_recover(struct blkfront_info *info)
 	/* Now safe for us to use the shared ring */
 	info->connected = BLKIF_STATE_CONNECTED;
 
-	/* Send off requeued requests */
-	flush_requests(info);
-
 	/* Kick any other new requests queued since we resumed */
 	kick_pending_request_queues(info);
 
+	list_for_each_entry_safe(req, n, &requests, queuelist) {
+		/* Requeue pending requests (flush or discard) */
+		list_del_init(&req->queuelist);
+		BUG_ON(req->nr_phys_segments > segs);
+		blk_requeue_request(info->rq, req);
+	}
 	spin_unlock_irq(&info->io_lock);
 
+	while ((bio = bio_list_pop(&bio_list)) != NULL) {
+		/* Traverse the list of pending bios and re-queue them */
+		if (bio_segments(bio) > segs) {
+			/*
+			 * This bio has more segments than what we can
+			 * handle, we have to split it.
+			 */
+			pending = (bio_segments(bio) + segs - 1) / segs;
+			split_bio = kzalloc(sizeof(*split_bio), GFP_NOIO);
+			BUG_ON(split_bio == NULL);
+			atomic_set(&split_bio->pending, pending);
+			split_bio->bio = bio;
+			for (i = 0; i < pending; i++) {
+				offset = (i * segs * PAGE_SIZE) >> 9;
+				size = min((unsigned int)(segs * PAGE_SIZE) >> 9,
+					   (unsigned int)(bio->bi_size >> 9) - offset);
+				cloned_bio = bio_clone(bio, GFP_NOIO);
+				BUG_ON(cloned_bio == NULL);
+				trim_bio(cloned_bio, offset, size);
+				cloned_bio->bi_private = split_bio;
+				cloned_bio->bi_end_io = split_bio_end;
+				submit_bio(cloned_bio->bi_rw, cloned_bio);
+			}
+			/*
+			 * Now we have to wait for all those smaller bios to
+			 * end, so we can also end the "parent" bio.
+			 */
+			continue;
+		}
+		/* We don't need to split this bio */
+		submit_bio(bio->bi_rw, bio);
+	}
+
 	return 0;
 }
 
@@ -1308,8 +1556,12 @@ static int blkfront_resume(struct xenbus_device *dev)
 	blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
 
 	err = talk_to_blkback(dev, info);
-	if (info->connected == BLKIF_STATE_SUSPENDED && !err)
-		err = blkif_recover(info);
+
+	/*
+	 * We have to wait for the backend to switch to
+	 * connected state, since we want to read which
+	 * features it supports.
+	 */
 
 	return err;
 }
@@ -1387,6 +1639,60 @@ static void blkfront_setup_discard(struct blkfront_info *info)
 	kfree(type);
 }
 
+static int blkfront_setup_indirect(struct blkfront_info *info)
+{
+	unsigned int indirect_segments, segs;
+	int err, i;
+
+	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+			    "feature-max-indirect-segments", "%u", &indirect_segments,
+			    NULL);
+	if (err) {
+		info->max_indirect_segments = 0;
+		segs = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	} else {
+		info->max_indirect_segments = min(indirect_segments,
+						  xen_blkif_max_segments);
+		segs = info->max_indirect_segments;
+	}
+
+	err = fill_grant_buffer(info, (segs + INDIRECT_GREFS(segs)) * BLK_RING_SIZE);
+	if (err)
+		goto out_of_memory;
+
+	for (i = 0; i < BLK_RING_SIZE; i++) {
+		info->shadow[i].grants_used = kzalloc(
+			sizeof(info->shadow[i].grants_used[0]) * segs,
+			GFP_NOIO);
+		info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * segs, GFP_NOIO);
+		if (info->max_indirect_segments)
+			info->shadow[i].indirect_grants = kzalloc(
+				sizeof(info->shadow[i].indirect_grants[0]) *
+				INDIRECT_GREFS(segs),
+				GFP_NOIO);
+		if ((info->shadow[i].grants_used == NULL) ||
+		    (info->shadow[i].sg == NULL) ||
+		    (info->max_indirect_segments &&
+		     (info->shadow[i].indirect_grants == NULL)))
+			goto out_of_memory;
+		sg_init_table(info->shadow[i].sg, segs);
+	}
+
+
+	return 0;
+
+out_of_memory:
+	for (i = 0; i < BLK_RING_SIZE; i++) {
+		kfree(info->shadow[i].grants_used);
+		info->shadow[i].grants_used = NULL;
+		kfree(info->shadow[i].sg);
+		info->shadow[i].sg = NULL;
+		kfree(info->shadow[i].indirect_grants);
+		info->shadow[i].indirect_grants = NULL;
+	}
+	return -ENOMEM;
+}
+
 /*
  * Invoked when the backend is finally 'ready' (and has told produced
  * the details about the physical device - #sectors, size, etc).
@@ -1395,6 +1701,7 @@ static void blkfront_connect(struct blkfront_info *info)
 {
 	unsigned long long sectors;
 	unsigned long sector_size;
+	unsigned int physical_sector_size;
 	unsigned int binfo;
 	int err;
 	int barrier, flush, discard, persistent;
@@ -1414,8 +1721,15 @@ static void blkfront_connect(struct blkfront_info *info)
 		set_capacity(info->gd, sectors);
 		revalidate_disk(info->gd);
 
-		/* fall through */
+		return;
 	case BLKIF_STATE_SUSPENDED:
+		/*
+		 * If we are recovering from suspension, we need to wait
+		 * for the backend to announce it's features before
+		 * reconnecting, at least we need to know if the backend
+		 * supports indirect descriptors, and how many.
+		 */
+		blkif_recover(info);
 		return;
 
 	default:
@@ -1437,6 +1751,16 @@ static void blkfront_connect(struct blkfront_info *info)
 		return;
 	}
 
+	/*
+	 * physcial-sector-size is a newer field, so old backends may not
+	 * provide this. Assume physical sector size to be the same as
+	 * sector_size in that case.
+	 */
+	err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+			   "physical-sector-size", "%u", &physical_sector_size);
+	if (err != 1)
+		physical_sector_size = sector_size;
+
 	info->feature_flush = 0;
 	info->flush_op = 0;
 
@@ -1483,7 +1807,15 @@ static void blkfront_connect(struct blkfront_info *info)
 	else
 		info->feature_persistent = persistent;
 
-	err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
+	err = blkfront_setup_indirect(info);
+	if (err) {
+		xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s",
+				 info->xbdev->otherend);
+		return;
+	}
+
+	err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size,
+				  physical_sector_size);
 	if (err) {
 		xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
 				 info->xbdev->otherend);