author     Julien Grall <julien.grall@citrix.com>  2015-08-13 08:13:35 -0400
committer  Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>  2016-01-04 12:21:25 -0500
commit     6cc5683390472c450fd69975d1283db79202667f
tree       11ac3de1f8d5547455e37ad7fb37965315fbee88
parent     2e073969d57f60fc0b863985779657624cbd4886
xen/blkfront: Handle non-indirect grant with 64KB pages
The minimal size of a request in the block framework is always PAGE_SIZE. This means that when the guest uses 64KB pages, a request will be at least 64KB. However, if the backend doesn't support indirect descriptors (such as QDISK in QEMU), a ring request can only accommodate 11 segments of 4KB (i.e. 44KB). The current frontend assumes that an I/O request will always fit in a single ring request. This is no longer true when using 64KB page granularity, and the frontend will therefore crash during boot.

On ARM64, the ABI is completely neutral with respect to the page granularity used by the domU. The guest can choose among the page granularities supported by the processor (for instance on ARM64: 4KB, 16KB, 64KB). This can't be enforced by the hypervisor, so it is possible to run guests using different page granularities. We therefore can't mandate that the block backend support indirect descriptors when the frontend uses 64KB page granularity, and have to fix it properly in the frontend.

The solution below modifies the frontend directly rather than asking the block framework to support smaller segment sizes (i.e. < PAGE_SIZE), because the changes in the block framework are not trivial: everything seems to rely on a struct page (see [1]). It may nonetheless be possible for someone to do it in the future, in which case we could switch to that.

Given that a block request may not fit in a single ring request, a second ring request is introduced for the data that cannot fit in the first one. This means that the second ring request should never be used on Linux if the page size is smaller than 44KB.

To support the extra ring request, the block queue size is divided by two, so the ring always contains enough space to accommodate 2 ring requests. While this reduces overall performance, it keeps the implementation more contained. The way forward for better performance is to implement either indirect descriptors or multi-grant rings in the backend.

Note that the blk_queue_max_* helper parameters haven't been updated. The block code will set the minimum size supported, and we may be able to benefit directly from any change in the block framework that lowers the minimal size of a request.

[1] http://lists.xen.org/archives/html/xen-devel/2015-08/msg02200.html

Signed-off-by: Julien Grall <julien.grall@citrix.com>
Acked-by: Roger Pau Monné <roger.pau@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
-rw-r--r--  drivers/block/xen-blkfront.c  228
1 file changed, 212 insertions, 16 deletions
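The 44KB and 88KB figures in the commit message follow directly from the blkif ABI constants. As a rough standalone sketch of that arithmetic (not part of the patch; it assumes the usual values of 11 segments per ring request and 4KB Xen pages, with xen_pfn_per_page standing in for the kernel's XEN_PFN_PER_PAGE):

/*
 * Standalone sketch of the arithmetic behind the 44KB/88KB figures in
 * the commit message. Not part of the patch; the constants below are
 * assumed to match the usual blkif ABI values.
 */
#include <stdio.h>

#define XEN_PAGE_SIZE                  4096u
#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11u

int main(void)
{
	unsigned int page_size = 64 * 1024;                        /* 64KB guest pages */
	unsigned int xen_pfn_per_page = page_size / XEN_PAGE_SIZE; /* 16 grants per page */
	/* Data one ring request can carry without indirect descriptors. */
	unsigned int per_req = BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE;

	printf("one ring request carries %u bytes (44KB)\n", per_req);
	printf("extra request needed: %s\n",
	       BLKIF_MAX_SEGMENTS_PER_REQUEST < xen_pfn_per_page ? "yes" : "no");
	printf("two ring requests cover %u bytes (88KB) >= %u\n",
	       2 * per_req, page_size);
	return 0;
}

With 64KB guest pages this reports that an extra request is needed, which is exactly the HAS_EXTRA_REQ condition introduced by the patch below.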
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 0b32c90ffc3f..f3d0d4758641 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -60,6 +60,20 @@
 
 #include <asm/xen/hypervisor.h>
 
+/*
+ * The minimal size of segment supported by the block framework is PAGE_SIZE.
+ * When Linux is using a different page size than Xen, it may not be possible
+ * to put all the data in a single segment.
+ * This can happen when the backend doesn't support indirect descriptor and
+ * therefore the maximum amount of data that a request can carry is
+ * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE = 44KB
+ *
+ * Note that we only support one extra request. So the Linux page size
+ * should be <= ( 2 * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) =
+ * 88KB.
+ */
+#define HAS_EXTRA_REQ (BLKIF_MAX_SEGMENTS_PER_REQUEST < XEN_PFN_PER_PAGE)
+
 enum blkif_state {
 	BLKIF_STATE_DISCONNECTED,
 	BLKIF_STATE_CONNECTED,
@@ -72,6 +86,13 @@ struct grant {
 	struct list_head node;
 };
 
+enum blk_req_status {
+	REQ_WAITING,
+	REQ_DONE,
+	REQ_ERROR,
+	REQ_EOPNOTSUPP,
+};
+
 struct blk_shadow {
 	struct blkif_request req;
 	struct request *request;
@@ -79,6 +100,14 @@ struct blk_shadow {
 	struct grant **indirect_grants;
 	struct scatterlist *sg;
 	unsigned int num_sg;
+	enum blk_req_status status;
+
+	#define NO_ASSOCIATED_ID ~0UL
+	/*
+	 * Id of the sibling if we ever need 2 requests when handling a
+	 * block I/O request
+	 */
+	unsigned long associated_id;
 };
 
 struct split_bio {
@@ -492,6 +521,8 @@ static unsigned long blkif_ring_get_request(struct blkfront_ring_info *rinfo,
 
 	id = get_id_from_freelist(rinfo);
 	rinfo->shadow[id].request = req;
+	rinfo->shadow[id].status = REQ_WAITING;
+	rinfo->shadow[id].associated_id = NO_ASSOCIATED_ID;
 
 	(*ring_req)->u.rw.id = id;
 
@@ -533,6 +564,9 @@ struct setup_rw_req {
 	bool need_copy;
 	unsigned int bvec_off;
 	char *bvec_data;
+
+	bool require_extra_req;
+	struct blkif_request *extra_ring_req;
 };
 
 static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
@@ -546,8 +580,24 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
 	unsigned int grant_idx = setup->grant_idx;
 	struct blkif_request *ring_req = setup->ring_req;
 	struct blkfront_ring_info *rinfo = setup->rinfo;
+	/*
+	 * We always use the shadow of the first request to store the list
+	 * of grant associated to the block I/O request. This made the
+	 * completion more easy to handle even if the block I/O request is
+	 * split.
+	 */
 	struct blk_shadow *shadow = &rinfo->shadow[setup->id];
 
+	if (unlikely(setup->require_extra_req &&
+		     grant_idx >= BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
+		/*
+		 * We are using the second request, setup grant_idx
+		 * to be the index of the segment array.
+		 */
+		grant_idx -= BLKIF_MAX_SEGMENTS_PER_REQUEST;
+		ring_req = setup->extra_ring_req;
+	}
+
 	if ((ring_req->operation == BLKIF_OP_INDIRECT) &&
 	    (grant_idx % GRANTS_PER_INDIRECT_FRAME == 0)) {
 		if (setup->segments)
@@ -562,7 +612,11 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
 
 	gnt_list_entry = get_grant(&setup->gref_head, gfn, rinfo);
 	ref = gnt_list_entry->gref;
-	shadow->grants_used[grant_idx] = gnt_list_entry;
+	/*
+	 * All the grants are stored in the shadow of the first
+	 * request. Therefore we have to use the global index.
+	 */
+	shadow->grants_used[setup->grant_idx] = gnt_list_entry;
 
 	if (setup->need_copy) {
 		void *shared_data;
@@ -604,11 +658,31 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
 	(setup->grant_idx)++;
 }
 
+static void blkif_setup_extra_req(struct blkif_request *first,
+				  struct blkif_request *second)
+{
+	uint16_t nr_segments = first->u.rw.nr_segments;
+
+	/*
+	 * The second request is only present when the first request uses
+	 * all its segments. It's always the continuity of the first one.
+	 */
+	first->u.rw.nr_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+
+	second->u.rw.nr_segments = nr_segments - BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	second->u.rw.sector_number = first->u.rw.sector_number +
+		(BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) / 512;
+
+	second->u.rw.handle = first->u.rw.handle;
+	second->operation = first->operation;
+}
+
 static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *rinfo)
 {
 	struct blkfront_info *info = rinfo->dev_info;
-	struct blkif_request *ring_req;
-	unsigned long id;
+	struct blkif_request *ring_req, *extra_ring_req = NULL;
+	unsigned long id, extra_id = NO_ASSOCIATED_ID;
+	bool require_extra_req = false;
 	int i;
 	struct setup_rw_req setup = {
 		.grant_idx = 0,
@@ -650,19 +724,19 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
 	/* Fill out a communications ring structure. */
 	id = blkif_ring_get_request(rinfo, req, &ring_req);
 
-	BUG_ON(info->max_indirect_segments == 0 &&
-	       GREFS(req->nr_phys_segments) > BLKIF_MAX_SEGMENTS_PER_REQUEST);
-	BUG_ON(info->max_indirect_segments &&
-	       GREFS(req->nr_phys_segments) > info->max_indirect_segments);
-
 	num_sg = blk_rq_map_sg(req->q, req, rinfo->shadow[id].sg);
 	num_grant = 0;
 	/* Calculate the number of grant used */
 	for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i)
 		num_grant += gnttab_count_grant(sg->offset, sg->length);
 
+	require_extra_req = info->max_indirect_segments == 0 &&
+		num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	BUG_ON(!HAS_EXTRA_REQ && require_extra_req);
+
 	rinfo->shadow[id].num_sg = num_sg;
-	if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+	if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST &&
+	    likely(!require_extra_req)) {
 		/*
 		 * The indirect operation can only be a BLKIF_OP_READ or
 		 * BLKIF_OP_WRITE
@@ -702,10 +776,30 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
 			}
 		}
 		ring_req->u.rw.nr_segments = num_grant;
+		if (unlikely(require_extra_req)) {
+			extra_id = blkif_ring_get_request(rinfo, req,
+							  &extra_ring_req);
+			/*
+			 * Only the first request contains the scatter-gather
+			 * list.
+			 */
+			rinfo->shadow[extra_id].num_sg = 0;
+
+			blkif_setup_extra_req(ring_req, extra_ring_req);
+
+			/* Link the 2 requests together */
+			rinfo->shadow[extra_id].associated_id = id;
+			rinfo->shadow[id].associated_id = extra_id;
+		}
 	}
 
 	setup.ring_req = ring_req;
 	setup.id = id;
+
+	setup.require_extra_req = require_extra_req;
+	if (unlikely(require_extra_req))
+		setup.extra_ring_req = extra_ring_req;
+
 	for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i) {
 		BUG_ON(sg->offset + sg->length > PAGE_SIZE);
 
@@ -728,6 +822,8 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
 
 	/* Keep a private copy so we can reissue requests when recovering. */
 	rinfo->shadow[id].req = *ring_req;
+	if (unlikely(require_extra_req))
+		rinfo->shadow[extra_id].req = *extra_ring_req;
 
 	if (max_grefs > 0)
 		gnttab_free_grant_references(setup.gref_head);
@@ -829,7 +925,16 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
 	memset(&info->tag_set, 0, sizeof(info->tag_set));
 	info->tag_set.ops = &blkfront_mq_ops;
 	info->tag_set.nr_hw_queues = info->nr_rings;
-	info->tag_set.queue_depth = BLK_RING_SIZE(info);
+	if (HAS_EXTRA_REQ && info->max_indirect_segments == 0) {
+		/*
+		 * When indirect descriptior is not supported, the I/O request
+		 * will be split between multiple request in the ring.
+		 * To avoid problems when sending the request, divide by
+		 * 2 the depth of the queue.
+		 */
+		info->tag_set.queue_depth = BLK_RING_SIZE(info) / 2;
+	} else
+		info->tag_set.queue_depth = BLK_RING_SIZE(info);
 	info->tag_set.numa_node = NUMA_NO_NODE;
 	info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
 	info->tag_set.cmd_size = 0;
@@ -1269,20 +1374,93 @@ static void blkif_copy_from_grant(unsigned long gfn, unsigned int offset,
 	kunmap_atomic(shared_data);
 }
 
-static void blkif_completion(struct blk_shadow *s, struct blkfront_ring_info *rinfo,
+static enum blk_req_status blkif_rsp_to_req_status(int rsp)
+{
+	switch (rsp)
+	{
+	case BLKIF_RSP_OKAY:
+		return REQ_DONE;
+	case BLKIF_RSP_EOPNOTSUPP:
+		return REQ_EOPNOTSUPP;
+	case BLKIF_RSP_ERROR:
+		/* Fallthrough. */
+	default:
+		return REQ_ERROR;
+	}
+}
+
+/*
+ * Get the final status of the block request based on two ring response
+ */
+static int blkif_get_final_status(enum blk_req_status s1,
+				  enum blk_req_status s2)
+{
+	BUG_ON(s1 == REQ_WAITING);
+	BUG_ON(s2 == REQ_WAITING);
+
+	if (s1 == REQ_ERROR || s2 == REQ_ERROR)
+		return BLKIF_RSP_ERROR;
+	else if (s1 == REQ_EOPNOTSUPP || s2 == REQ_EOPNOTSUPP)
+		return BLKIF_RSP_EOPNOTSUPP;
+	return BLKIF_RSP_OKAY;
+}
+
+static bool blkif_completion(unsigned long *id,
+			     struct blkfront_ring_info *rinfo,
 			     struct blkif_response *bret)
 {
 	int i = 0;
 	struct scatterlist *sg;
 	int num_sg, num_grant;
 	struct blkfront_info *info = rinfo->dev_info;
+	struct blk_shadow *s = &rinfo->shadow[*id];
 	struct copy_from_grant data = {
-		.s = s,
 		.grant_idx = 0,
 	};
 
 	num_grant = s->req.operation == BLKIF_OP_INDIRECT ?
 		s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments;
+
+	/* The I/O request may be split in two. */
+	if (unlikely(s->associated_id != NO_ASSOCIATED_ID)) {
+		struct blk_shadow *s2 = &rinfo->shadow[s->associated_id];
+
+		/* Keep the status of the current response in shadow. */
+		s->status = blkif_rsp_to_req_status(bret->status);
+
+		/* Wait the second response if not yet here. */
+		if (s2->status == REQ_WAITING)
+			return 0;
+
+		bret->status = blkif_get_final_status(s->status,
+						      s2->status);
+
+		/*
+		 * All the grants is stored in the first shadow in order
+		 * to make the completion code simpler.
+		 */
+		num_grant += s2->req.u.rw.nr_segments;
+
+		/*
+		 * The two responses may not come in order. Only the
+		 * first request will store the scatter-gather list.
+		 */
+		if (s2->num_sg != 0) {
+			/* Update "id" with the ID of the first response. */
+			*id = s->associated_id;
+			s = s2;
+		}
+
+		/*
+		 * We don't need anymore the second request, so recycling
+		 * it now.
+		 */
+		if (add_id_to_freelist(rinfo, s->associated_id))
+			WARN(1, "%s: can't recycle the second part (id = %ld) of the request\n",
+			     info->gd->disk_name, s->associated_id);
+	}
+
+	data.s = s;
 	num_sg = s->num_sg;
 
 	if (bret->operation == BLKIF_OP_READ && info->feature_persistent) {
@@ -1352,6 +1530,8 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_ring_info *ri
 			}
 		}
 	}
+
+	return 1;
 }
 
 static irqreturn_t blkif_interrupt(int irq, void *dev_id)
@@ -1391,8 +1571,14 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 		}
 		req  = rinfo->shadow[id].request;
 
-		if (bret->operation != BLKIF_OP_DISCARD)
-			blkif_completion(&rinfo->shadow[id], rinfo, bret);
+		if (bret->operation != BLKIF_OP_DISCARD) {
+			/*
+			 * We may need to wait for an extra response if the
+			 * I/O request is split in 2
+			 */
+			if (!blkif_completion(&id, rinfo, bret))
+				continue;
+		}
 
 		if (add_id_to_freelist(rinfo, id)) {
 			WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n",
@@ -2017,8 +2203,18 @@ static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo)
 	int err, i;
 	struct blkfront_info *info = rinfo->dev_info;
 
-	if (info->max_indirect_segments == 0)
-		grants = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	if (info->max_indirect_segments == 0) {
+		if (!HAS_EXTRA_REQ)
+			grants = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+		else {
+			/*
+			 * When an extra req is required, the maximum
+			 * grants supported is related to the size of the
+			 * Linux block segment.
+			 */
+			grants = GRANTS_PER_PSEG;
+		}
+	}
 	else
 		grants = info->max_indirect_segments;
 	psegs = grants / GRANTS_PER_PSEG;
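For reference, a small worked example of the split performed by blkif_setup_extra_req() above (a standalone illustration, not code from the patch; it assumes 4KB Xen pages, 11 segments per ring request, and an arbitrary starting sector):

/*
 * Standalone illustration of how blkif_setup_extra_req() divides a 64KB
 * I/O between the two ring requests. Not part of the patch.
 */
#include <stdio.h>

#define XEN_PAGE_SIZE                  4096u
#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11u

int main(void)
{
	unsigned int num_grant = (64 * 1024) / XEN_PAGE_SIZE;  /* 16 grants */
	unsigned long long first_sector = 2048;                 /* arbitrary */

	/* The first ring request is filled completely: 11 segments = 44KB. */
	unsigned int first_segs = BLKIF_MAX_SEGMENTS_PER_REQUEST;
	/* The second carries the remainder: 5 segments = 20KB. */
	unsigned int second_segs = num_grant - BLKIF_MAX_SEGMENTS_PER_REQUEST;
	/* It continues right after the first one: 44KB / 512 = 88 sectors. */
	unsigned long long second_sector = first_sector +
		(BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) / 512;

	printf("first:  %u segments at sector %llu\n", first_segs, first_sector);
	printf("second: %u segments at sector %llu\n", second_segs, second_sector);
	return 0;
}

The completion path then adds the two nr_segments values back together (num_grant += s2->req.u.rw.nr_segments) before unmapping, which is why all grants are kept in the shadow of the first request.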